Timestamp:
    06/04/18 19:25:08 (6 years ago)
File:
    1 edited
Legend:
    Unmodified (no prefix)
    Added (+)
    Removed (-)
XIOS/dev/branch_openmp/extern/src_ep_dev/ep_intercomm_kernel.cpp
--- ep_intercomm_kernel.cpp (r1368)
+++ ep_intercomm_kernel.cpp (r1520)

     int ep_size, num_ep, mpi_size;

-    ep_rank = local_comm.ep_comm_ptr->size_rank_info[0].first;
-    ep_rank_loc = local_comm.ep_comm_ptr->size_rank_info[1].first;
-    mpi_rank = local_comm.ep_comm_ptr->size_rank_info[2].first;
-    ep_size = local_comm.ep_comm_ptr->size_rank_info[0].second;
-    num_ep = local_comm.ep_comm_ptr->size_rank_info[1].second;
-    mpi_size = local_comm.ep_comm_ptr->size_rank_info[2].second;
-
-    std::vector<int> rank_info[4]; //! 0->rank_in_world of local_comm, 1->rank_in_local_parent of local_comm
-                                   //! 2->rank_in_world of remote_comm, 3->rank_in_local_parent of remote_comm
+    ep_rank = local_comm->ep_comm_ptr->size_rank_info[0].first;
+    ep_rank_loc = local_comm->ep_comm_ptr->size_rank_info[1].first;
+    mpi_rank = local_comm->ep_comm_ptr->size_rank_info[2].first;
+    ep_size = local_comm->ep_comm_ptr->size_rank_info[0].second;
+    num_ep = local_comm->ep_comm_ptr->size_rank_info[1].second;
+    mpi_size = local_comm->ep_comm_ptr->size_rank_info[2].second;
+
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // step 1 : local leaders exchange ep_size, leader_rank_in_peer, leader_rank_in_peer_mpi, leader_rank_in_world. //
+    // local leaders bcast results to all ep in local_comm                                                          //
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    bool is_local_leader = ep_rank==local_leader? true: false;
+
+    int local_leader_rank_in_peer;
+    int local_leader_rank_in_peer_mpi;
+    int local_leader_rank_in_world;
+
+    int remote_ep_size;
+    int remote_leader_rank_in_peer;
+    int remote_leader_rank_in_peer_mpi;
+    int remote_leader_rank_in_world;
+
+    int send_quadruple[4];
+    int recv_quadruple[4];
+
+    if(is_local_leader)
+    {
+      MPI_Comm_rank(peer_comm, &local_leader_rank_in_peer);
+      ::MPI_Comm_rank(to_mpi_comm(peer_comm->mpi_comm), &local_leader_rank_in_peer_mpi);
+      ::MPI_Comm_rank(to_mpi_comm(MPI_COMM_WORLD->mpi_comm), &local_leader_rank_in_world);
+
+      send_quadruple[0] = ep_size;
+      send_quadruple[1] = local_leader_rank_in_peer;
+      send_quadruple[2] = local_leader_rank_in_peer_mpi;
+      send_quadruple[3] = local_leader_rank_in_world;
+
+      MPI_Request request;
+      MPI_Status status;
+
+      if(remote_leader > local_leader_rank_in_peer)
+      {
+        MPI_Isend(send_quadruple, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+
+        MPI_Irecv(recv_quadruple, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+      }
+      else
+      {
+        MPI_Irecv(recv_quadruple, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+
+        MPI_Isend(send_quadruple, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+      }
+
+      remote_ep_size = recv_quadruple[0];
+      remote_leader_rank_in_peer = recv_quadruple[1];
+      remote_leader_rank_in_peer_mpi = recv_quadruple[2];
+      remote_leader_rank_in_world = recv_quadruple[3];
+#ifdef _showinfo
+      printf("peer_rank = %d, packed exchange OK\n", local_leader_rank_in_peer);
+#endif
+    }
+
+    MPI_Bcast(send_quadruple, 4, MPI_INT, local_leader, local_comm);
+    MPI_Bcast(recv_quadruple, 4, MPI_INT, local_leader, local_comm);
+
+    if(!is_local_leader)
+    {
+      local_leader_rank_in_peer = send_quadruple[1];
+      local_leader_rank_in_peer_mpi = send_quadruple[2];
+      local_leader_rank_in_world = send_quadruple[3];
+
+      remote_ep_size = recv_quadruple[0];
+      remote_leader_rank_in_peer = recv_quadruple[1];
+      remote_leader_rank_in_peer_mpi = recv_quadruple[2];
+      remote_leader_rank_in_world = recv_quadruple[3];
+    }
+
+#ifdef _showinfo
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+    printf("peer_rank = %d, ep_size = %d, remote_ep_size = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, ep_size, remote_ep_size);
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+#endif
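The step-1 handshake added above sends and receives on the same (peer_comm, tag) channel in both directions. Serializing the two transfers by comparing peer ranks keeps the pairing deterministic even when the two endpoint leaders live in the same MPI process. A minimal plain-MPI sketch of the pattern (the helper name is mine, not part of the changeset):

    #include <mpi.h>

    // Each leader sends its quadruple and receives the remote one over one
    // (communicator, tag) pair. Ordering the operations by peer rank means
    // exactly one side sends first, so the two transfers pair up strictly.
    void exchange_quadruple(int send_quad[4], int recv_quad[4],
                            int my_rank_in_peer, int remote_leader,
                            int tag, MPI_Comm peer_comm)
    {
      MPI_Request request;
      MPI_Status  status;

      if (remote_leader > my_rank_in_peer)
      {
        MPI_Isend(send_quad, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
        MPI_Irecv(recv_quad, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
      }
      else
      {
        MPI_Irecv(recv_quad, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
        MPI_Isend(send_quad, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
      }
    }

The same send-first/receive-first ordering is reused verbatim in steps 2 and 7 below.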
+    ///////////////////////////////////////////////////////////////////
+    // step 2 : gather ranks in world for both local and remote comm //
+    ///////////////////////////////////////////////////////////////////

     int rank_in_world;

-    int rank_in_local_parent;
-
-    int rank_in_peer_mpi[2];
-
-    int local_ep_size = ep_size;
-    int remote_ep_size;
-
-    ::MPI_Comm local_mpi_comm = to_mpi_comm(local_comm.mpi_comm);
-
-    ::MPI_Comm_rank(to_mpi_comm(MPI_COMM_WORLD.mpi_comm), &rank_in_world);
-    ::MPI_Comm_rank(local_mpi_comm, &rank_in_local_parent);
-
-    bool is_proc_master = false;
-    bool is_local_leader = false;
-    bool is_final_master = false;
-
-    if(ep_rank == local_leader) { is_proc_master = true; is_local_leader = true; is_final_master = true;}
-    if(ep_rank_loc == 0 && mpi_rank != local_comm.rank_map->at(local_leader).second) is_proc_master = true;
-
-    int size_info[4]; //! used for choose size of rank_info 0-> mpi_size of local_comm, 1-> mpi_size of remote_comm
-
-    int leader_info[4]; //! 0->world rank of local_leader, 1->world rank of remote leader
-
-    std::vector<int> ep_info[2]; //! 0-> num_ep in local_comm, 1->num_ep in remote_comm
-
-    std::vector<int> new_rank_info[4];
-    std::vector<int> new_ep_info[2];
-
-    std::vector<int> offset;
-
-    if(is_proc_master)
-    {
-      size_info[0] = mpi_size;
-
-      rank_info[0].resize(size_info[0]);
-      rank_info[1].resize(size_info[0]);
-
-      ep_info[0].resize(size_info[0]);
-
-      vector<int> send_buf(6);
-      vector<int> recv_buf(3*size_info[0]);
-
-      send_buf[0] = rank_in_world;
-      send_buf[1] = rank_in_local_parent;
-      send_buf[2] = num_ep;
-
-      ::MPI_Allgather(send_buf.data(), 3, to_mpi_type(MPI_INT), recv_buf.data(), 3, to_mpi_type(MPI_INT), local_mpi_comm);
-
-      for(int i=0; i<size_info[0]; i++)
-      {
-        rank_info[0][i] = recv_buf[3*i];
-        rank_info[1][i] = recv_buf[3*i+1];
-        ep_info[0][i]   = recv_buf[3*i+2];
-      }
+    ::MPI_Comm_rank(to_mpi_comm(MPI_COMM_WORLD->mpi_comm), &rank_in_world);
+
+    int *ranks_in_world_local  = new int[ep_size];
+    int *ranks_in_world_remote = new int[remote_ep_size];
+
+    MPI_Allgather(&rank_in_world, 1, MPI_INT, ranks_in_world_local, 1, MPI_INT, local_comm);
+
+    if(is_local_leader)
+    {
+      MPI_Request request;
+      MPI_Status status;
+
+      if(remote_leader > local_leader_rank_in_peer)
+      {
+        MPI_Isend(ranks_in_world_local, ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+
+        MPI_Irecv(ranks_in_world_remote, remote_ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+      }
+      else
+      {
+        MPI_Irecv(ranks_in_world_remote, remote_ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+
+        MPI_Isend(ranks_in_world_local, ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+      }
+#ifdef _showinfo
+      printf("peer_rank = %d, ranks_in_world exchange OK\n", local_leader_rank_in_peer);
+#endif
+    }
+
+    MPI_Bcast(ranks_in_world_remote, remote_ep_size, MPI_INT, local_leader, local_comm);
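Step 2 is an ordinary allgather of one int per member, done on the endpoint communicator so every endpoint (not every process) contributes a value; the leaders then swap and broadcast the tables. A minimal plain-MPI sketch of the gather half (helper name is mine):

    #include <mpi.h>
    #include <vector>

    // Every member of local_comm contributes its world rank; everyone
    // receives the full table in member order.
    std::vector<int> gather_world_ranks(MPI_Comm local_comm)
    {
      int rank_in_world, size;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank_in_world);
      MPI_Comm_size(local_comm, &size);

      std::vector<int> ranks(size);
      MPI_Allgather(&rank_in_world, 1, MPI_INT, ranks.data(), 1, MPI_INT, local_comm);
      return ranks;
    }

In the EP version several endpoints of one process report the same world rank, which is exactly what step 5 later counts on.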
+#ifdef _showinfo
+
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+
+    if(remote_leader == 4)
+    {
+      for(int i=0; i<ep_size; i++)
+      {
+        if(ep_rank == i)
+        {
+          printf("peer_rank = %d, ranks_in_world_local = \n", peer_comm->ep_comm_ptr->size_rank_info[0].first);
+          for(int i=0; i<ep_size; i++)
+          {
+            printf("%d\t", ranks_in_world_local[i]);
+          }
+
+          printf("\npeer_rank = %d, ranks_in_world_remote = \n", peer_comm->ep_comm_ptr->size_rank_info[0].first);
+          for(int i=0; i<remote_ep_size; i++)
+          {
+            printf("%d\t", ranks_in_world_remote[i]);
+          }
+          printf("\n");
+        }
+
+        MPI_Barrier(local_comm);
+        MPI_Barrier(local_comm);
+        MPI_Barrier(local_comm);
+      }
+    }
+
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+
+    if(remote_leader == 13)
+    {
+      for(int i=0; i<ep_size; i++)
+      {
+        if(ep_rank == i)
+        {
+          printf("peer_rank = %d, ranks_in_world_local = \n", peer_comm->ep_comm_ptr->size_rank_info[0].first);
+          for(int i=0; i<ep_size; i++)
+          {
+            printf("%d\t", ranks_in_world_local[i]);
+          }
+
+          printf("\npeer_rank = %d, ranks_in_world_remote = \n", peer_comm->ep_comm_ptr->size_rank_info[0].first);
+          for(int i=0; i<remote_ep_size; i++)
+          {
+            printf("%d\t", ranks_in_world_remote[i]);
+          }
+          printf("\n");
+        }
+
+        MPI_Barrier(local_comm);
+        MPI_Barrier(local_comm);
+        MPI_Barrier(local_comm);
+      }
+    }
+
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+
+#endif
+
+    //////////////////////////////////////////////////////////////
+    // step 3 : determine the priority and ownership of each ep //
+    //////////////////////////////////////////////////////////////
+
+    bool priority = local_leader_rank_in_peer > remote_leader_rank_in_peer? true : false;
+
+    int ownership;
+
+    if(rank_in_world == ranks_in_world_local[local_leader]) ownership = 1;
+    else if(rank_in_world == remote_leader_rank_in_world) ownership = 0;
+    else
+    {
+      ownership = 1;
+      for(int i=0; i<remote_ep_size; i++)
+      {
+        if(rank_in_world == ranks_in_world_remote[i])
+        {
+          ownership = priority? 1 : 0;
+          break;
+        }
+      }
+    }
+
+#ifdef _showinfo
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+    printf("peer_rank = %d, priority = %d, local_leader_rank_in_peer = %d, remote_leader_rank_in_peer = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, priority, local_leader_rank_in_peer, remote_leader_rank_in_peer);
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+#endif
+
+#ifdef _showinfo
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+    printf("peer_rank = %d, priority = %d, ownership = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, priority, ownership);
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+#endif
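The ownership rule decides which side of the intercomm creates the endpoints for a process that appears in both groups: the local leader process always owns, the remote leader process never does, and a shared non-leader process owns only if the local side has priority. A sketch of the rule as a standalone function (names are mine, logic mirrors the added code):

    #include <vector>

    bool compute_priority(int local_leader_rank_in_peer, int remote_leader_rank_in_peer)
    {
      return local_leader_rank_in_peer > remote_leader_rank_in_peer;
    }

    int compute_ownership(int rank_in_world,
                          int local_leader_world, int remote_leader_world,
                          const std::vector<int>& ranks_in_world_remote, bool priority)
    {
      if (rank_in_world == local_leader_world)  return 1;   // local leader always owns
      if (rank_in_world == remote_leader_world) return 0;   // remote leader never owns here
      for (int r : ranks_in_world_remote)
        if (rank_in_world == r) return priority ? 1 : 0;    // shared process: priority decides
      return 1;                                             // not shared: owned locally
    }

Since priority is computed from the same two peer ranks on both sides, exactly one side of every shared process ends up with ownership = 1.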
+    //////////////////////////////////////////////////////
+    // step 4 : extract local_comm and create intercomm //
+    //////////////////////////////////////////////////////
+
+    bool is_involved = is_local_leader || (!is_local_leader && ep_rank_loc == 0 && rank_in_world != local_leader_rank_in_world);
+
+#ifdef _showinfo
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+    printf("peer_rank = %d, is_involved = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, is_involved);
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+#endif
+
+    if(is_involved)
+    {
+      ::MPI_Group local_group;
+      ::MPI_Group extracted_group;
+      ::MPI_Comm *extracted_comm = new ::MPI_Comm;
+
+      ::MPI_Comm_group(to_mpi_comm(local_comm->mpi_comm), &local_group);
+
+      int *ownership_list = new int[mpi_size];
+      int *mpi_rank_list = new int[mpi_size];
+
+      ::MPI_Allgather(&ownership, 1, to_mpi_type(MPI_INT), ownership_list, 1, to_mpi_type(MPI_INT), to_mpi_comm(local_comm->mpi_comm));
+      ::MPI_Allgather(&mpi_rank, 1, to_mpi_type(MPI_INT), mpi_rank_list, 1, to_mpi_type(MPI_INT), to_mpi_comm(local_comm->mpi_comm));
+
+      int n=0;
+      for(int i=0; i<mpi_size; i++)
+      {
+        n+=ownership_list[i];
+      }
+
+      int *new_mpi_rank_list = new int[n];
+      int j=0;
+      for(int i=0; i<mpi_size; i++)
+      {
+        if(ownership_list[i] !=0)
+        {
+          new_mpi_rank_list[j++] = mpi_rank_list[i];
+        }
+      }
+
+      ::MPI_Group_incl(local_group, n, new_mpi_rank_list, &extracted_group);
+
+      ::MPI_Comm_create(to_mpi_comm(local_comm->mpi_comm), extracted_group, extracted_comm);
+
+      ::MPI_Comm *mpi_inter_comm = new ::MPI_Comm;
+
+      int local_leader_rank_in_extracted_comm;

     if(is_local_leader)
     {
-      leader_info[0] = rank_in_world;
-      leader_info[1] = remote_leader;
-
-      ::MPI_Comm_rank(to_mpi_comm(peer_comm.mpi_comm), &rank_in_peer_mpi[0]);
-
-      send_buf[0] = size_info[0];
-      send_buf[1] = local_ep_size;
-      send_buf[2] = rank_in_peer_mpi[0];
-
-      MPI_Request requests[2];
-      MPI_Status statuses[2];
-
-      MPI_Isend(send_buf.data(), 3, MPI_INT, remote_leader, tag, peer_comm, &requests[0]);
-      MPI_Irecv(recv_buf.data(), 3, MPI_INT, remote_leader, tag, peer_comm, &requests[1]);
-
-      MPI_Waitall(2, requests, statuses);
-
-      size_info[1] = recv_buf[0];
-      remote_ep_size = recv_buf[1];
-      rank_in_peer_mpi[1] = recv_buf[2];
-    }
-
-    send_buf[0] = size_info[1];
-    send_buf[1] = leader_info[0];
-    send_buf[2] = leader_info[1];
-    send_buf[3] = rank_in_peer_mpi[0];
-    send_buf[4] = rank_in_peer_mpi[1];
-
-    ::MPI_Bcast(send_buf.data(), 5, to_mpi_type(MPI_INT), local_comm.rank_map->at(local_leader).second, local_mpi_comm);
-
-    size_info[1] = send_buf[0];
-    leader_info[0] = send_buf[1];
-    leader_info[1] = send_buf[2];
-    rank_in_peer_mpi[0] = send_buf[3];
-    rank_in_peer_mpi[1] = send_buf[4];
-
-    rank_info[2].resize(size_info[1]);
-    rank_info[3].resize(size_info[1]);
-
-    ep_info[1].resize(size_info[1]);
-
-    send_buf.resize(3*size_info[0]);
-    recv_buf.resize(3*size_info[1]);
-
-    if(is_local_leader)
-    {
-      MPI_Request requests[2];
-      MPI_Status statuses[2];
-
-      std::copy ( rank_info[0].data(), rank_info[0].data() + size_info[0], send_buf.begin() );
-      std::copy ( rank_info[1].data(), rank_info[1].data() + size_info[0], send_buf.begin() + size_info[0] );
-      std::copy ( ep_info[0].data(), ep_info[0].data() + size_info[0], send_buf.begin() + 2*size_info[0] );
-
-      MPI_Isend(send_buf.data(), 3*size_info[0], MPI_INT, remote_leader, tag+1, peer_comm, &requests[0]);
-      MPI_Irecv(recv_buf.data(), 3*size_info[1], MPI_INT, remote_leader, tag+1, peer_comm, &requests[1]);
-
-      MPI_Waitall(2, requests, statuses);
-    }
-
-    ::MPI_Bcast(recv_buf.data(), 3*size_info[1], to_mpi_type(MPI_INT), local_comm.rank_map->at(local_leader).second, local_mpi_comm);
-
-    std::copy ( recv_buf.data(), recv_buf.data() + size_info[1], rank_info[2].begin() );
-    std::copy ( recv_buf.data() + size_info[1], recv_buf.data() + 2*size_info[1], rank_info[3].begin() );
-    std::copy ( recv_buf.data() + 2*size_info[1], recv_buf.data() + 3*size_info[1], ep_info[1].begin() );
-
-    offset.resize(size_info[0]);
-
-    if(leader_info[0]<leader_info[1]) // erase all ranks doubled with remote_comm, except the local leader
-    {
-      bool found = false;
-      int ep_local;
-      int ep_remote;
-      for(int i=0; i<size_info[0]; i++)
-      {
-        int target = rank_info[0][i];
-        found = false;
-        for(int j=0; j<size_info[1]; j++)
-        {
-          if(target == rank_info[2][j])
-          {
-            found = true;
-            ep_local = ep_info[0][j];
-            ep_remote = ep_info[1][j];
-            break;
-          }
-        }
-        if(found)
-        {
-          if(target == leader_info[0]) // the leader is doubled in remote
-          {
-            new_rank_info[0].push_back(target);
-            new_rank_info[1].push_back(rank_info[1][i]);
-
-            new_ep_info[0].push_back(ep_local + ep_remote);
-            offset[i] = 0;
-          }
-          else
-          {
-            offset[i] = ep_local;
-          }
-        }
-        else
-        {
-          new_rank_info[0].push_back(target);
-          new_rank_info[1].push_back(rank_info[1][i]);
-
-          new_ep_info[0].push_back(ep_info[0][i]);
-
-          offset[i] = 0;
-        }
-      }
-    }
-    else // erase rank doubled with remote leader
-    {
-      bool found = false;
-      int ep_local;
-      int ep_remote;
-      for(int i=0; i<size_info[0]; i++)
-      {
-        int target = rank_info[0][i];
-        found = false;
-        for(int j=0; j<size_info[1]; j++)
-        {
-          if(target == rank_info[2][j])
-          {
-            found = true;
-            ep_local = ep_info[0][j];
-            ep_remote = ep_info[1][j];
-            break;
-          }
-        }
-        if(found)
-        {
-          if(target != leader_info[1])
-          {
-            new_rank_info[0].push_back(target);
-            new_rank_info[1].push_back(rank_info[1][i]);
-
-            new_ep_info[0].push_back(ep_local + ep_remote);
-            offset[i] = 0;
-          }
-          else // found remote leader
-          {
-            offset[i] = ep_remote;
-          }
-        }
-        else
-        {
-          new_rank_info[0].push_back(target);
-          new_rank_info[1].push_back(rank_info[1][i]);
-
-          new_ep_info[0].push_back(ep_info[0][i]);
-          offset[i] = 0;
-        }
-      }
-    }
+      ::MPI_Comm_rank(*extracted_comm, &local_leader_rank_in_extracted_comm);
+    }
+
+    ::MPI_Bcast(&local_leader_rank_in_extracted_comm, 1, to_mpi_type(MPI_INT), local_comm->ep_rank_map->at(local_leader).second, to_mpi_comm(local_comm->mpi_comm));
+
+    if(ownership)
+      ::MPI_Intercomm_create(*extracted_comm, local_leader_rank_in_extracted_comm, to_mpi_comm(peer_comm->mpi_comm), remote_leader_rank_in_peer_mpi, tag, mpi_inter_comm);
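Step 4 is the standard MPI group-extraction idiom: gather the ownership flags, include only owning ranks in a new group, create a sub-communicator, and connect the two sub-communicators through the peer communicator. A self-contained plain-MPI sketch under those assumptions (the helper and its signature are mine):

    #include <mpi.h>
    #include <vector>

    MPI_Comm build_intercomm(MPI_Comm local_mpi_comm, MPI_Comm peer_mpi_comm,
                             int ownership, int my_mpi_rank,
                             int local_leader_root,           // leader's rank in local_mpi_comm
                             int remote_leader_rank_in_peer, int tag)
    {
      int mpi_size;
      MPI_Comm_size(local_mpi_comm, &mpi_size);

      std::vector<int> ownership_list(mpi_size), mpi_rank_list(mpi_size);
      MPI_Allgather(&ownership,   1, MPI_INT, ownership_list.data(), 1, MPI_INT, local_mpi_comm);
      MPI_Allgather(&my_mpi_rank, 1, MPI_INT, mpi_rank_list.data(),  1, MPI_INT, local_mpi_comm);

      std::vector<int> owners;                       // owning ranks, in rank order
      for (int i = 0; i < mpi_size; i++)
        if (ownership_list[i] != 0) owners.push_back(mpi_rank_list[i]);

      MPI_Group local_group, extracted_group;
      MPI_Comm  extracted_comm, inter_comm = MPI_COMM_NULL;
      MPI_Comm_group(local_mpi_comm, &local_group);
      MPI_Group_incl(local_group, (int)owners.size(), owners.data(), &extracted_group);
      MPI_Comm_create(local_mpi_comm, extracted_group, &extracted_comm);  // collective call

      int leader_rank_in_extracted;                  // the leader is an owner by construction
      if (my_mpi_rank == local_leader_root)
        MPI_Comm_rank(extracted_comm, &leader_rank_in_extracted);
      MPI_Bcast(&leader_rank_in_extracted, 1, MPI_INT, local_leader_root, local_mpi_comm);

      if (ownership)                                 // only members of extracted_comm join
        MPI_Intercomm_create(extracted_comm, leader_rank_in_extracted,
                             peer_mpi_comm, remote_leader_rank_in_peer, tag, &inter_comm);
      return inter_comm;
    }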
-    if(offset[mpi_rank] == 0)
-    {
-      is_final_master = true;
-    }
-
-    //! size_info[4]: 2->size of new_ep_info for local, 3->size of new_ep_info for remote
-
-    if(is_local_leader)
-    {
-      size_info[2] = new_ep_info[0].size();
-      MPI_Request requests[2];
-      MPI_Status statuses[2];
-      MPI_Isend(&size_info[2], 1, MPI_INT, remote_leader, tag+2, peer_comm, &requests[0]);
-      MPI_Irecv(&size_info[3], 1, MPI_INT, remote_leader, tag+2, peer_comm, &requests[1]);
-
-      MPI_Waitall(2, requests, statuses);
-    }
-
-    ::MPI_Bcast(&size_info[2], 2, to_mpi_type(MPI_INT), local_comm.rank_map->at(local_leader).second, local_mpi_comm);
-
-    new_rank_info[2].resize(size_info[3]);
-    new_rank_info[3].resize(size_info[3]);
-    new_ep_info[1].resize(size_info[3]);
-
-    send_buf.resize(size_info[2]);
-    recv_buf.resize(size_info[3]);
-
-    if(is_local_leader)
-    {
-      MPI_Request requests[2];
-      MPI_Status statuses[2];
-
-      std::copy ( new_rank_info[0].data(), new_rank_info[0].data() + size_info[2], send_buf.begin() );
-      std::copy ( new_rank_info[1].data(), new_rank_info[1].data() + size_info[2], send_buf.begin() + size_info[2] );
-      std::copy ( new_ep_info[0].data(), new_ep_info[0].data() + size_info[0], send_buf.begin() + 2*size_info[2] );
-
-      MPI_Isend(send_buf.data(), 3*size_info[2], MPI_INT, remote_leader, tag+3, peer_comm, &requests[0]);
-      MPI_Irecv(recv_buf.data(), 3*size_info[3], MPI_INT, remote_leader, tag+3, peer_comm, &requests[1]);
-
-      MPI_Waitall(2, requests, statuses);
-    }
-
-    ::MPI_Bcast(recv_buf.data(), 3*size_info[3], to_mpi_type(MPI_INT), local_comm.rank_map->at(local_leader).second, local_mpi_comm);
-
-    std::copy ( recv_buf.data(), recv_buf.data() + size_info[3], new_rank_info[2].begin() );
-    std::copy ( recv_buf.data() + size_info[3], recv_buf.data() + 2*size_info[3], new_rank_info[3].begin() );
-    std::copy ( recv_buf.data() + 2*size_info[3], recv_buf.data() + 3*size_info[3], new_ep_info[1].begin() );
-  }
-
-  if(is_proc_master)
-  {
-    //! leader_info[4]: 2-> rank of local leader in new_group generated comm;
-    //                  3-> rank of remote leader in new_group generated comm;
-    ::MPI_Group local_group;
-    ::MPI_Group new_group;
-    ::MPI_Comm *new_comm = new ::MPI_Comm;
-    ::MPI_Comm *intercomm = new ::MPI_Comm;
-
-    ::MPI_Comm_group(local_mpi_comm, &local_group);
-
-    ::MPI_Group_incl(local_group, size_info[2], new_rank_info[1].data(), &new_group);
-
-    ::MPI_Comm_create(local_mpi_comm, new_group, new_comm);
-
-    if(is_local_leader)
-    {
-      ::MPI_Comm_rank(*new_comm, &leader_info[2]);
-    }
-
-    ::MPI_Bcast(&leader_info[2], 1, to_mpi_type(MPI_INT), local_comm.rank_map->at(local_leader).second, local_mpi_comm);
-
-    if(new_comm != static_cast< ::MPI_Comm*>(MPI_COMM_NULL.mpi_comm))
-    {
-      ::MPI_Barrier(*new_comm);
-
-      ::MPI_Intercomm_create(*new_comm, leader_info[2], to_mpi_comm(peer_comm.mpi_comm), rank_in_peer_mpi[1], tag, intercomm);
-
-      int id;
-
-      ::MPI_Comm_rank(*new_comm, &id);
-      int my_num_ep = new_ep_info[0][id];
-
-      MPI_Comm *ep_intercomm;
-      MPI_Info info;
-      MPI_Comm_create_endpoints(new_comm, my_num_ep, info, ep_intercomm);
-
-      for(int i= 0; i<my_num_ep; i++)
-      {
-        ep_intercomm[i].is_intercomm = true;
-
-        ep_intercomm[i].ep_comm_ptr->intercomm = new ep_lib::ep_intercomm;
-        ep_intercomm[i].ep_comm_ptr->intercomm->mpi_inter_comm = intercomm;
-        ep_intercomm[i].ep_comm_ptr->comm_label = leader_info[0];
-      }
-
-      #pragma omp critical (write_to_tag_list)
-      tag_list.push_back(make_pair( make_pair(tag, min(leader_info[0], leader_info[1])) , ep_intercomm));
-      //printf("tag_list size = %lu\n", tag_list.size());
-    }
-  }
+      ////////////////////////////////////
+      // step 5 :: determine new num_ep //
+      ////////////////////////////////////
+
+      int num_ep_count=0;
+
+      for(int i=0; i<ep_size; i++)
+      {
+        if(rank_in_world == ranks_in_world_local[i])
+          num_ep_count++;
+      }
+
+      for(int i=0; i<remote_ep_size; i++)
+      {
+        if(rank_in_world == ranks_in_world_remote[i])
+          num_ep_count++;
+      }
+
+      ///////////////////////////////////////////////////
+      // step 6 : create endpoints from extracted_comm //
+      ///////////////////////////////////////////////////
+
+      if(ownership)
+      {
+        MPI_Comm *ep_comm;
+        MPI_Info info;
+        MPI_Comm_create_endpoints(extracted_comm, num_ep_count, info, ep_comm);
+
+#ifdef _showinfo
+        printf("new ep_comm->ep_comm_ptr->intercomm->mpi_inter_comm = %p\n", mpi_inter_comm);
+#endif
+
+        for(int i=0; i<num_ep_count; i++)
+        {
+          ep_comm[i]->is_intercomm = true;
+          ep_comm[i]->ep_comm_ptr->comm_label = ranks_in_world_local[local_leader];
+          ep_comm[i]->ep_comm_ptr->intercomm = new ep_lib::ep_intercomm;
+#ifdef _showinfo
+          printf("new ep_comm[%d]->ep_comm_ptr->intercomm = %p\n", i, ep_comm[i]->ep_comm_ptr->intercomm);
+#endif
+          ep_comm[i]->ep_comm_ptr->intercomm->mpi_inter_comm = mpi_inter_comm;
+        }
+
+        #pragma omp critical (write_to_tag_list)
+        intercomm_list.push_back(make_pair( make_pair(tag, min(local_leader_rank_in_world, remote_leader_rank_in_world)) , make_pair(ep_comm , make_pair(num_ep_count, 0))));
+        #pragma omp flush
+#ifdef _showinfo
+        for(int i=0; i<num_ep_count; i++)
+          printf("peer_rank = %d, ep_comm = %p, ep_comm[%d] -> new_ep_rank = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, ep_comm, i, ep_comm[i]->ep_comm_ptr->size_rank_info[0].first);
+#endif
+      }
+
+      delete ownership_list;
+      delete mpi_rank_list;
+      delete new_mpi_rank_list;
+    }
+
+    int repeated=0;
+    for(int i=0; i<remote_ep_size; i++)
+    {
+      if(rank_in_world == ranks_in_world_remote[i])
+        repeated++;
+    }
+
+    int my_turn = ownership? ep_rank_loc : ep_rank_loc+repeated;
+
+#ifdef _showinfo
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+    printf("peer_rank = %d, ep_rank_loc = %d, ownership = %d, repeated = %d, my_turn = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, ep_rank_loc, ownership, repeated, my_turn);
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+#endif
+
+    #pragma omp flush
+    #pragma omp critical (read_from_intercomm_list)
+    {
+      bool flag=true;
+      while(flag)
+      {
+        for(std::list<std::pair<std::pair<int, int> , std::pair<MPI_Comm * , std::pair<int, int> > > >::iterator iter = intercomm_list.begin(); iter!=intercomm_list.end(); iter++)
+        {
+          if(iter->first == make_pair(tag, min(local_leader_rank_in_world, remote_leader_rank_in_world)))
+          {
+            *newintercomm = iter->second.first[my_turn];
+
+            iter->second.second.second++;
+
+            if(iter->second.second.first == iter->second.second.second)
+              intercomm_list.erase(iter);
+
+            flag = false;
+            break;
+          }
+        }
+      }
+    }
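Step 6 hands the freshly created endpoint communicators from the owning thread to the other threads of the same process through a shared list keyed by (tag, min leader world rank); each consumer takes the slot given by my_turn and the last consumer retires the entry. A simplified, self-contained sketch of that hand-off (types narrowed to ints; names are mine, and unlike the changeset a single named critical section guards both producer and consumer, which is the simplest way to make the two ends mutually exclusive):

    #include <list>
    #include <utility>

    struct Entry { std::pair<int,int> key; int* slots; int total; int taken; };
    static std::list<Entry> intercomm_list;   // shared by the threads of one process

    void publish(std::pair<int,int> key, int* slots, int total)
    {
      #pragma omp critical (intercomm_list_access)
      intercomm_list.push_back(Entry{key, slots, total, 0});
    }

    int consume_slot(std::pair<int,int> key, int my_turn)
    {
      int value = 0;
      bool found = false;
      while (!found)                          // spin until the producer has published
      {
        #pragma omp critical (intercomm_list_access)
        {
          for (std::list<Entry>::iterator it = intercomm_list.begin();
               it != intercomm_list.end(); ++it)
            if (it->key == key)
            {
              value = it->slots[my_turn];     // copy my slot out of the shared entry
              if (++it->taken == it->total)   // last consumer erases the entry
                intercomm_list.erase(it);
              found = true;
              break;
            }
        }
      }
      return value;
    }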
-    vector<int> bcast_buf(8);
-    if(is_local_leader)
-    {
-      std::copy(size_info, size_info+4, bcast_buf.begin());
-      std::copy(leader_info, leader_info+4, bcast_buf.begin()+4);
-    }
-
-    MPI_Bcast(bcast_buf.data(), 8, MPI_INT, local_leader, local_comm);
-
-    if(!is_local_leader)
-    {
-      std::copy(bcast_buf.begin(), bcast_buf.begin()+4, size_info);
-      std::copy(bcast_buf.begin()+4, bcast_buf.begin()+8, leader_info);
-    }
-
-    if(!is_local_leader)
-    {
-      new_rank_info[1].resize(size_info[2]);
-      ep_info[1].resize(size_info[1]);
-      offset.resize(size_info[0]);
-    }
-
-    bcast_buf.resize(size_info[2]+size_info[1]+size_info[0]+1);
-
-    if(is_local_leader)
-    {
-      bcast_buf[0] = remote_ep_size;
-      std::copy(new_rank_info[1].data(), new_rank_info[1].data()+size_info[2], bcast_buf.begin()+1);
-      std::copy(ep_info[1].data(), ep_info[1].data()+size_info[1], bcast_buf.begin()+size_info[2]+1);
-      std::copy(offset.data(), offset.data()+size_info[0], bcast_buf.begin()+size_info[2]+size_info[1]+1);
-    }
-
-    MPI_Bcast(bcast_buf.data(), size_info[2]+size_info[1]+size_info[0]+1, MPI_INT, local_leader, local_comm);
-
-    if(!is_local_leader)
-    {
-      remote_ep_size = bcast_buf[0];
-      std::copy(bcast_buf.data()+1, bcast_buf.data()+1+size_info[2], new_rank_info[1].begin());
-      std::copy(bcast_buf.data()+1+size_info[2], bcast_buf.data()+1+size_info[2]+size_info[1], ep_info[1].begin());
-      std::copy(bcast_buf.data()+1+size_info[2]+size_info[1], bcast_buf.data()+1+size_info[2]+size_info[1]+size_info[0], offset.begin());
-    }
-
-    int my_position = offset[rank_in_local_parent]+ep_rank_loc;
-
-    MPI_Barrier_local(local_comm);
-    #pragma omp flush
-
-    #pragma omp critical (read_from_tag_list)
-    {
-      bool found = false;
-      while(!found)
-      {
-        for(std::list<std::pair < std::pair<int,int>, MPI_Comm* > >::iterator iter = tag_list.begin(); iter!=tag_list.end(); iter++)
-        {
-          if((*iter).first == make_pair(tag, min(leader_info[0], leader_info[1])))
-          {
-            *newintercomm = iter->second[my_position];
-            found = true;
-            break;
-          }
-        }
-      }
-    }
-
-    MPI_Barrier(local_comm);
+
+#ifdef _showinfo
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+    printf("peer_rank = %d, test_rank = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, (*newintercomm)->ep_comm_ptr->size_rank_info[0].first);
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+#endif
+
+    //////////////////////////////////////////////////////////
+    // step 7 : create intercomm_rank_map for local leaders //
+    //////////////////////////////////////////////////////////
+
+    int my_quadruple[4];
+
+    my_quadruple[0] = ep_rank;
+    my_quadruple[1] = (*newintercomm)->ep_comm_ptr->size_rank_info[1].first;
+    my_quadruple[2] = (*newintercomm)->ep_comm_ptr->size_rank_info[2].first;
+    my_quadruple[3] = (*newintercomm)->ep_comm_ptr->comm_label;
+
+#ifdef _showinfo
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+    printf("peer_rank = %d, my_quadruple = %d %d %d %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, my_quadruple[0], my_quadruple[1], my_quadruple[2], my_quadruple[3]);
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+#endif
+
+    int *local_quadruple_list;
+    int *remote_quadruple_list;
+    if(is_involved)
+    {
+      local_quadruple_list = new int[4*ep_size];
+      remote_quadruple_list = new int[4*remote_ep_size];
+    }
+
+    MPI_Gather(my_quadruple, 4, MPI_INT, local_quadruple_list, 4, MPI_INT, local_leader, local_comm);
+
+    if(is_local_leader)
+    {
+      MPI_Request request;
+      MPI_Status status;
+
+      if(remote_leader > local_leader_rank_in_peer)
+      {
+        MPI_Isend(local_quadruple_list, 4*ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+
+        MPI_Irecv(remote_quadruple_list, 4*remote_ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+      }
+      else
+      {
+        MPI_Irecv(remote_quadruple_list, 4*remote_ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+
+        MPI_Isend(local_quadruple_list, 4*ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
+        MPI_Wait(&request, &status);
+      }
+#ifdef _showinfo
+      printf("peer_rank = %d, quadruple_list exchange OK\n", local_leader_rank_in_peer);
+#endif
+    }
+
+    if(is_involved)
+    {
+      ::MPI_Bcast(remote_quadruple_list, 4*remote_ep_size, to_mpi_type(MPI_INT), local_comm->ep_rank_map->at(local_leader).second, to_mpi_comm(local_comm->mpi_comm));
+      (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map = new INTERCOMM_RANK_MAP;
+#ifdef _showinfo
+      printf("new intercomm_rank_map = %p\n", (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map);
+#endif
+      for(int i=0; i<remote_ep_size; i++)
+      {
+        (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->insert(std::pair<int, std::pair< int, std::pair<int, int> > >(remote_quadruple_list[4*i], remote_quadruple_list[4*i+1], remote_quadruple_list[4*i+2], remote_quadruple_list[4*i+3]));
+      }
+    }
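In step 7 each remote endpoint is described by four ints (ep rank, local rank, mpi rank, comm label) and the flat buffer is unpacked into a map from ep rank to the nested pair. A sketch of that unpacking, written with nested make_pair, which is presumably what the changeset's four-argument std::pair insert intends:

    #include <map>
    #include <utility>

    typedef std::map<int, std::pair<int, std::pair<int, int> > > INTERCOMM_RANK_MAP;

    // quadruples is a flat buffer of 4*remote_ep_size ints:
    // [ep_rank, ep_rank_loc, mpi_rank, comm_label] per remote endpoint.
    INTERCOMM_RANK_MAP build_rank_map(const int* quadruples, int remote_ep_size)
    {
      INTERCOMM_RANK_MAP m;
      for (int i = 0; i < remote_ep_size; i++)
      {
        const int* q = &quadruples[4 * i];
        m.insert(std::make_pair(q[0], std::make_pair(q[1], std::make_pair(q[2], q[3]))));
      }
      return m;
    }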
-    if(is_local_leader)
-    {
-      int local_flag = true;
-      int remote_flag = false;
-      MPI_Request mpi_requests[2];
-      MPI_Status mpi_statuses[2];
-
-      MPI_Isend(&local_flag, 1, MPI_INT, remote_leader, tag, peer_comm, &mpi_requests[0]);
-      MPI_Irecv(&remote_flag, 1, MPI_INT, remote_leader, tag, peer_comm, &mpi_requests[1]);
-
-      MPI_Waitall(2, mpi_requests, mpi_statuses);
-    }
-
-    MPI_Barrier(local_comm);
-
-    if(is_proc_master)
-    {
-      for(std::list<std::pair < std::pair<int,int>, MPI_Comm* > >::iterator iter = tag_list.begin(); iter!=tag_list.end(); iter++)
-      {
-        if((*iter).first == make_pair(tag, min(leader_info[0], leader_info[1])))
-        {
-          tag_list.erase(iter);
-          break;
-        }
-      }
-    }
-
-    int intercomm_ep_rank, intercomm_ep_rank_loc, intercomm_mpi_rank;
-    int intercomm_ep_size, intercomm_num_ep, intercomm_mpi_size;
-
-    intercomm_ep_rank = newintercomm->ep_comm_ptr->size_rank_info[0].first;
-    intercomm_ep_rank_loc = newintercomm->ep_comm_ptr->size_rank_info[1].first;
-    intercomm_mpi_rank = newintercomm->ep_comm_ptr->size_rank_info[2].first;
-    intercomm_ep_size = newintercomm->ep_comm_ptr->size_rank_info[0].second;
-    intercomm_num_ep = newintercomm->ep_comm_ptr->size_rank_info[1].second;
-    intercomm_mpi_size = newintercomm->ep_comm_ptr->size_rank_info[2].second;
-
-    MPI_Bcast(&remote_ep_size, 1, MPI_INT, local_leader, local_comm);
-
-    int my_rank_map_elem[2];
-
-    my_rank_map_elem[0] = intercomm_ep_rank;
-    my_rank_map_elem[1] = (*newintercomm).ep_comm_ptr->comm_label;
-
-    vector<pair<int, int> > local_rank_map_array;
-    vector<pair<int, int> > remote_rank_map_array;
+    ////////////////////////////////////////////////////////
+    // step 8 : associate intercomm_rank_map to endpoints //
+    ////////////////////////////////////////////////////////
+
+    int *leader_rank_in_world_local_gathered = new int[(*newintercomm)->ep_comm_ptr->size_rank_info[1].second];
+
+    MPI_Allgather_local(&local_leader_rank_in_world, 1, MPI_INT, leader_rank_in_world_local_gathered, *newintercomm);
+
+    int new_rank_loc = (*newintercomm)->ep_comm_ptr->size_rank_info[1].first;
+    int *new_rank_loc_local_gathered = new int[(*newintercomm)->ep_comm_ptr->size_rank_info[1].second];
+
+    MPI_Allgather_local(&new_rank_loc, 1, MPI_INT, new_rank_loc_local_gathered, *newintercomm);
+
+    //printf("peer_rank = %d, leader_rank_in_world_local_gathered = %d %d %d %d, new_rank_loc_local_gathered = %d %d %d %d\n",
+    //        peer_comm->ep_comm_ptr->size_rank_info[0].first, leader_rank_in_world_local_gathered[0], leader_rank_in_world_local_gathered[1], leader_rank_in_world_local_gathered[2], leader_rank_in_world_local_gathered[3],
+    //        new_rank_loc_local_gathered[0], new_rank_loc_local_gathered[1], new_rank_loc_local_gathered[2], new_rank_loc_local_gathered[3]);
+
+    if(is_involved)
+    {
+      (*newintercomm)->ep_comm_ptr->intercomm->local_rank_map = new EP_RANK_MAP;
+#ifdef _showinfo
+      printf("new local_rank_map = %p\n", (*newintercomm)->ep_comm_ptr->intercomm->local_rank_map);
+#endif
+      for(std::map<int, std::pair<int, int> >::iterator it = local_comm->ep_rank_map->begin(); it != local_comm->ep_rank_map->end(); it++)
+      {
+        (*newintercomm)->ep_comm_ptr->intercomm->local_rank_map->insert(std::pair<int, std::pair< int, int > >(it->first, it->second.first, it->second.second));
+      }
+    }
+
+    MPI_Barrier_local(*newintercomm);
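MPI_Allgather_local and MPI_Barrier_local are EP-library primitives, not standard MPI: they act only among the endpoints hosted by one MPI process, i.e. among OpenMP threads sharing memory. A rough emulation of the gather, under that assumption (the staging buffer must be shared by the threads; the function name and signature are mine):

    #include <cstring>

    // All endpoints of one process call this with their private out buffer;
    // staging is a buffer of num_ep ints shared by those endpoints.
    void allgather_local(int value, int* out, int num_ep, int my_rank_loc, int* staging)
    {
      staging[my_rank_loc] = value;                      // publish my contribution
      #pragma omp barrier                                // wait until every slot is filled
      std::memcpy(out, staging, num_ep * sizeof(int));   // copy the table privately
      #pragma omp barrier                                // keep staging stable until all copies end
    }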
-    (*newintercomm).ep_comm_ptr->intercomm->local_rank_map = new RANK_MAP;
-    (*newintercomm).ep_comm_ptr->intercomm->local_rank_map->resize(local_ep_size);
-
-    MPI_Allgather(my_rank_map_elem, 2, MPI_INT,
-                  (*newintercomm).ep_comm_ptr->intercomm->local_rank_map->data(), 2, MPI_INT, local_comm);
-
-    (*newintercomm).ep_comm_ptr->intercomm->remote_rank_map = new RANK_MAP;
-    (*newintercomm).ep_comm_ptr->intercomm->remote_rank_map->resize(remote_ep_size);
-
-    (*newintercomm).ep_comm_ptr->intercomm->size_rank_info[0] = local_comm.ep_comm_ptr->size_rank_info[0];
-    (*newintercomm).ep_comm_ptr->intercomm->size_rank_info[1] = local_comm.ep_comm_ptr->size_rank_info[1];
-    (*newintercomm).ep_comm_ptr->intercomm->size_rank_info[2] = local_comm.ep_comm_ptr->size_rank_info[2];
-
-    int local_intercomm_size = intercomm_ep_size;
-    int remote_intercomm_size;
-
-    int new_bcast_root_0 = 0;
-    int new_bcast_root = 0;
-
-    if(is_local_leader)
-    {
-      MPI_Request requests[4];
-      MPI_Status statuses[4];
-
-      MPI_Isend((*newintercomm).ep_comm_ptr->intercomm->local_rank_map->data(), 2*local_ep_size, MPI_INT, remote_leader, tag+4, peer_comm, &requests[0]);
-      MPI_Irecv((*newintercomm).ep_comm_ptr->intercomm->remote_rank_map->data(), 2*remote_ep_size, MPI_INT, remote_leader, tag+4, peer_comm, &requests[1]);
-
-      MPI_Isend(&local_intercomm_size, 1, MPI_INT, remote_leader, tag+5, peer_comm, &requests[2]);
-      MPI_Irecv(&remote_intercomm_size, 1, MPI_INT, remote_leader, tag+5, peer_comm, &requests[3]);
-
-      MPI_Waitall(4, requests, statuses);
-
-      new_bcast_root_0 = intercomm_ep_rank;
-    }
-
-    MPI_Allreduce(&new_bcast_root_0, &new_bcast_root, 1, MPI_INT, MPI_SUM, *newintercomm);
-
-    MPI_Bcast((*newintercomm).ep_comm_ptr->intercomm->remote_rank_map->data(), 2*remote_ep_size, MPI_INT, local_leader, local_comm);
-    MPI_Bcast(&remote_intercomm_size, 1, MPI_INT, new_bcast_root, *newintercomm);
-
-    (*newintercomm).ep_comm_ptr->intercomm->intercomm_rank_map = new RANK_MAP;
-    (*newintercomm).ep_comm_ptr->intercomm->intercomm_rank_map->resize(remote_intercomm_size);
-
-    if(is_local_leader)
-    {
-      MPI_Request requests[2];
-      MPI_Status statuses[2];
-
-      MPI_Isend((*newintercomm).rank_map->data(), 2*local_intercomm_size, MPI_INT, remote_leader, tag+6, peer_comm, &requests[0]);
-      MPI_Irecv((*newintercomm).ep_comm_ptr->intercomm->intercomm_rank_map->data(), 2*remote_intercomm_size, MPI_INT, remote_leader, tag+6, peer_comm, &requests[1]);
-
-      MPI_Waitall(2, requests, statuses);
-    }
-
-    MPI_Bcast((*newintercomm).ep_comm_ptr->intercomm->intercomm_rank_map->data(), 2*remote_intercomm_size, MPI_INT, new_bcast_root, *newintercomm);
-
-    (*newintercomm).ep_comm_ptr->intercomm->local_comm = &(local_comm.ep_comm_ptr->comm_list[ep_rank_loc]);
-    (*newintercomm).ep_comm_ptr->intercomm->intercomm_tag = tag;
+    if(!is_involved)
+    {
+      int target;
+      for(int i=0; i<(*newintercomm)->ep_comm_ptr->size_rank_info[1].second; i++)
+      {
+        if(local_leader_rank_in_world == leader_rank_in_world_local_gathered[i])
+        {
+          target = i;
+          (*newintercomm)->ep_comm_ptr->intercomm->intercomm_tag = target;
+          break;
+        }
+      }
+      (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map = (*newintercomm)->ep_comm_ptr->comm_list[target]->ep_comm_ptr->intercomm->intercomm_rank_map;
+      (*newintercomm)->ep_comm_ptr->intercomm->local_rank_map = (*newintercomm)->ep_comm_ptr->comm_list[target]->ep_comm_ptr->intercomm->local_rank_map;
+    }
+    else
+    {
+      (*newintercomm)->ep_comm_ptr->intercomm->intercomm_tag = -1;
+    }
+    (*newintercomm)->ep_comm_ptr->intercomm->size_rank_info[0] = (*newintercomm)->ep_comm_ptr->size_rank_info[0];
+    (*newintercomm)->ep_comm_ptr->intercomm->size_rank_info[1] = (*newintercomm)->ep_comm_ptr->size_rank_info[1];
+    (*newintercomm)->ep_comm_ptr->intercomm->size_rank_info[2] = (*newintercomm)->ep_comm_ptr->size_rank_info[2];
+
+    (*newintercomm)->ep_comm_ptr->size_rank_info[0] = local_comm->ep_comm_ptr->size_rank_info[0];
+    (*newintercomm)->ep_comm_ptr->size_rank_info[1] = local_comm->ep_comm_ptr->size_rank_info[1];
+    (*newintercomm)->ep_comm_ptr->size_rank_info[2] = local_comm->ep_comm_ptr->size_rank_info[2];
+
+#ifdef _showinfo
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+
+    printf("peer_rank = %d, size_rank_info = %d %d %d %d %d %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first,
+           (*newintercomm)->ep_comm_ptr->size_rank_info[0].first, (*newintercomm)->ep_comm_ptr->size_rank_info[0].second,
+           (*newintercomm)->ep_comm_ptr->size_rank_info[1].first, (*newintercomm)->ep_comm_ptr->size_rank_info[1].second,
+           (*newintercomm)->ep_comm_ptr->size_rank_info[2].first, (*newintercomm)->ep_comm_ptr->size_rank_info[2].second);
+
+    MPI_Barrier(peer_comm);
+    MPI_Barrier(peer_comm);
+#endif
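The size_rank_info array read and written throughout this file follows one convention. A sketch of that layout, inferred from how the fields are used above (the struct itself is illustrative, not the library's type):

    #include <utility>

    struct SizeRankInfo
    {
      std::pair<int,int> size_rank_info[3];
      // size_rank_info[0] = (ep_rank,     ep_size)  : endpoint rank/size over the whole EP comm
      // size_rank_info[1] = (ep_rank_loc, num_ep)   : endpoint rank/count within its MPI process
      // size_rank_info[2] = (mpi_rank,    mpi_size) : rank/size of the hosting MPI processes
    };

For example, 2 MPI processes with 3 endpoints each give endpoint 4 the triplets (4,6), (1,3), (1,2).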

     /*
-    for(int i=0; i<local_ep_size; i++)
-    if(local_comm.ep_comm_ptr->comm_label == 0) printf("ep_rank (from EP) = %d, local_rank_map[%d] = (%d,%d)\n", intercomm_ep_rank, i,
-          (*newintercomm).ep_comm_ptr->intercomm->local_rank_map->at(i).first, (*newintercomm).ep_comm_ptr->intercomm->local_rank_map->at(i).second);
-
-    for(int i=0; i<remote_ep_size; i++)
-    if(local_comm.ep_comm_ptr->comm_label == 0) printf("ep_rank (from EP) = %d, remote_rank_map[%d] = (%d,%d)\n", intercomm_ep_rank, i,
-          (*newintercomm).ep_comm_ptr->intercomm->remote_rank_map->at(i).first, (*newintercomm).ep_comm_ptr->intercomm->remote_rank_map->at(i).second);
-
-    for(int i=0; i<remote_intercomm_size; i++)
-    if(local_comm.ep_comm_ptr->comm_label == 0) printf("ep_rank (from EP) = %d, intercomm_rank_map[%d] = (%d,%d)\n", intercomm_ep_rank, i,
-          (*newintercomm).ep_comm_ptr->intercomm->intercomm_rank_map->at(i).first, (*newintercomm).ep_comm_ptr->intercomm->intercomm_rank_map->at(i).second);
+    if(peer_comm->ep_comm_ptr->size_rank_info[0].first == 5)
+    {
+      int receiver = rand()%10;
+      printf("receiver = %d, intercomm_local_rank = %d, intercomm_mpi_rank = %d, comm_label = %d\n", receiver, (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).first,
+             (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).second.first, (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).second.second);
+    }
+
+    if(peer_comm->ep_comm_ptr->size_rank_info[0].first == 9)
+    {
+      int receiver = rand()%6;
+      printf("receiver = %d, intercomm_local_rank = %d, intercomm_mpi_rank = %d, comm_label = %d\n", receiver, (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).first,
+             (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).second.first, (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).second.second);
+    }
+
+    if(peer_comm->ep_comm_ptr->size_rank_info[0].first == 5)
+    {
+      for(int i=0; i<ep_size; i++)
+      {
+        printf("rank_map->at(%d) = %d, %d\n", i, (*newintercomm)->ep_rank_map->at(i).first, (*newintercomm)->ep_rank_map->at(i).second);
+      }
+    }
     */
-    // for(int i=0; i<(*newintercomm).rank_map->size(); i++)
-    // if(local_comm.ep_comm_ptr->comm_label != 99) printf("ep_rank = %d, rank_map[%d] = (%d,%d)\n", intercomm_ep_rank, i,
-    //       (*newintercomm).rank_map->at(i).first, (*newintercomm).rank_map->at(i).second);
-
-    // MPI_Comm *test_comm = newintercomm->ep_comm_ptr->intercomm->local_comm;
-    // int test_rank;
-    // MPI_Comm_rank(*test_comm, &test_rank);
-    // printf("=================test_rank = %d\n", test_rank);
-
-    return MPI_SUCCESS;
+    //////////////
+    // clean up //
+    //////////////
+
+    delete ranks_in_world_local;
+    delete ranks_in_world_remote;
+
+    if(is_involved)
+    {
+      delete local_quadruple_list;
+      delete remote_quadruple_list;
+    }
+
+    delete leader_rank_in_world_local_gathered;
+    delete new_rank_loc_local_gathered;

   }
…

-
-  int MPI_Intercomm_create_unique_leader(MPI_Comm local_comm, int local_leader, MPI_Comm peer_comm, int remote_leader, int tag, MPI_Comm *newintercomm)
-  {
-    //! mpi_size of local comm = 1
-    //! same world rank of leaders
-
-    int ep_rank, ep_rank_loc, mpi_rank;
-    int ep_size, num_ep, mpi_size;
-
-    ep_rank = local_comm.ep_comm_ptr->size_rank_info[0].first;
-    ep_rank_loc = local_comm.ep_comm_ptr->size_rank_info[1].first;
-    mpi_rank = local_comm.ep_comm_ptr->size_rank_info[2].first;
-    ep_size = local_comm.ep_comm_ptr->size_rank_info[0].second;
-    num_ep = local_comm.ep_comm_ptr->size_rank_info[1].second;
-    mpi_size = local_comm.ep_comm_ptr->size_rank_info[2].second;
-
-    std::vector<int> rank_info[4]; //! 0->rank_in_world of local_comm, 1->rank_in_local_parent of local_comm
-                                   //! 2->rank_in_world of remote_comm, 3->rank_in_local_parent of remote_comm
-
-    int rank_in_world;
-
-    int rank_in_peer_mpi[2];
-
-    ::MPI_Comm_rank(to_mpi_comm(MPI_COMM_WORLD.mpi_comm), &rank_in_world);
-
-    int local_num_ep = num_ep;
-    int remote_num_ep;
-    int total_num_ep;
-
-    int leader_rank_in_peer[2];
-
-    int my_position;
-    int tag_label[2];
-
-    vector<int> send_buf(4);
-    vector<int> recv_buf(4);
-
-    if(ep_rank == local_leader)
-    {
-      MPI_Status status;
-
-      MPI_Comm_rank(peer_comm, &leader_rank_in_peer[0]);
-
-      send_buf[0] = local_num_ep;
-      send_buf[1] = leader_rank_in_peer[0];
-
-      MPI_Request req_s, req_r;
-
-      MPI_Isend(send_buf.data(), 2, MPI_INT, remote_leader, tag, peer_comm, &req_s);
-      MPI_Irecv(recv_buf.data(), 2, MPI_INT, remote_leader, tag, peer_comm, &req_r);
-
-      MPI_Wait(&req_s, &status);
-      MPI_Wait(&req_r, &status);
-
-      recv_buf[2] = leader_rank_in_peer[0];
-    }
-
-    MPI_Bcast(recv_buf.data(), 3, MPI_INT, local_leader, local_comm);
-
-    remote_num_ep = recv_buf[0];
-    leader_rank_in_peer[1] = recv_buf[1];
-    leader_rank_in_peer[0] = recv_buf[2];
-
-    total_num_ep = local_num_ep + remote_num_ep;
-    if(leader_rank_in_peer[0] < leader_rank_in_peer[1])
-    {
-      my_position = ep_rank_loc;
-      //! LEADER create EP
-      if(ep_rank == local_leader)
-      {
-        ::MPI_Comm *mpi_dup = new ::MPI_Comm;
-
-        ::MPI_Comm_dup(to_mpi_comm(local_comm.mpi_comm), mpi_dup);
-
-        MPI_Comm *ep_intercomm;
-        MPI_Info info;
-        MPI_Comm_create_endpoints(mpi_dup, total_num_ep, info, ep_intercomm);
-
-        for(int i=0; i<total_num_ep; i++)
-        {
-          ep_intercomm[i].is_intercomm = true;
-          ep_intercomm[i].ep_comm_ptr->intercomm = new ep_lib::ep_intercomm;
-          ep_intercomm[i].ep_comm_ptr->intercomm->mpi_inter_comm = 0;
-
-          ep_intercomm[i].ep_comm_ptr->comm_label = leader_rank_in_peer[0];
-        }
-
-        tag_label[0] = TAG++;
-        tag_label[1] = rank_in_world;
-
-        #pragma omp critical (write_to_tag_list)
-        tag_list.push_back(make_pair( make_pair(tag_label[0], tag_label[1]) , ep_intercomm));
-
-        MPI_Request req_s;
-        MPI_Status sta_s;
-        MPI_Isend(tag_label, 2, MPI_INT, remote_leader, tag, peer_comm, &req_s);
-
-        MPI_Wait(&req_s, &sta_s);
-      }
-    }
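The removed function's rendezvous works through a (label, world rank) key rather than the intercomm_list key used by the new code: the creating leader draws a fresh label from the library's global TAG counter, pairs it with its world rank, and ships the pair to the remote leader so both sides look up the same tag_list entry. A minimal plain-MPI sketch of the publishing side (helper name is mine; TAG stands in for the library's global counter):

    #include <mpi.h>

    void publish_tag_label(int tag_label[2], int rank_in_world,
                           int remote_leader, int tag, MPI_Comm peer_comm, int* TAG)
    {
      tag_label[0] = (*TAG)++;        // label unique within this process
      tag_label[1] = rank_in_world;   // disambiguates labels from different processes
      MPI_Request req;
      MPI_Status  sta;
      MPI_Isend(tag_label, 2, MPI_INT, remote_leader, tag, peer_comm, &req);
      MPI_Wait(&req, &sta);
    }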
-    else
-    {
-      //! Wait for EP creation
-      my_position = remote_num_ep + ep_rank_loc;
-      if(ep_rank == local_leader)
-      {
-        MPI_Status status;
-        MPI_Request req_r;
-        MPI_Irecv(tag_label, 2, MPI_INT, remote_leader, tag, peer_comm, &req_r);
-        MPI_Wait(&req_r, &status);
-      }
-    }
-
-    MPI_Bcast(tag_label, 2, MPI_INT, local_leader, local_comm);
-
-    #pragma omp critical (read_from_tag_list)
-    {
-      bool found = false;
-      while(!found)
-      {
-        for(std::list<std::pair < std::pair<int,int>, MPI_Comm* > >::iterator iter = tag_list.begin(); iter!=tag_list.end(); iter++)
-        {
-          if((*iter).first == make_pair(tag_label[0], tag_label[1]))
-          {
-            *newintercomm = iter->second[my_position];
-            found = true;
-            // tag_list.erase(iter);
-            break;
-          }
-        }
-      }
-    }
-
-    MPI_Barrier_local(local_comm);
-
-    if(leader_rank_in_peer[0] < leader_rank_in_peer[1])
-    {
-      for(std::list<std::pair < std::pair<int,int>, MPI_Comm* > >::iterator iter = tag_list.begin(); iter!=tag_list.end(); iter++)
-      {
-        if((*iter).first == make_pair(tag_label[0], tag_label[1]))
-        {
-          tag_list.erase(iter);
-          break;
-        }
-      }
-    }
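In this unique-leader path the side whose leader has the lower peer rank creates all total_num_ep endpoints, so the merged slot array is laid out as the creator's endpoints first, then the other side's. The removed code computes each endpoint's slot accordingly; as a tiny sketch (function name is mine):

    // Slot of this endpoint inside the merged array of total_num_ep endpoints:
    // the creating side occupies [0, local_num_ep), the waiting side starts
    // after the remote block.
    int my_position_in_merged(bool i_am_on_creating_side, int ep_rank_loc, int remote_num_ep)
    {
      return i_am_on_creating_side ? ep_rank_loc : remote_num_ep + ep_rank_loc;
    }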
-    int intercomm_ep_rank, intercomm_ep_rank_loc, intercomm_mpi_rank;
-    int intercomm_ep_size, intercomm_num_ep, intercomm_mpi_size;
-
-    intercomm_ep_rank = newintercomm->ep_comm_ptr->size_rank_info[0].first;
-    intercomm_ep_rank_loc = newintercomm->ep_comm_ptr->size_rank_info[1].first;
-    intercomm_mpi_rank = newintercomm->ep_comm_ptr->size_rank_info[2].first;
-    intercomm_ep_size = newintercomm->ep_comm_ptr->size_rank_info[0].second;
-    intercomm_num_ep = newintercomm->ep_comm_ptr->size_rank_info[1].second;
-    intercomm_mpi_size = newintercomm->ep_comm_ptr->size_rank_info[2].second;
-
-    (*newintercomm).ep_comm_ptr->intercomm->local_rank_map = new RANK_MAP;
-    (*newintercomm).ep_comm_ptr->intercomm->remote_rank_map = new RANK_MAP;
-    (*newintercomm).ep_comm_ptr->intercomm->local_rank_map->resize(local_num_ep);
-    (*newintercomm).ep_comm_ptr->intercomm->remote_rank_map->resize(remote_num_ep);
-
-    (*newintercomm).ep_comm_ptr->intercomm->size_rank_info[0] = local_comm.ep_comm_ptr->size_rank_info[0];
-    (*newintercomm).ep_comm_ptr->intercomm->size_rank_info[1] = local_comm.ep_comm_ptr->size_rank_info[1];
-    (*newintercomm).ep_comm_ptr->intercomm->size_rank_info[2] = local_comm.ep_comm_ptr->size_rank_info[2];
-
-    int local_rank_map_ele[2];
-    local_rank_map_ele[0] = intercomm_ep_rank;
-    local_rank_map_ele[1] = (*newintercomm).ep_comm_ptr->comm_label;
-
-    MPI_Allgather(local_rank_map_ele, 2, MPI_INT,
-                  (*newintercomm).ep_comm_ptr->intercomm->local_rank_map->data(), 2, MPI_INT, local_comm);
-
-    if(ep_rank == local_leader)
-    {
-      MPI_Status status;
-      MPI_Request req_s, req_r;
-
-      MPI_Isend((*newintercomm).ep_comm_ptr->intercomm->local_rank_map->data(), 2*local_num_ep, MPI_INT, remote_leader, tag, peer_comm, &req_s);
-      MPI_Irecv((*newintercomm).ep_comm_ptr->intercomm->remote_rank_map->data(), 2*remote_num_ep, MPI_INT, remote_leader, tag, peer_comm, &req_r);
-
-      MPI_Wait(&req_s, &status);
-      MPI_Wait(&req_r, &status);
-    }
-
-    MPI_Bcast((*newintercomm).ep_comm_ptr->intercomm->remote_rank_map->data(), 2*remote_num_ep, MPI_INT, local_leader, local_comm);
-    (*newintercomm).ep_comm_ptr->intercomm->local_comm = &(local_comm.ep_comm_ptr->comm_list[ep_rank_loc]);
-    (*newintercomm).ep_comm_ptr->intercomm->intercomm_tag = tag;
-
-    return MPI_SUCCESS;
-  }

 }
Note: See TracChangeset for help on using the changeset viewer.