#include "ep_lib.hpp"
#include <mpi.h>
#include "ep_declaration.hpp"
#include "ep_mpi.hpp"

using namespace std;

namespace ep_lib
{
  int MPI_Intercomm_create_kernel(MPI_Comm local_comm, int local_leader, MPI_Comm peer_comm, int remote_leader, int tag, MPI_Comm *newintercomm)
  {
    int ep_rank, ep_rank_loc, mpi_rank;
    int ep_size, num_ep, mpi_size;

    ep_rank     = local_comm->ep_comm_ptr->size_rank_info[0].first;
    ep_rank_loc = local_comm->ep_comm_ptr->size_rank_info[1].first;
    mpi_rank    = local_comm->ep_comm_ptr->size_rank_info[2].first;
    ep_size     = local_comm->ep_comm_ptr->size_rank_info[0].second;
    num_ep      = local_comm->ep_comm_ptr->size_rank_info[1].second;
    mpi_size    = local_comm->ep_comm_ptr->size_rank_info[2].second;

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // step 1 : local leaders exchange ep_size, leader_rank_in_peer, leader_rank_in_peer_mpi, leader_rank_in_world. //
    //          local leaders bcast results to all ep in local_comm                                                 //
    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    bool is_local_leader = ep_rank==local_leader? true: false;

    int local_leader_rank_in_peer;
    int local_leader_rank_in_peer_mpi;
    int local_leader_rank_in_world;

    int remote_ep_size;
    int remote_leader_rank_in_peer;
    int remote_leader_rank_in_peer_mpi;
    int remote_leader_rank_in_world;

    int send_quadruple[4];
    int recv_quadruple[4];

    if(is_local_leader)
    {
      MPI_Comm_rank(peer_comm, &local_leader_rank_in_peer);
      ::MPI_Comm_rank(to_mpi_comm(peer_comm->mpi_comm), &local_leader_rank_in_peer_mpi);
      ::MPI_Comm_rank(to_mpi_comm(MPI_COMM_WORLD->mpi_comm), &local_leader_rank_in_world);

      send_quadruple[0] = ep_size;
      send_quadruple[1] = local_leader_rank_in_peer;
      send_quadruple[2] = local_leader_rank_in_peer_mpi;
      send_quadruple[3] = local_leader_rank_in_world;

      MPI_Request request;
      MPI_Status status;

      if(remote_leader > local_leader_rank_in_peer)
      {
        MPI_Isend(send_quadruple, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);

        MPI_Irecv(recv_quadruple, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
      }
      else
      {
        MPI_Irecv(recv_quadruple, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);

        MPI_Isend(send_quadruple, 4, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
      }

      remote_ep_size                 = recv_quadruple[0];
      remote_leader_rank_in_peer     = recv_quadruple[1];
      remote_leader_rank_in_peer_mpi = recv_quadruple[2];
      remote_leader_rank_in_world    = recv_quadruple[3];

#ifdef _showinfo
      printf("peer_rank = %d, packed exchange OK\n", local_leader_rank_in_peer);
#endif
    }

    MPI_Bcast(send_quadruple, 4, MPI_INT, local_leader, local_comm);
    MPI_Bcast(recv_quadruple, 4, MPI_INT, local_leader, local_comm);

    if(!is_local_leader)
    {
      local_leader_rank_in_peer     = send_quadruple[1];
      local_leader_rank_in_peer_mpi = send_quadruple[2];
      local_leader_rank_in_world    = send_quadruple[3];

      remote_ep_size                 = recv_quadruple[0];
      remote_leader_rank_in_peer     = recv_quadruple[1];
      remote_leader_rank_in_peer_mpi = recv_quadruple[2];
      remote_leader_rank_in_world    = recv_quadruple[3];
    }

#ifdef _showinfo
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
    printf("peer_rank = %d, ep_size = %d, remote_ep_size = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, ep_size, remote_ep_size);
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
#endif
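    // The two leaders post their sends and receives in opposite orders, keyed
    // on their ranks in peer_comm: the leader with the smaller peer rank
    // receives first. Both messages travel on peer_comm with the same tag, so
    // this fixed ordering keeps the handshake deadlock-free and unambiguous
    // even when the two leaders are endpoints hosted by the same MPI process.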
    ///////////////////////////////////////////////////////////////////
    // step 2 : gather ranks in world for both local and remote comm //
    ///////////////////////////////////////////////////////////////////

    int rank_in_world;
    ::MPI_Comm_rank(to_mpi_comm(MPI_COMM_WORLD->mpi_comm), &rank_in_world);

    int *ranks_in_world_local  = new int[ep_size];
    int *ranks_in_world_remote = new int[remote_ep_size];

    MPI_Allgather(&rank_in_world, 1, MPI_INT, ranks_in_world_local, 1, MPI_INT, local_comm);

    if(is_local_leader)
    {
      MPI_Request request;
      MPI_Status status;

      if(remote_leader > local_leader_rank_in_peer)
      {
        MPI_Isend(ranks_in_world_local, ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);

        MPI_Irecv(ranks_in_world_remote, remote_ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
      }
      else
      {
        MPI_Irecv(ranks_in_world_remote, remote_ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);

        MPI_Isend(ranks_in_world_local, ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
      }

#ifdef _showinfo
      printf("peer_rank = %d, ranks_in_world exchange OK\n", local_leader_rank_in_peer);
#endif
    }

    MPI_Bcast(ranks_in_world_remote, remote_ep_size, MPI_INT, local_leader, local_comm);

#ifdef _showinfo

    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);

    if(remote_leader == 4)
    {
      for(int j=0; j<ep_size; j++)
      {
        if(ep_rank == j)
        {
          printf("peer_rank = %d, ranks_in_world_local = \n", peer_comm->ep_comm_ptr->size_rank_info[0].first);
          for(int i=0; i<ep_size; i++) printf("%d\t", ranks_in_world_local[i]);
          printf("\n");

          printf("peer_rank = %d, ranks_in_world_remote = \n", peer_comm->ep_comm_ptr->size_rank_info[0].first);
          for(int i=0; i<remote_ep_size; i++) printf("%d\t", ranks_in_world_remote[i]);
          printf("\n");
        }
        MPI_Barrier(local_comm);
      }
    }

    if(remote_leader == 13)
    {
      for(int j=0; j<ep_size; j++)
      {
        if(ep_rank == j)
        {
          printf("peer_rank = %d, ranks_in_world_local = \n", peer_comm->ep_comm_ptr->size_rank_info[0].first);
          for(int i=0; i<ep_size; i++) printf("%d\t", ranks_in_world_local[i]);
          printf("\n");

          printf("peer_rank = %d, ranks_in_world_remote = \n", peer_comm->ep_comm_ptr->size_rank_info[0].first);
          for(int i=0; i<remote_ep_size; i++) printf("%d\t", ranks_in_world_remote[i]);
          printf("\n");
        }
        MPI_Barrier(local_comm);
      }
    }

    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);

#endif

    ///////////////////////////////////////////////////////////////
    // step 3 : determine the priority and ownership of each ep //
    ///////////////////////////////////////////////////////////////

    bool priority = local_leader_rank_in_peer > remote_leader_rank_in_peer? true : false;

    int ownership;

    if(rank_in_world == ranks_in_world_local[local_leader]) ownership = 1;
    else if(rank_in_world == remote_leader_rank_in_world) ownership = 0;
    else
    {
      ownership = 1;
      for(int i=0; i<remote_ep_size; i++)
      {
        if(rank_in_world == ranks_in_world_remote[i])
        {
          ownership = priority? 1 : 0;
          break;
        }
      }
    }

#ifdef _showinfo
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
    printf("peer_rank = %d, priority = %d, local_leader_rank_in_peer = %d, remote_leader_rank_in_peer = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, priority, local_leader_rank_in_peer, remote_leader_rank_in_peer);
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
#endif

#ifdef _showinfo
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
    printf("peer_rank = %d, priority = %d, ownership = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, priority, ownership);
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
#endif

    //////////////////////////////////////////////////////
    // step 4 : extract local_comm and create intercomm //
    //////////////////////////////////////////////////////

    bool is_involved = is_local_leader || (!is_local_leader && ep_rank_loc == 0 && rank_in_world != local_leader_rank_in_world);

#ifdef _showinfo
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
    printf("peer_rank = %d, is_involved = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, is_involved);
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
#endif

    if(is_involved)
    {
      ::MPI_Group local_group;
      ::MPI_Group extracted_group;
      ::MPI_Comm *extracted_comm = new ::MPI_Comm;

      ::MPI_Comm_group(to_mpi_comm(local_comm->mpi_comm), &local_group);

      int *ownership_list = new int[mpi_size];
      int *mpi_rank_list  = new int[mpi_size];

      ::MPI_Allgather(&ownership, 1, to_mpi_type(MPI_INT), ownership_list, 1, to_mpi_type(MPI_INT), to_mpi_comm(local_comm->mpi_comm));
      ::MPI_Allgather(&mpi_rank,  1, to_mpi_type(MPI_INT), mpi_rank_list,  1, to_mpi_type(MPI_INT), to_mpi_comm(local_comm->mpi_comm));

      // count the owning MPI processes and collect their ranks
      int n = 0;
      for(int i=0; i<mpi_size; i++)
      {
        n += ownership_list[i];
      }

      int *new_mpi_rank_list = new int[n];
      int j = 0;
      for(int i=0; i<mpi_size; i++)
      {
        if(ownership_list[i] != 0)
        {
          new_mpi_rank_list[j++] = mpi_rank_list[i];
        }
      }

      ::MPI_Group_incl(local_group, n, new_mpi_rank_list, &extracted_group);
      ::MPI_Comm_create(to_mpi_comm(local_comm->mpi_comm), extracted_group, extracted_comm);

      ::MPI_Comm *mpi_inter_comm = new ::MPI_Comm;

      int local_leader_rank_in_extracted_comm;

      if(is_local_leader)
      {
        ::MPI_Comm_rank(*extracted_comm, &local_leader_rank_in_extracted_comm);
      }

      ::MPI_Bcast(&local_leader_rank_in_extracted_comm, 1, to_mpi_type(MPI_INT), local_comm->ep_rank_map->at(local_leader).second, to_mpi_comm(local_comm->mpi_comm));
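      // Only processes owning endpoints on this side join the MPI-level
      // intercommunicator: extracted_comm groups exactly those MPI ranks, and
      // ownership resolves the case where one MPI process hosts endpoints of
      // both sides (the side with priority keeps it). The two groups passed to
      // ::MPI_Intercomm_create below are therefore disjoint, as the MPI
      // standard requires.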
      if(ownership)
        ::MPI_Intercomm_create(*extracted_comm, local_leader_rank_in_extracted_comm, to_mpi_comm(peer_comm->mpi_comm), remote_leader_rank_in_peer_mpi, tag, mpi_inter_comm);

      ///////////////////////////////////
      // step 5 : determine new num_ep //
      ///////////////////////////////////

      int num_ep_count = 0;

      for(int i=0; i<ep_size; i++)
      {
        if(rank_in_world == ranks_in_world_local[i]) num_ep_count++;
      }

      for(int i=0; i<remote_ep_size; i++)
      {
        if(rank_in_world == ranks_in_world_remote[i]) num_ep_count++;
      }

      ///////////////////////////////////////////////////
      // step 6 : create endpoints from extracted_comm //
      ///////////////////////////////////////////////////

      if(ownership)
      {
        MPI_Comm *ep_comm;
        MPI_Info info;
        MPI_Comm_create_endpoints(extracted_comm, num_ep_count, info, ep_comm);

#ifdef _showinfo
        printf("new ep_comm->ep_comm_ptr->intercomm->mpi_inter_comm = %p\n", mpi_inter_comm);
#endif

        for(int i=0; i<num_ep_count; i++)
        {
          ep_comm[i]->is_intercomm = true;
          ep_comm[i]->ep_comm_ptr->comm_label = ranks_in_world_local[local_leader];
          ep_comm[i]->ep_comm_ptr->intercomm = new ep_lib::ep_intercomm;
#ifdef _showinfo
          printf("new ep_comm[%d]->ep_comm_ptr->intercomm = %p\n", i, ep_comm[i]->ep_comm_ptr->intercomm);
#endif
          ep_comm[i]->ep_comm_ptr->intercomm->mpi_inter_comm = mpi_inter_comm;
        }

        #pragma omp critical (write_to_tag_list)
        intercomm_list.push_back(make_pair(make_pair(tag, min(local_leader_rank_in_world, remote_leader_rank_in_world)), make_pair(ep_comm, make_pair(num_ep_count, 0))));
        #pragma omp flush

#ifdef _showinfo
        for(int i=0; i<num_ep_count; i++)
          printf("peer_rank = %d, ep_comm = %p, ep_comm[%d] -> new_ep_rank = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, ep_comm, i, ep_comm[i]->ep_comm_ptr->size_rank_info[0].first);
#endif
      }

      delete[] ownership_list;
      delete[] mpi_rank_list;
      delete[] new_mpi_rank_list;
    }

    int repeated = 0;
    for(int i=0; i<remote_ep_size; i++)
    {
      if(ranks_in_world_remote[i] == rank_in_world) repeated++;
    }

    // slot of this endpoint in the ep_comm array published by the owning side
    int my_turn = ownership==1? ep_rank_loc : ep_rank_loc + repeated;

#ifdef _showinfo
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
    printf("peer_rank = %d, ep_rank_loc = %d, ownership = %d, repeated = %d, my_turn = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, ep_rank_loc, ownership, repeated, my_turn);
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
#endif

    #pragma omp flush
    #pragma omp critical (read_from_intercomm_list)
    {
      bool flag = true;
      while(flag)
      {
        for(std::list<std::pair<std::pair<int, int> , std::pair<MPI_Comm* , std::pair<int, int> > > >::iterator iter = intercomm_list.begin(); iter != intercomm_list.end(); iter++)
        {
          if(iter->first == make_pair(tag, min(local_leader_rank_in_world, remote_leader_rank_in_world)))
          {
            *newintercomm = iter->second.first[my_turn];

            iter->second.second.second++;
            if(iter->second.second.first == iter->second.second.second) intercomm_list.erase(iter);

            flag = false;
            break;
          }
        }
      }
    }

#ifdef _showinfo
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
    printf("peer_rank = %d, test_rank = %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, (*newintercomm)->ep_comm_ptr->size_rank_info[0].first);
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
#endif

    //////////////////////////////////////////////////////////
    // step 7 : create intercomm_rank_map for local leaders //
    //////////////////////////////////////////////////////////

    int my_quadruple[4];
    my_quadruple[0] = ep_rank;
    my_quadruple[1] = (*newintercomm)->ep_comm_ptr->size_rank_info[1].first;
    my_quadruple[2] = (*newintercomm)->ep_comm_ptr->size_rank_info[2].first;
    my_quadruple[3] = (*newintercomm)->ep_comm_ptr->comm_label;

#ifdef _showinfo
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
    printf("peer_rank = %d, my_quadruple = %d %d %d %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first, my_quadruple[0], my_quadruple[1], my_quadruple[2], my_quadruple[3]);
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
#endif

    int *local_quadruple_list;
    int *remote_quadruple_list;

    if(is_involved)
    {
      local_quadruple_list  = new int[4*ep_size];
      remote_quadruple_list = new int[4*remote_ep_size];
    }

    MPI_Gather(my_quadruple, 4, MPI_INT, local_quadruple_list, 4, MPI_INT, local_leader, local_comm);
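    // The gathered list now holds one quadruple (ep rank, local rank, mpi
    // rank, comm label) per endpoint of this side. The two leaders swap these
    // lists over peer_comm (same ordering rule as in step 1), giving each side
    // the data it needs to translate a remote ep rank into an addressable
    // endpoint.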
    if(is_local_leader)
    {
      MPI_Request request;
      MPI_Status status;

      if(remote_leader > local_leader_rank_in_peer)
      {
        MPI_Isend(local_quadruple_list, 4*ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);

        MPI_Irecv(remote_quadruple_list, 4*remote_ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
      }
      else
      {
        MPI_Irecv(remote_quadruple_list, 4*remote_ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);

        MPI_Isend(local_quadruple_list, 4*ep_size, MPI_INT, remote_leader, tag, peer_comm, &request);
        MPI_Wait(&request, &status);
      }

#ifdef _showinfo
      printf("peer_rank = %d, quadruple_list exchange OK\n", local_leader_rank_in_peer);
#endif
    }

    if(is_involved)
    {
      ::MPI_Bcast(remote_quadruple_list, 4*remote_ep_size, to_mpi_type(MPI_INT), local_comm->ep_rank_map->at(local_leader).second, to_mpi_comm(local_comm->mpi_comm));

      (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map = new INTERCOMM_RANK_MAP;
#ifdef _showinfo
      printf("new intercomm_rank_map = %p\n", (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map);
#endif
      for(int i=0; i<remote_ep_size; i++)
      {
        // key = remote ep rank, value = (local rank, (mpi rank, comm label))
        (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->insert(std::make_pair(remote_quadruple_list[4*i], std::make_pair(remote_quadruple_list[4*i+1], std::make_pair(remote_quadruple_list[4*i+2], remote_quadruple_list[4*i+3]))));
      }
    }

    ////////////////////////////////////////////////////////
    // step 8 : associate intercomm_rank_map to endpoints //
    ////////////////////////////////////////////////////////

    int *leader_rank_in_world_local_gathered = new int[(*newintercomm)->ep_comm_ptr->size_rank_info[1].second];
    MPI_Allgather_local(&local_leader_rank_in_world, 1, MPI_INT, leader_rank_in_world_local_gathered, *newintercomm);

    int new_rank_loc = (*newintercomm)->ep_comm_ptr->size_rank_info[1].first;
    int *new_rank_loc_local_gathered = new int[(*newintercomm)->ep_comm_ptr->size_rank_info[1].second];
    MPI_Allgather_local(&new_rank_loc, 1, MPI_INT, new_rank_loc_local_gathered, *newintercomm);

    //printf("peer_rank = %d, leader_rank_in_world_local_gathered = %d %d %d %d, new_rank_loc_local_gathered = %d %d %d %d\n",
    //       peer_comm->ep_comm_ptr->size_rank_info[0].first, leader_rank_in_world_local_gathered[0], leader_rank_in_world_local_gathered[1], leader_rank_in_world_local_gathered[2], leader_rank_in_world_local_gathered[3],
    //       new_rank_loc_local_gathered[0], new_rank_loc_local_gathered[1], new_rank_loc_local_gathered[2], new_rank_loc_local_gathered[3]);

    if(is_involved)
    {
      (*newintercomm)->ep_comm_ptr->intercomm->local_rank_map = new EP_RANK_MAP;
#ifdef _showinfo
      printf("new local_rank_map = %p\n", (*newintercomm)->ep_comm_ptr->intercomm->local_rank_map);
#endif
      for(std::map<int, std::pair<int, int> >::iterator it = local_comm->ep_rank_map->begin(); it != local_comm->ep_rank_map->end(); it++)
      {
        (*newintercomm)->ep_comm_ptr->intercomm->local_rank_map->insert(std::make_pair(it->first, it->second));
      }
    }

    MPI_Barrier_local(*newintercomm);

    if(!is_involved)
    {
      int target;
      for(int i=0; i<(*newintercomm)->ep_comm_ptr->size_rank_info[1].second; i++)
      {
        if(local_leader_rank_in_world == leader_rank_in_world_local_gathered[i])
        {
          target = i;
          (*newintercomm)->ep_comm_ptr->intercomm->intercomm_tag = target;
          break;
        }
      }
      (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map = (*newintercomm)->ep_comm_ptr->comm_list[target]->ep_comm_ptr->intercomm->intercomm_rank_map;
      (*newintercomm)->ep_comm_ptr->intercomm->local_rank_map     = (*newintercomm)->ep_comm_ptr->comm_list[target]->ep_comm_ptr->intercomm->local_rank_map;
    }
    else
    {
      (*newintercomm)->ep_comm_ptr->intercomm->intercomm_tag = -1;
    }

    (*newintercomm)->ep_comm_ptr->intercomm->size_rank_info[0] = (*newintercomm)->ep_comm_ptr->size_rank_info[0];
    (*newintercomm)->ep_comm_ptr->intercomm->size_rank_info[1] = (*newintercomm)->ep_comm_ptr->size_rank_info[1];
    (*newintercomm)->ep_comm_ptr->intercomm->size_rank_info[2] = (*newintercomm)->ep_comm_ptr->size_rank_info[2];
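    // The intercomm-scoped (rank, size) triplets are archived inside the
    // ep_intercomm structure; the top-level size_rank_info of *newintercomm is
    // then restored to the local_comm values below, so that subsequent calls
    // keep addressing this endpoint by its rank within its own group.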
    (*newintercomm)->ep_comm_ptr->size_rank_info[0] = local_comm->ep_comm_ptr->size_rank_info[0];
    (*newintercomm)->ep_comm_ptr->size_rank_info[1] = local_comm->ep_comm_ptr->size_rank_info[1];
    (*newintercomm)->ep_comm_ptr->size_rank_info[2] = local_comm->ep_comm_ptr->size_rank_info[2];

#ifdef _showinfo
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
    printf("peer_rank = %d, size_rank_info = %d %d %d %d %d %d\n", peer_comm->ep_comm_ptr->size_rank_info[0].first,
           (*newintercomm)->ep_comm_ptr->size_rank_info[0].first, (*newintercomm)->ep_comm_ptr->size_rank_info[0].second,
           (*newintercomm)->ep_comm_ptr->size_rank_info[1].first, (*newintercomm)->ep_comm_ptr->size_rank_info[1].second,
           (*newintercomm)->ep_comm_ptr->size_rank_info[2].first, (*newintercomm)->ep_comm_ptr->size_rank_info[2].second);
    MPI_Barrier(peer_comm);
    MPI_Barrier(peer_comm);
#endif

    /*
    if(peer_comm->ep_comm_ptr->size_rank_info[0].first == 5)
    {
      int receiver = rand()%10;
      printf("receiver = %d, intercomm_local_rank = %d, intercomm_mpi_rank = %d, comm_label = %d\n", receiver,
             (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).first,
             (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).second.first,
             (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).second.second);
    }

    if(peer_comm->ep_comm_ptr->size_rank_info[0].first == 9)
    {
      int receiver = rand()%6;
      printf("receiver = %d, intercomm_local_rank = %d, intercomm_mpi_rank = %d, comm_label = %d\n", receiver,
             (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).first,
             (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).second.first,
             (*newintercomm)->ep_comm_ptr->intercomm->intercomm_rank_map->at(receiver).second.second);
    }

    if(peer_comm->ep_comm_ptr->size_rank_info[0].first == 5)
    {
      for(int i=0; i<ep_size; i++)
      {
        printf("ep_rank_map->at(%d) = %d, %d\n", i, (*newintercomm)->ep_rank_map->at(i).first, (*newintercomm)->ep_rank_map->at(i).second);
      }
    }
    */

    //////////////
    // clean up //
    //////////////

    delete[] ranks_in_world_local;
    delete[] ranks_in_world_remote;

    if(is_involved)
    {
      delete[] local_quadruple_list;
      delete[] remote_quadruple_list;
    }

    delete[] leader_rank_in_world_local_gathered;
    delete[] new_rank_loc_local_gathered;
  }
}
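// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this translation unit). The
// communicator names and leader ranks below are hypothetical; only the kernel
// signature defined above is taken from this file. Every endpoint of the
// local group calls the kernel collectively:
//
//   ep_lib::MPI_Comm newintercomm;
//   ep_lib::MPI_Intercomm_create_kernel(local_comm,    // this side's ep communicator
//                                       0,             // local leader: ep rank 0
//                                       peer_comm,     // communicator spanning both sides
//                                       remote_leader, // peer-comm rank of the other leader
//                                       99,            // tag, must match on both sides
//                                       &newintercomm);
//
// On return, each endpoint holds a handle whose intercomm_rank_map translates
// any remote ep rank into the (mpi rank, comm label) pair used to reach it.
// ---------------------------------------------------------------------------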