Changeset 999


Timestamp:
01/10/20 12:20:51 (4 years ago)
Author:
adurocher
Message:

transfert_mpi : aggregate messages

Only one message is now sent per MPI process pair.
Previously, one message was sent for each pair of tiles, and each MPI process can own several tiles.
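
In practice, the per-tile submessages that target the same remote rank are now packed back to back into one shared buffer per rank, and each submessage only records its offset into that buffer (the new mpi_buffer_displ field in the diff below). The following is a minimal standalone sketch of that offset bookkeeping; the names n_sub, npoints and displ are illustrative and not part of the module:

    program pack_by_rank
      implicit none
      ! Hypothetical per-tile submessages, all destined to the same remote rank
      integer, parameter :: n_sub = 3
      integer :: npoints(n_sub) = (/ 4, 2, 5 /)   ! cells per submessage
      integer :: displ(n_sub)                     ! offset of each submessage in the shared buffer
      integer :: i, n

      ! Running offset: each submessage is appended after the previous one,
      ! which is the role played by mpi_buffer_displ and mpi_buffer_t%n in the changeset
      n = 0
      do i = 1, n_sub
        displ(i) = n
        n = n + npoints(i)
      end do
      print *, 'buffer size for this rank:', n       ! 11
      print *, 'submessage offsets       :', displ   ! 0 4 6
    end program pack_by_rank

The changeset applies the same running sum per destination rank, with each chunk sized npoints*dim3*dim4 so that all vertical levels of all tiles travel in a single message per rank pair.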

Location:
codes/icosagcm/trunk/src/parallel
Files:
2 edited

  • codes/icosagcm/trunk/src/parallel/transfert_mpi.f90

    r998 r999  
    2121  ! Describes how to pack/unpack a message from a domain to another, and contains MPI buffer 
    2222  type t_submessage 
    23     integer :: ind_loc, remote_ind_glo ! index of local and remote domain 
     23    integer :: ind_loc, remote_ind_glo, remote_rank ! index of local and remote domain 
    2424    integer :: npoints ! Number of cells to transfer (dim12) 
    2525    integer, allocatable :: displs(:) ! List of indexes to copy from field to buffer for each level 
    2626    integer, allocatable :: sign(:) ! Sign change to be applied for vector requests 
    27     real, allocatable :: buff(:,:,:) ! MPI buffer buff(iim*jjm[*3],dim3,dim4) 
     27    integer :: mpi_buffer_displ = -1 
     28  end type 
     29   
     30  type mpi_buffer_t 
     31    integer :: n 
     32    real, allocatable :: buff(:) 
    2833  end type 
    2934 
     
    3540    type (t_submessage), pointer :: message_in(:) ! Messages to receive from remote ranks and to copy back to the field 
    3641    type (t_submessage), pointer :: message_out(:) ! Halos to copy to MPI buffer and to send to remote ranks 
     42    type (mpi_buffer_t), pointer :: mpi_buffer_in(:) 
     43    type (mpi_buffer_t), pointer :: mpi_buffer_out(:) 
    3744    integer, pointer :: mpi_requests_in(:) ! MPI requests used for message_in. 
    3845    integer, pointer :: mpi_requests_out(:) ! MPI requests used for message_out. 
     
    100107    type(t_local_submessage) :: submessage_local 
    101108    integer :: dim3, dim4 
    102     integer :: ind, ind_loc, remote_ind_glo, i 
    103     integer :: message_in_size, message_out_size, message_local_size 
     109    integer :: ind, ind_loc, remote_ind_glo, loc_ind_glo, i, remote_rank 
     110    integer :: message_in_size, message_out_size, message_local_size, buffer_in_size, buffer_out_size 
    104111    type(t_local_submessage), allocatable :: message_local_tmp(:) 
    105112    type(t_submessage), allocatable :: message_in_tmp(:), message_out_tmp(:) 
    106     type(t_submessage), pointer :: submessage 
    107113    integer :: field_type 
    108114 
     
    136142    allocate(message_local_tmp(INITIAL_ALLOC_SIZE)) 
    137143    message_local_size=0 
    138     do ind_loc = 1, ndomain 
     144    do loc_ind_glo = 1, ndomain_glo 
    139145      do remote_ind_glo = 1, ndomain_glo 
    140         if( domglo_rank(remote_ind_glo) == mpi_rank ) then ! If sending to local domain 
    141           if(request(ind_loc)%points_HtoB(remote_ind_glo)%npoints > 0 ) then ! Add only non-empty messages 
    142             ! Add local message ind_loc -> remote_ind_glo, aggregarting submessage_in and submessage_out into submessage_local 
    143             submessage_out = make_submessage( field_type, request(ind_loc)%points_HtoB(remote_ind_glo), & 
    144                                               ind_loc, remote_ind_glo, dim3, dim4, request(1)%vector ) 
    145             submessage_in = make_submessage( field_type, request(domglo_loc_ind(remote_ind_glo))%points_BtoH(domloc_glo_ind(ind_loc)), & 
    146                                             domglo_loc_ind(remote_ind_glo), domloc_glo_ind(ind_loc), dim3, dim4, request(1)%vector) 
    147             submessage_local%src_ind_loc = ind_loc 
    148             submessage_local%dest_ind_loc = domglo_loc_ind(remote_ind_glo) 
    149             submessage_local%npoints = submessage_out%npoints 
    150             submessage_local%displ_src = submessage_out%displs 
    151             submessage_local%displ_dest = submessage_in%displs 
    152             submessage_local%sign = submessage_in%sign 
    153             ! Add to local message list 
    154             call array_append_local_submessage( message_local_tmp, message_local_size, submessage_local) 
    155           endif 
    156         else ! If remote domain 
    157           ! When data to send to remote_domain, add submessage in message%message_out 
    158           if( request(ind_loc)%points_HtoB(remote_ind_glo)%npoints > 0 ) then 
    159             submessage_out = make_submessage( field_type, request(ind_loc)%points_HtoB(remote_ind_glo), & 
    160                                               ind_loc, remote_ind_glo, dim3, dim4, request(1)%vector ) 
    161             call array_append_submessage( message_out_tmp, message_out_size, submessage_out ) 
     146        if(domglo_rank(loc_ind_glo) == mpi_rank) then  
     147          ind_loc = domglo_loc_ind(loc_ind_glo) 
     148          if( domglo_rank(remote_ind_glo) == mpi_rank ) then ! If sending to local domain 
     149            if(request(ind_loc)%points_HtoB(remote_ind_glo)%npoints > 0 ) then ! Add only non-empty messages 
     150              ! Add local message ind_loc -> remote_ind_glo, aggregating submessage_in and submessage_out into submessage_local 
     151              submessage_out = make_submessage( field_type, request(ind_loc)%points_HtoB(remote_ind_glo), & 
     152                                                ind_loc, remote_ind_glo, dim3, dim4, request(1)%vector ) 
     153              submessage_in = make_submessage( field_type, request(domglo_loc_ind(remote_ind_glo))%points_BtoH(domloc_glo_ind(ind_loc)), & 
     154                                              domglo_loc_ind(remote_ind_glo), domloc_glo_ind(ind_loc), dim3, dim4, request(1)%vector) 
     155              submessage_local%src_ind_loc = ind_loc 
     156              submessage_local%dest_ind_loc = domglo_loc_ind(remote_ind_glo) 
     157              submessage_local%npoints = submessage_out%npoints 
     158              submessage_local%displ_src = submessage_out%displs 
     159              submessage_local%displ_dest = submessage_in%displs 
     160              submessage_local%sign = submessage_in%sign 
     161              ! Add to local message list 
     162              call array_append_local_submessage( message_local_tmp, message_local_size, submessage_local) 
     163            endif 
     164          else ! If remote domain 
     165            ! When data to send to remote_domain, add submessage in message%message_out 
     166            if( request(ind_loc)%points_HtoB(remote_ind_glo)%npoints > 0 ) then 
     167              submessage_out = make_submessage( field_type, request(ind_loc)%points_HtoB(remote_ind_glo), & 
     168                                                ind_loc, remote_ind_glo, dim3, dim4, request(1)%vector ) 
     169              call array_append_submessage( message_out_tmp, message_out_size, submessage_out ) 
     170            end if           
    162171          end if 
     172        end if 
     173      end do 
     174    end do 
     175    ! Recv and Send submessages are transposed to receive and send in the same order 
     176    ! We iterate over the global domain index to match sends with receives (local domains are not ordered like global domains) 
     177    do remote_ind_glo = 1, ndomain_glo 
     178      do loc_ind_glo = 1, ndomain_glo 
     179        if( (domglo_rank(loc_ind_glo) == mpi_rank) .and. (domglo_rank(remote_ind_glo) /= mpi_rank) ) then 
     180          ind_loc = domglo_loc_ind(loc_ind_glo) 
    163181          if( request(ind_loc)%points_BtoH(remote_ind_glo)%npoints > 0 ) then 
    164182            submessage_in = make_submessage( field_type, request(ind_loc)%points_BtoH(remote_ind_glo), & 
     
    169187      end do 
    170188    end do 
     189     
     190     
    171191    ! Trim message_xx_tmp and put it in message%message_xx 
    172192    allocate(message%message_in(message_in_size)); message%message_in(:) = message_in_tmp(:message_in_size) 
     
    174194    allocate(message%message_local(message_local_size)); message%message_local(:) = message_local_tmp(:message_local_size) 
    175195 
    176     ! Create MPI Persistant Send/Recv requests 
    177     allocate( message%mpi_requests_in(size(message%message_in)) ) 
    178     allocate( message%mpi_requests_out(size(message%message_out)) ) 
     196    ! Allocate MPI buffers 
     197    allocate( message%mpi_buffer_in(0:mpi_size-1) ) 
     198    allocate( message%mpi_buffer_out(0:mpi_size-1) ) 
     199    do i = 0, mpi_size-1 
     200      buffer_in_size = dim3*dim4*sum( message%message_in(:)%npoints, message%message_in(:)%remote_rank == i ) 
     201      buffer_out_size = dim3*dim4*sum( message%message_out(:)%npoints, message%message_out(:)%remote_rank == i ) 
     202      !TODO : what if size == 0 ? 
     203      allocate( message%mpi_buffer_in(i)%buff( buffer_in_size ) ) 
     204      allocate( message%mpi_buffer_out(i)%buff( buffer_out_size ) ) 
     205      message%mpi_buffer_in(i)%n=0 
     206      message%mpi_buffer_out(i)%n=0 
     207    end do 
     208    ! Set offsets in submessages 
    179209    do i=1, size(message%message_out) 
    180       submessage => message%message_out(i) 
    181       call MPI_Send_init( submessage%buff, size(submessage%buff), MPI_DOUBLE, domglo_rank(submessage%remote_ind_glo), & 
    182         submessage%remote_ind_glo+ndomain_glo*domloc_glo_ind(submessage%ind_loc), & 
    183         comm_icosa, message%mpi_requests_out(i), ierr ) 
     210      remote_rank = message%message_out(i)%remote_rank 
     211      message%message_out(i)%mpi_buffer_displ = message%mpi_buffer_out(remote_rank)%n 
     212      message%mpi_buffer_out(remote_rank)%n = message%mpi_buffer_out(remote_rank)%n + message%message_out(i)%npoints*dim3*dim4 
    184213    end do 
    185214    do i=1, size(message%message_in) 
    186       submessage => message%message_in(i) 
    187       call MPI_Recv_init( submessage%buff, size(submessage%buff), MPI_DOUBLE, domglo_rank(submessage%remote_ind_glo), & 
    188         domloc_glo_ind(submessage%ind_loc)+ndomain_glo*submessage%remote_ind_glo, & 
    189         comm_icosa,  message%mpi_requests_in(i), ierr ) 
     215      remote_rank = message%message_in(i)%remote_rank 
     216      message%message_in(i)%mpi_buffer_displ = message%mpi_buffer_in(remote_rank)%n 
     217      message%mpi_buffer_in(remote_rank)%n = message%mpi_buffer_in(remote_rank)%n + message%message_in(i)%npoints*dim3*dim4 
     218    end do 
     219    ! Create persistent MPI requests 
     220    allocate( message%mpi_requests_in(0:mpi_size-1) ) 
     221    allocate( message%mpi_requests_out(0:mpi_size-1) ) 
     222    do i = 0, mpi_size-1 
     223      if(  size(message%mpi_buffer_in(i)%buff) /= message%mpi_buffer_in(i)%n & 
     224      .or. size(message%mpi_buffer_out(i)%buff) /= message%mpi_buffer_out(i)%n)& 
     225        call dynamico_abort("Internal error in transfert_mpi : mpi buffer size different than expected") 
     226      call MPI_Send_Init( message%mpi_buffer_out(i)%buff, message%mpi_buffer_out(i)%n, MPI_REAL8, i,& 
     227                          100, comm_icosa, message%mpi_requests_out(i), ierr ) 
     228      call MPI_Recv_Init( message%mpi_buffer_in(i)%buff, message%mpi_buffer_in(i)%n, MPI_REAL8, i,& 
     229                          100, comm_icosa, message%mpi_requests_in(i), ierr ) 
    190230    end do 
    191231    !$omp end master 
     
    205245      submessage%ind_loc = ind_loc 
    206246      submessage%remote_ind_glo = remote_ind_glo 
     247      submessage%remote_rank = domglo_rank(remote_ind_glo) 
    207248      submessage%npoints = points%npoints 
    208       allocate( submessage%buff( points%npoints, dim3, dim4 ) ) 
     249      submessage%mpi_buffer_displ = -1 ! Buffers not allocated yet 
    209250      allocate( submessage%displs( points%npoints ) ) 
    210251      submessage%displs(:) = points%i + (points%j-1)*iim 
     
    256297  subroutine message_create_ondevice(message) 
    257298    use mpi_mod 
    258     use mpipara, only : comm_icosa 
     299    use mpipara, only : mpi_size, comm_icosa 
    259300    type(t_message), intent(inout) :: message 
    260     type(t_submessage), pointer :: submessage 
    261301    integer :: i, ierr 
    262302 
     
    264304 
    265305    !$acc enter data copyin(message) async 
     306    !$acc enter data copyin(message%mpi_buffer_in(:)) async 
     307    !$acc enter data copyin(message%mpi_buffer_out(:)) async 
     308    do i = 0, mpi_size-1 
     309      !$acc enter data copyin(message%mpi_buffer_in(i)%buff(:)) async 
     310      !$acc enter data copyin(message%mpi_buffer_out(i)%buff(:)) async 
     311    end do 
    266312    !$acc enter data copyin(message%message_in(:)) async 
    267313    do i = 1, size( message%message_in ) 
    268       !$acc enter data copyin(message%message_in(i)%buff(:,:,:)) async 
    269314      !$acc enter data copyin(message%message_in(i)%displs(:)) async 
    270315      !$acc enter data copyin(message%message_in(i)%sign(:)) async 
     
    272317    !$acc enter data copyin(message%message_out(:)) async 
    273318    do i = 1, size( message%message_out ) 
    274       !$acc enter data copyin(message%message_out(i)%buff(:,:,:)) async 
    275319      !$acc enter data copyin(message%message_out(i)%displs(:)) async 
    276320      !!$acc enter data copyin(message%message_out(i)%sign(:)) async 
     
    287331    end do 
    288332 
    289     do i=1, size(message%message_out) 
    290       submessage => message%message_out(i) 
     333    !$acc wait 
     334    do i = 0, mpi_size-1 
    291335      call MPI_Request_free(message%mpi_requests_out(i), ierr) 
    292       !$acc host_data use_device(submessage%buff) 
    293       call MPI_Send_init( submessage%buff, size(submessage%buff), MPI_DOUBLE, domglo_rank(submessage%remote_ind_glo), & 
    294         submessage%remote_ind_glo+ndomain_glo*domloc_glo_ind(submessage%ind_loc), & 
    295         comm_icosa, message%mpi_requests_out(i), ierr ) 
     336      call MPI_Request_free(message%mpi_requests_in(i), ierr) 
     337      !$acc host_data use_device(message%mpi_buffer_out(i)%buff) 
     338      call MPI_Send_Init( message%mpi_buffer_out(i)%buff, message%mpi_buffer_out(i)%n, MPI_REAL8, i,& 
     339                          0, comm_icosa, message%mpi_requests_out(i), ierr ) 
    296340      !$acc end host_data 
    297     end do 
    298     do i=1, size(message%message_in) 
    299       submessage => message%message_in(i) 
    300       call MPI_Request_free(message%mpi_requests_in(i), ierr) 
    301       !$acc host_data use_device(submessage%buff) 
    302       call MPI_Recv_init( submessage%buff, size(submessage%buff), MPI_DOUBLE, domglo_rank(submessage%remote_ind_glo), & 
    303         domloc_glo_ind(submessage%ind_loc)+ndomain_glo*submessage%remote_ind_glo, & 
    304         comm_icosa,  message%mpi_requests_in(i), ierr ) 
     341      !$acc host_data use_device(message%mpi_buffer_in(i)%buff) 
     342      call MPI_Recv_Init( message%mpi_buffer_in(i)%buff, message%mpi_buffer_in(i)%n, MPI_REAL8, i,& 
     343                          0, comm_icosa, message%mpi_requests_in(i), ierr ) 
    305344      !$acc end host_data 
    306345    end do 
     
    311350 
    312351  subroutine message_delete_ondevice(message) 
     352    use mpipara, only : mpi_size 
    313353    type(t_message), intent(inout) :: message 
    314354    integer :: i 
     
    317357 
    318358    do i = 1, size( message%message_in ) 
    319       !$acc exit data delete(message%message_in(i)%buff(:,:,:)) async 
    320359      !$acc exit data delete(message%message_in(i)%displs(:)) async 
    321360      !$acc exit data delete(message%message_in(i)%sign(:)) async 
     
    323362    !$acc exit data delete(message%message_in(:)) async 
    324363    do i = 1, size( message%message_out ) 
    325       !$acc exit data delete(message%message_out(i)%buff(:,:,:)) async 
    326364      !$acc exit data delete(message%message_out(i)%displs(:)) async 
    327365      !!$acc exit data delete(message%message_out(i)%sign(:)) async 
     
    334372    end do 
    335373    !$acc exit data delete(message%message_local(:)) async 
    336  
     374    do i = 0, mpi_size-1 
     375      !$acc exit data delete(message%mpi_buffer_in(i)%buff(:)) async 
     376      !$acc exit data delete(message%mpi_buffer_out(i)%buff(:)) async 
     377    end do 
     378    !$acc exit data delete(message%mpi_buffer_in(:)) async 
     379    !$acc exit data delete(message%mpi_buffer_out(:)) async 
    337380    do i = 1, ndomain 
    338381      !$acc exit data delete(message%field(i)%rval4d(:,:,:)) async 
     
    344387 
    345388  subroutine finalize_message(message) 
     389    use mpipara, only : mpi_size 
    346390    type(t_message), intent(inout) :: message 
    347391    integer :: i, ierr 
     
    355399    deallocate(message%message_out) 
    356400    deallocate(message%message_local) 
    357     do i=1, size(message%mpi_requests_in) 
     401    do i=0, mpi_size-1 
    358402      call MPI_Request_free(message%mpi_requests_in(i), ierr) 
    359     end do 
     403      call MPI_Request_free(message%mpi_requests_out(i), ierr) 
     404      deallocate(message%mpi_buffer_in(i)%buff) 
     405      deallocate(message%mpi_buffer_out(i)%buff) 
     406    end do 
     407    deallocate(message%mpi_buffer_in) 
     408    deallocate(message%mpi_buffer_out) 
    360409    deallocate(message%mpi_requests_in) 
    361     do i=1, size(message%mpi_requests_out) 
    362       call MPI_Request_free(message%mpi_requests_out(i), ierr) 
    363     end do 
    364410    deallocate(message%mpi_requests_out) 
    365411    !$omp end master 
     
    373419    type(t_field),pointer :: field(:) 
    374420    type(t_message), target :: message 
    375     integer :: i, k, d3, d4 
    376     integer :: ierr, d3_begin, d3_end, dim4 
     421    integer :: i, k, d3, d4, local_displ 
     422    integer :: ierr, d3_begin, d3_end, dim4, dim3 
    377423 
    378424    call enter_profile(profile_mpi) 
     
    397443 
    398444    dim4 = size(message%field(1)%rval4d, 3) 
    399     CALL distrib_level( 1, size(message%field(1)%rval4d, 2), d3_begin, d3_end ) 
     445    dim3 = size(message%field(1)%rval4d, 2) 
     446    CALL distrib_level( 1, dim3, d3_begin, d3_end ) 
    400447 
    401448    call enter_profile(profile_mpi_copies) 
     
    409456        do d4 = 1, dim4 
    410457          do d3 = d3_begin, d3_end 
     458            local_displ = message%message_out(i)%mpi_buffer_displ + message%message_out(i)%npoints*( (d3-1) + dim3*(d4-1) ) 
    411459            !$acc loop 
    412460             do k = 1, message%message_out(i)%npoints 
    413               message%message_out(i)%buff(k,d3,d4) = message%field(message%message_out(i)%ind_loc)%rval4d(message%message_out(i)%displs(k),d3, d4) 
     461              !print *, "buff", message%message_out(i)%remote_rank, k+local_displ, "<- field", message%message_out(i)%ind_loc, message%message_out(i)%displs(k), d3, d4, message%field(message%message_out(i)%ind_loc)%rval4d(message%message_out(i)%displs(k),d3, d4) 
     462              message%mpi_buffer_out(message%message_out(i)%remote_rank)%buff(k+local_displ) = message%field(message%message_out(i)%ind_loc)%rval4d(message%message_out(i)%displs(k),d3, d4) 
    414463            end do 
    415464          end do 
     
    470519    use omp_para, only : distrib_level 
    471520    type(t_message), target :: message 
    472     integer :: d3_begin, d3_end, dim4 
    473     integer :: i, ierr, k, d3, d4 
     521    integer :: d3_begin, d3_end, dim4, dim3, dim12 
     522    integer :: i, ierr, k, d3, d4, local_displ 
    474523 
    475524    ! Check if message has been sent and not received yet 
     
    483532    call enter_profile(profile_mpi) 
    484533 
    485     CALL distrib_level( 1, size(message%field(1)%rval4d, 2), d3_begin, d3_end ) 
     534    dim4 = size(message%field(1)%rval4d, 3) 
     535    dim3 = size(message%field(1)%rval4d, 2) 
     536    dim12 = size(message%field(1)%rval4d, 1) 
     537    CALL distrib_level( 1, dim3, d3_begin, d3_end ) 
    486538 
    487539    call enter_profile(profile_mpi_waitall) 
     
    507559        do d4 = 1, dim4 
    508560          do d3 = d3_begin, d3_end 
     561            local_displ = message%message_in(i)%mpi_buffer_displ + message%message_in(i)%npoints*( (d3-1) + dim3*(d4-1) ) 
    509562            !$acc loop 
    510             do k = 1, message%message_in(i)%npoints 
    511               message%field(message%message_in(i)%ind_loc)%rval4d(message%message_in(i)%displs(k),d3,d4) = message%message_in(i)%sign(k)*message%message_in(i)%buff(k,d3,d4) 
     563            do k = 1, message%message_in(i)%npoints  
     564              !print *, "field", message%message_in(i)%ind_loc, message%message_in(i)%displs(k), d3, d4, "-> buff", message%message_in(i)%remote_rank, k+local_displ, message%mpi_buffer_in(message%message_in(i)%remote_rank)%buff(k+local_displ) 
     565              message%field(message%message_in(i)%ind_loc)%rval4d(message%message_in(i)%displs(k),d3,d4) & 
     566                  = message%mpi_buffer_in(message%message_in(i)%remote_rank)%buff(k+local_displ) 
    512567            end do 
    513568          end do 
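
Taken together, the hunks above replace one persistent MPI request per pair of tiles with a single persistent send and a single persistent receive per peer rank, posted on the aggregated buffers with a constant tag (the rank pair alone now identifies the message). Below is a self-contained sketch of that pattern; it assumes a fixed payload size per rank pair and uses MPI_COMM_WORLD directly, unlike the module, which sizes each buffer from the sum of its submessages' npoints and communicates over comm_icosa:

    program persistent_pairwise
      use mpi
      implicit none
      integer, parameter :: npts = 8                 ! illustrative payload size per rank pair
      integer :: rank, nranks, i, ierr
      integer, allocatable :: req_in(:), req_out(:)
      real(8), allocatable :: buf_in(:,:), buf_out(:,:)

      call MPI_Init(ierr)
      call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
      call MPI_Comm_size(MPI_COMM_WORLD, nranks, ierr)

      ! One aggregated buffer and one persistent request per peer rank,
      ! with a constant tag, as in the r999 version of init_message
      allocate(buf_in(npts, 0:nranks-1), buf_out(npts, 0:nranks-1))
      allocate(req_in(0:nranks-1), req_out(0:nranks-1))
      do i = 0, nranks-1
        call MPI_Send_init(buf_out(:,i), npts, MPI_REAL8, i, 100, MPI_COMM_WORLD, req_out(i), ierr)
        call MPI_Recv_init(buf_in(:,i),  npts, MPI_REAL8, i, 100, MPI_COMM_WORLD, req_in(i),  ierr)
      end do

      ! One exchange: fill the send buffers, start all requests, wait for completion
      buf_out = real(rank, 8)
      call MPI_Startall(nranks, req_in,  ierr)
      call MPI_Startall(nranks, req_out, ierr)
      call MPI_Waitall(nranks, req_in,  MPI_STATUSES_IGNORE, ierr)
      call MPI_Waitall(nranks, req_out, MPI_STATUSES_IGNORE, ierr)

      do i = 0, nranks-1
        call MPI_Request_free(req_in(i), ierr)
        call MPI_Request_free(req_out(i), ierr)
      end do
      call MPI_Finalize(ierr)
    end program persistent_pairwise

In the changeset, message_create_ondevice frees these requests and re-creates them inside acc host_data use_device regions, so the buffers handed to MPI are device addresses when the message lives on the GPU.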
  • codes/icosagcm/trunk/src/parallel/transfert_requests.F90

    r965 r999  
    218218      integer :: k, i, last, i_min, tmp 
    219219 
    220       displs = (points%i-1) + (points%j-1)*iim 
    221       if(allocated(points%elt)) displs = displs + (points%elt-1)*iim*jjm 
     220      displs = (points%i(1:points%npoints)-1) + (points%j(1:points%npoints)-1)*iim 
     221      if(allocated(points%elt)) displs = displs + (points%elt(1:points%npoints)-1)*iim*jjm 
    222222 
    223223      last=0 
     
    226226        tmp = displs(i_min) 
    227227        displs(i_min) = displs(i) 
    228         if(last==0 .or. displs(i_min) /= displs(last)) last=last+1 
     228        if( last==0 ) then 
     229          last=last+1 
     230        else if( displs(i_min) /= displs(last) ) then  
     231          last=last+1 
     232        endif 
    229233        displs(last) = tmp 
    230234      end do 
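
The second hunk splits the old one-line test if(last==0 .or. displs(i_min) /= displs(last)) into two branches: Fortran does not guarantee short-circuit evaluation of .or., so the original form could evaluate displs(last) while last is still 0. The first hunk also restricts the displacement computation to the first points%npoints entries of points%i, points%j and points%elt. A simplified, self-contained sketch of the sort-and-deduplicate idea behind this loop (the surrounding loop and the data are illustrative, not the module's exact code):

    program sort_unique
      implicit none
      ! Illustrative displacements; in the module they are cell indices built from points%i/j/elt
      integer :: displs(8) = (/ 5, 3, 5, 1, 3, 7, 1, 5 /)
      integer :: n, i, k, i_min, tmp, last

      n = size(displs)
      last = 0
      do i = 1, n
        ! selection step: find the smallest remaining value in displs(i:n)
        i_min = i
        do k = i+1, n
          if (displs(k) < displs(i_min)) i_min = k
        end do
        tmp = displs(i_min)
        displs(i_min) = displs(i)
        ! keep tmp only if it differs from the last value kept so far; the test is
        ! split so that displs(last) is never referenced while last == 0
        if (last == 0) then
          last = last + 1
        else if (tmp /= displs(last)) then
          last = last + 1
        end if
        displs(last) = tmp
      end do
      print *, 'sorted unique displacements:', displs(1:last)   ! 1 3 5 7
    end program sort_unique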