MODULE mppsum !!====================================================================== !! *** MODULE mpp_sum *** !! NEMO: Summation of arrays across processors !!====================================================================== !!---------------------------------------------------------------------- !! mppsum : Order independent MPP reproducible sum !!---------------------------------------------------------------------- !! * Modules used USE par_kind, ONLY : & ! Precision variables & wp USE dom_oce, ONLY : & ! Ocean space and time domain variables & nproc USE par_oce, ONLY : & ! Ocean parameters & jpnij USE lib_mpp USE mppallgatherv USE in_out_manager IMPLICIT NONE !! * Routine accessibility PRIVATE PUBLIC & & mpp_sum_indep, & ! Order independent MPP reproducible sum & comp_sum, & ! Perform compensated (i.e. accurate) summation. & fround ! Rounding of floating-point number CONTAINS FUNCTION mpp_sum_indep( pval, kn ) !!---------------------------------------------------------------------- !! *** ROUTINE mpp_sum_indep *** !! !! ** Purpose : Sum all elements in the pval array in !! an accurate order-independent way. !! !! ** Method : The code iterates the compensated summation until the !! result is guaranteed to be within 4*eps of the true sum. !! It then rounds the result to the nearest floating-point !! number whose last three bits are zero, thereby !! guaranteeing an order-independent result. !! !! ** Action : This does only work for MPI. !! It does not work for SHMEM. !! !! References : M. Fisher (ECMWF): IFS code + personal communication !! The algorithm is based on Ogita et al. (2005) !! SIAM J. Sci. Computing, Vol.26, No.6, pp1955-1988. !! This is based in turn on an algorithm !! by Knuth (1969, seminumerical algorithms). !! !! History : !! ! 07-07 (K. Mogensen) Original code heavily based on IFS. !!---------------------------------------------------------------------- !! * Function return REAL(wp) mpp_sum_indep !! * Arguments INTEGER, INTENT(IN) :: & & kn REAL(wp), DIMENSION(kn), INTENT(IN) :: & & pval !! * Local declarations REAL(wp), DIMENSION(3) ::& & zbuffl REAL(wp), DIMENSION(:), ALLOCATABLE :: & & zpsums, & & zperrs, & & zpcors, & & zbuffg, & & zp REAL(wp) :: & & zcorr, & & zerr, & & zolderr, & & zbeta, & & zres INTEGER, DIMENSION(:), allocatable :: & & irecv, & & istart INTEGER :: & & ing INTEGER :: & & jj ! Get global number of elements ing = kn #ifdef key_mpp CALL mpp_sum( ing ) #endif ! Check that the the algorithm can work IF ( ( REAL( 2 * ing ) * EPSILON( zres ) ) >= 1.0 ) THEN CALL ctl_stop('mpp_sum_indep:', & & 'kn is too large to guarantee error bounds') ENDIF ALLOCATE( & & zp(MAX(kn,1)), & & zbuffg(jpnij*SIZE(zbuffl)), & & zpsums(jpnij), & & zperrs(jpnij), & & zpcors(jpnij) & & ) zolderr = HUGE(zerr) ! Copy the input array. This avoids some tricky indexing, at the ! expense of some inefficency. IF ( kn > 0 ) THEN zp(:) = pval(:) ELSE zp(1) = 0.0_wp ENDIF k_loop: DO ! Transform local arrays IF ( kn > 0 ) THEN CALL comp_sum ( zp, kn, zcorr, zerr ) ENDIF ! Gather partial sums and error bounds to all processors zbuffl(1) = zp(MAX(kn,1)) IF ( kn > 0 ) THEN zbuffl(2) = zerr zbuffl(3) = zcorr ELSE zbuffl(2) = 0.0_wp zbuffl(3) = 0.0_wp ENDIF IF ( jpnij > 1 ) THEN ALLOCATE( & & irecv(jpnij), & & istart(jpnij) & & ) CALL mpp_allgatherv( zbuffl, SIZE(zbuffl), & & zbuffg, jpnij * SIZE(zbuffl), irecv, istart ) DEALLOCATE( & & irecv, & & istart & & ) DO jj = 1, jpnij zpsums(jj) = zbuffg(1+(jj-1)*SIZE(zbuffl)) zperrs(jj) = zbuffg(2+(jj-1)*SIZE(zbuffl)) zpcors(jj) = zbuffg(3+(jj-1)*SIZE(zbuffl)) END DO ELSE zpsums(1) = zbuffl(1) zperrs(1) = zbuffl(2) zpcors(1) = zbuffl(3) ENDIF ! Transform partial sums CALL comp_sum( zpsums, jpnij, zcorr, zerr ) zerr = zerr + SUM(zperrs) zcorr = zcorr + SUM(zpcors) ! Calculate final result zres = zpsums(jpnij) + zcorr ! Calculate error bound. This is corollary 4.7 from Ogita et al. ! (2005) zbeta = zerr *( REAL( 2*ing, wp ) * EPSILON(zres) ) & & /(1.0_wp - REAL( 2*ing, wp ) * EPSILON(zres) ) zerr = EPSILON(zres) * ABS(zres) & & +(zbeta + ( 2.0_wp * EPSILON(zres) * EPSILON(zres) * ABS(zres) & & +3.0_wp * TINY(zres) ) ) ! Update the last element of the local array zp(MAX(kn,1)) = zpsums(nproc+1) ! Exit if the global error is small enough IF ( zerr < 4.0_wp * SPACING(zres) ) EXIT k_loop ! Take appropriate action if ZRES cannot be sufficiently refined. IF (zerr >= zolderr) THEN CALL ctl_stop('Failed to refine sum', & & 'Warning: Possiblity of non-reproducible results') ENDIF zolderr = zerr ENDDO k_loop ! At this stage, we have guaranteed that ZRES less than 4*EPS ! away from the exact sum. There are only four floating point ! numbers in this range. So, if we find the nearest number that ! has its last three bits zero, then we have a reproducible result. mpp_sum_indep = fround(zres) DEALLOCATE( & & zpcors, & & zperrs, & & zpsums, & & zbuffg, & & zp & & ) END FUNCTION mpp_sum_indep SUBROUTINE comp_sum( pval, kn, pcorr, perr ) !!---------------------------------------------------------------------- !! *** ROUTINE comp_sum *** !! !! ** Purpose : To perform compensated (i.e. accurate) summation. !! !! ** Method : These routines transform the elements of the array P, !! such that: !! 1) pval(kn) contains sum(pval) !! 2) pval(1)...pval(kn-1) contain the rounding errors !! that were made in calculating sum(pval). !! 3) The exact sum of the elements of pval is unmodified. !! On return, pcorr contains the sum of the rounding errors, !! perr contains the sum of their absolute values. !! After calling this routine, an accurate sum of the !! elements of pval can be calculated as res=pval(n)+pcorr. !! !! ** Action : !! !! References : M. Fisher (ECMWF) IFS code + personal communications !! !! History : !! ! 07-07 (K. Mogensen) Original code heavily based on IFS !!---------------------------------------------------------------------- !! * Arguments INTEGER, INTENT(IN) :: & & kn ! Number of elements in input array REAL(wp), DIMENSION(kn), INTENT(INOUT) :: & & pval ! Input array to be sum on input ! pval(kn) = sum (pval) on output ! pval(1)...pval(kn-1) = rounding errors on output REAL(wp) :: & & pcorr, & ! Sum of rounding errors & perr ! Sum of absolute rounding errors !! * Local declarations REAL(wp) :: & & zx, & & zz, & & zpsum integer :: & & jj pcorr = 0.0_wp perr = 0.0_wp zpsum = pval(1) DO jj = 2, kn ! It is vital that these 4 lines are not optimized in any way that ! changes the results. zx = pval(jj) + zpsum zz = zx - pval(jj) pval(jj-1) = ( pval(jj) - ( zx - zz ) ) + ( zpsum - zz ) zpsum = zx ! Accumulate the correction and the error pcorr = pcorr + pval(jj-1) perr = perr + ABS( pval(jj-1) ) END DO pval(kn) = zpsum END SUBROUTINE comp_sum FUNCTION fround(pres) !!---------------------------------------------------------------------- !! *** ROUTINE fround *** !! !! ** Purpose : Rounding of floating-point number !! !! ** Method : Returns the value of PRES rounded to the nearest !! floating-point number that has its last three bits zero !! This works on big-endian and little-endian machines. !! !! ** Action : !! !! References : M. Fisher (ECMWF) IFS code + personal communication !! !! History : !! ! 07-07 (K. Mogensen) Original code heavily based on IFS. !!---------------------------------------------------------------------- !! * Function return REAL(wp) fround !! * Arguments REAL(wp), INTENT(IN) :: & & pres ! Value to be rounded !! * Local declarations REAL(wp) :: & & zz(2), & & zup, & & zdown INTEGER :: & & ii(2), & & iequiv(8), & & ints_per_real, & & i_low_word INTEGER :: & & jj ii(:) = 1 zz(:) = 1.0_wp ! Warning: If wp = 64 bits (or 32 bits for key_sp) this will not work. #if defined key_sp ints_per_real = 32 / BIT_SIZE(ii) #else ints_per_real = 64 / BIT_SIZE(ii) #endif ! Test whether big-endian or little-endian zup = -1.0_wp iequiv(1:ints_per_real) = TRANSFER(zup,iequiv(1:ints_per_real)) IF ( iequiv(1) == 0 ) THEN i_low_word = 1 ! Little-endian ELSE i_low_word = ints_per_real ! Big-endian ENDIF ! Find the nearest number with all 3 lowest-order bits zeroed iequiv(1:ints_per_real) = transfer(pres,iequiv(1:ints_per_real)) zup = pres zdown = pres IF (IBITS(iequiv(i_low_word),0,3)/=0) THEN DO jj = 1, 4 zup = NEAREST( zup, 1.0_wp ) iequiv(1:ints_per_real) = TRANSFER( zup, iequiv(1:ints_per_real) ) IF ( IBITS( iequiv(i_low_word), 0, 3 ) == 0 ) EXIT zdown = NEAREST( zdown, -1.0 ) iequiv(1:ints_per_real) = TRANSFER( zdown, iequiv(1:ints_per_real)) IF ( IBITS( iequiv(i_low_word),0,3) == 0 ) EXIT END DO IF ( IBITS( iequiv( i_low_word ), 0, 3) /= 0 ) THEN CALL ctl_stop('Fround:','This is not possible') ENDIF ENDIF fround = TRANSFER( iequiv(1:ints_per_real), pres ) END FUNCTION fround END MODULE mppsum