MODULE mppsum
   !!======================================================================
   !!                       ***  MODULE mpp_sum  ***
   !! NEMO: Summation of arrays across processors
   !!======================================================================

   !!----------------------------------------------------------------------
   !!   mppsum : Order independent MPP reproducible sum
   !!----------------------------------------------------------------------
   !! * Modules used   
   USE par_kind, ONLY : &   ! Precision variables
      & wp
   USE dom_oce, ONLY : &    ! Ocean space and time domain variables
      & nproc
   USE par_oce, ONLY : &    ! Ocean parameters
      & jpnij
   USE lib_mpp
   USE mppallgatherv
   USE in_out_manager

   IMPLICIT NONE

   !! * Routine accessibility
   PRIVATE

   PUBLIC &
      & mpp_sum_indep, & ! Order independent MPP reproducible sum
      & comp_sum,      & ! Perform compensated (i.e. accurate) summation.
      & fround           ! Rounding of floating-point number

CONTAINS

   FUNCTION mpp_sum_indep( pval, kn )
      !!----------------------------------------------------------------------
      !!               ***  ROUTINE mpp_sum_indep ***
      !!          
      !! ** Purpose : Sum all elements in the pval array in
      !!              an accurate order-independent way.
      !!
      !! ** Method  : The code iterates the compensated summation until the 
      !!              result is guaranteed to be within 4*eps of the true sum.
      !!              It then rounds the result to the nearest floating-point 
      !!              number whose last three bits are zero, thereby 
      !!              guaranteeing an order-independent result.
      !!
      !! ** Action  : This does only work for MPI. 
      !!              It does not work for SHMEM.
      !!
      !! References : M. Fisher (ECMWF): IFS code + personal communication
      !!              The algorithm is based on Ogita et al. (2005) 
      !!              SIAM J. Sci. Computing, Vol.26, No.6, pp1955-1988. 
      !!              This is based in turn on an algorithm
      !!              by Knuth (1969, seminumerical algorithms).
      !!
      !! History :
      !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS.
      !!----------------------------------------------------------------------
      !! * Function return
      REAL(wp) mpp_sum_indep
      !! * Arguments
      INTEGER, INTENT(IN) :: &
         & kn
      REAL(wp), DIMENSION(kn), INTENT(IN) :: &
         & pval
      !! * Local declarations
      REAL(wp), DIMENSION(3) ::&
         & zbuffl
      REAL(wp), DIMENSION(:), ALLOCATABLE :: &
         & zpsums, &
         & zperrs, &
         & zpcors, &
         & zbuffg, &
         & zp
      REAL(wp) :: &
         & zcorr,   &
         & zerr,    &
         & zolderr, &
         & zbeta,   &
         & zres
      INTEGER, DIMENSION(:), allocatable :: &
         & irecv, &
         & istart
      INTEGER :: &
         & ing
      INTEGER :: &
         & jj


      ! Get global number of elements
      ing = kn
#ifdef key_mpp
      CALL mpp_sum( ing )
#endif
      ! Check that the the algorithm can work 
      
      IF ( ( REAL( 2 * ing ) * EPSILON( zres ) ) >= 1.0 ) THEN

         CALL ctl_stop('mpp_sum_indep:', &
            &          'kn is too large to guarantee error bounds')

      ENDIF
      
      ALLOCATE( &
         & zp(MAX(kn,1)),                & 
         & zbuffg(jpnij*SIZE(zbuffl)), &
         & zpsums(jpnij),              &
         & zperrs(jpnij),              &
         & zpcors(jpnij)               &
         & )

      zolderr = HUGE(zerr)

      ! Copy the input array. This avoids some tricky indexing, at the
      ! expense of some inefficency.

      IF ( kn > 0 ) THEN

         zp(:) = pval(:)

      ELSE

         zp(1) = 0.0_wp

      ENDIF
      
      k_loop: DO

         ! Transform local arrays
         
         IF ( kn > 0 ) THEN

            CALL comp_sum ( zp, kn, zcorr, zerr )

         ENDIF

         ! Gather partial sums and error bounds to all processors

         zbuffl(1) = zp(MAX(kn,1))
         
         IF ( kn > 0 ) THEN

            zbuffl(2) = zerr
            zbuffl(3) = zcorr

         ELSE
            
            zbuffl(2) = 0.0_wp
            zbuffl(3) = 0.0_wp

         ENDIF
         
         IF ( jpnij > 1 ) THEN

            ALLOCATE( &
               & irecv(jpnij), &
               & istart(jpnij) &
               & )

            CALL mpp_allgatherv( zbuffl, SIZE(zbuffl), &
               & zbuffg, jpnij * SIZE(zbuffl), irecv, istart )

            DEALLOCATE( &
               & irecv, &
               & istart &
               & )
            
            DO jj = 1, jpnij

               zpsums(jj) = zbuffg(1+(jj-1)*SIZE(zbuffl))
               zperrs(jj) = zbuffg(2+(jj-1)*SIZE(zbuffl))
               zpcors(jj) = zbuffg(3+(jj-1)*SIZE(zbuffl))

            END DO

         ELSE

            zpsums(1) = zbuffl(1)
            zperrs(1) = zbuffl(2)
            zpcors(1) = zbuffl(3)

         ENDIF

         ! Transform partial sums

         CALL comp_sum( zpsums, jpnij, zcorr, zerr )
         zerr  = zerr  + SUM(zperrs)
         zcorr = zcorr + SUM(zpcors)
         
         ! Calculate final result
         
         zres = zpsums(jpnij) + zcorr

         ! Calculate error bound. This is corollary 4.7 from Ogita et al. 
         ! (2005)
         
         zbeta = zerr *( REAL( 2*ing, wp ) * EPSILON(zres) ) &
            &  /(1.0_wp - REAL( 2*ing, wp ) * EPSILON(zres) )

         zerr = EPSILON(zres) * ABS(zres) &
            & +(zbeta + ( 2.0_wp * EPSILON(zres) * EPSILON(zres) * ABS(zres) &
            &            +3.0_wp * TINY(zres) ) )

         ! Update the last element of the local array

         zp(MAX(kn,1)) = zpsums(nproc+1)

         ! Exit if the global error is small enough

         IF ( zerr < 4.0_wp * SPACING(zres) ) EXIT k_loop
         
         ! Take appropriate action if ZRES cannot be sufficiently refined.
         
         IF (zerr >= zolderr) THEN

            CALL ctl_stop('Failed to refine sum', &
               &          'Warning: Possiblity of non-reproducible results')

         ENDIF

         zolderr = zerr
         
      ENDDO k_loop

      ! At this stage, we have guaranteed that ZRES less than 4*EPS
      ! away from the exact sum. There are only four floating point
      ! numbers in this range. So, if we find the nearest number that
      ! has its last three bits zero, then we have a reproducible result.

      mpp_sum_indep = fround(zres)

      DEALLOCATE( &
         & zpcors, &
         & zperrs, &
         & zpsums, &
         & zbuffg, &
         & zp      &
         & )

   END FUNCTION mpp_sum_indep

   SUBROUTINE comp_sum( pval, kn, pcorr, perr )
      !!----------------------------------------------------------------------
      !!               ***  ROUTINE comp_sum ***
      !!          
      !! ** Purpose : To perform compensated (i.e. accurate) summation.
      !!
      !! ** Method  : These routines transform the elements of the array P, 
      !!              such that:
      !!              1)  pval(kn)         contains sum(pval)
      !!              2)  pval(1)...pval(kn-1) contain the rounding errors 
      !!                  that were made in calculating sum(pval).
      !!              3)  The exact sum of the elements of pval is unmodified.
      !!              On return, pcorr contains the sum of the rounding errors,
      !!              perr contains the sum of their absolute values.
      !!              After calling this routine, an accurate sum of the 
      !!              elements of pval can be calculated as res=pval(n)+pcorr.
      !!
      !! ** Action  : 
      !!
      !! References : M. Fisher (ECMWF) IFS code + personal communications
      !!
      !! History :
      !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS 
      !!----------------------------------------------------------------------
      !! * Arguments
      INTEGER, INTENT(IN) :: &
         & kn         ! Number of elements in input array
      REAL(wp), DIMENSION(kn), INTENT(INOUT) :: &
         & pval       ! Input array to be sum on input
                      ! pval(kn) = sum (pval) on output
                      ! pval(1)...pval(kn-1) = rounding errors on output
      REAL(wp) :: &
         & pcorr, &   ! Sum of rounding errors
         & perr       ! Sum of absolute rounding errors
      !! * Local declarations
      REAL(wp) :: &
         & zx, &
         & zz, &
         & zpsum
      integer :: &
         & jj

      pcorr = 0.0_wp
      perr  = 0.0_wp
      
      zpsum = pval(1)
      
      DO jj = 2, kn

         ! It is vital that these 4 lines are not optimized in any way that
         ! changes the results.

         zx         = pval(jj) + zpsum
         zz         = zx - pval(jj)
         pval(jj-1) = ( pval(jj) - ( zx - zz ) ) + ( zpsum - zz ) 
         zpsum      = zx

         ! Accumulate the correction and the error
         
         pcorr      = pcorr + pval(jj-1)
         perr       = perr + ABS( pval(jj-1) )

      END DO

      pval(kn) = zpsum
      
   END SUBROUTINE comp_sum

   FUNCTION fround(pres)
      !!----------------------------------------------------------------------
      !!               ***  ROUTINE fround ***
      !!          
      !! ** Purpose : Rounding of floating-point number
      !!
      !! ** Method  : Returns the value of PRES rounded to the nearest 
      !!              floating-point number that has its last three bits zero
      !!              This works on big-endian and little-endian machines.
      !!
      !! ** Action  : 
      !!
      !! References : M. Fisher (ECMWF) IFS code + personal communication
      !!
      !! History :
      !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS.
      !!----------------------------------------------------------------------
      !! * Function return
      REAL(wp) fround
      !! * Arguments
      REAL(wp), INTENT(IN) :: &
         & pres      ! Value to be rounded
      !! * Local declarations
      REAL(wp) :: &
         & zz(2), &
         & zup,   &
         & zdown
      INTEGER :: &
         & ii(2),         & 
         & iequiv(8),     &
         & ints_per_real, &
         & i_low_word
      INTEGER :: &
         & jj

      ii(:) = 1
      zz(:) = 1.0_wp

      ! Warning: If wp = 64 bits (or 32 bits for key_sp) this will not work.

#if defined key_sp
      ints_per_real = 32 / BIT_SIZE(ii)
#else
      ints_per_real = 64 / BIT_SIZE(ii)
#endif

      ! Test whether big-endian or little-endian
      
      zup = -1.0_wp
      iequiv(1:ints_per_real) = TRANSFER(zup,iequiv(1:ints_per_real))
      
      IF ( iequiv(1) == 0 ) THEN
         i_low_word = 1                ! Little-endian
      ELSE
         i_low_word = ints_per_real    ! Big-endian
      ENDIF

      ! Find the nearest number with all 3 lowest-order bits zeroed
      
      iequiv(1:ints_per_real) = transfer(pres,iequiv(1:ints_per_real))
      zup    = pres
      zdown  = pres

      IF (IBITS(iequiv(i_low_word),0,3)/=0) THEN

         DO jj = 1, 4

            zup = NEAREST( zup, 1.0_wp )
            iequiv(1:ints_per_real) = TRANSFER( zup, iequiv(1:ints_per_real) )

            IF ( IBITS( iequiv(i_low_word), 0, 3 ) == 0 ) EXIT

            zdown = NEAREST( zdown, -1.0 )

            iequiv(1:ints_per_real) = TRANSFER( zdown, iequiv(1:ints_per_real))

            IF ( IBITS( iequiv(i_low_word),0,3) == 0 ) EXIT

         END DO

         IF ( IBITS( iequiv( i_low_word ), 0, 3) /= 0 ) THEN

            CALL ctl_stop('Fround:','This is not possible')

         ENDIF
         
      ENDIF
      
      fround = TRANSFER( iequiv(1:ints_per_real), pres )
      
   END FUNCTION fround

END MODULE mppsum