[2578] | 1 | MODULE mppsum |
---|
| 2 | !!====================================================================== |
---|
| 3 | !! *** MODULE mppsum *** |
---|
| 4 | !! NEMO: Summation of arrays across processors |
---|
| 5 | !!====================================================================== |
---|
| 6 | |
---|
| 7 | !!---------------------------------------------------------------------- |
---|
| 8 | !! mpp_sum_indep : Order independent MPP reproducible sum |
---|
| 9 | !!---------------------------------------------------------------------- |
---|
| 10 | !! * Modules used |
---|
| 11 | USE par_kind, ONLY : & ! Precision variables |
---|
| 12 | & wp |
---|
| 13 | USE dom_oce, ONLY : & ! Ocean space and time domain variables |
---|
| 14 | & nproc |
---|
| 15 | USE par_oce, ONLY : & ! Ocean parameters |
---|
| 16 | & jpnij |
---|
| 17 | USE lib_mpp |
---|
| 18 | USE mppallgatherv |
---|
| 19 | USE in_out_manager |
---|
| 20 | |
---|
| 21 | IMPLICIT NONE |
---|
| 22 | |
---|
| 23 | !! * Routine accessibility |
---|
| 24 | PRIVATE |
---|
| 25 | |
---|
| 26 | PUBLIC & |
---|
| 27 | & mpp_sum_indep, & ! Order independent MPP reproducible sum |
---|
| 28 | & comp_sum, & ! Perform compensated (i.e. accurate) summation. |
---|
| 29 | & fround ! Rounding of floating-point number |
---|
| 30 | |
---|
| 31 | CONTAINS |
---|
| 32 | |
---|
FUNCTION mpp_sum_indep( pval, kn )
   !!----------------------------------------------------------------------
   !!               ***  ROUTINE mpp_sum_indep ***
   !!
   !! ** Purpose : Sum all elements in the pval array (over all
   !!              processors) in an accurate order-independent way.
   !!
   !! ** Method  : The code iterates the compensated summation until the
   !!              result is guaranteed to be within 4*eps of the true sum.
   !!              It then rounds the result to the nearest floating-point
   !!              number whose last three bits are zero, thereby
   !!              guaranteeing an order-independent result.
   !!
   !!              Each pass:
   !!                1) transforms the local copy of pval with comp_sum,
   !!                2) gathers (partial sum, error, correction) from every
   !!                   processor,
   !!                3) transforms the gathered partial sums with comp_sum,
   !!                4) evaluates the error bound and either exits or
   !!                   feeds this processor's transformed partial sum
   !!                   back into its local array for another pass.
   !!
   !!              If the error bound stops decreasing (or is NaN) the
   !!              failure is reported through ctl_stop and the loop is
   !!              left, so the routine always terminates.  In that case
   !!              the returned value is NOT guaranteed to be reproducible.
   !!
   !! ** Action  : This does only work for MPI.
   !!              It does not work for SHMEM.
   !!
   !! ** Arguments :
   !!              pval : local values to be summed (size kn, may be empty)
   !!              kn   : number of local values
   !!
   !! ** Return  : the rounded global sum (see fround)
   !!
   !! References : M. Fisher (ECMWF): IFS code + personal communication
   !!              The algorithm is based on Ogita et al. (2005)
   !!              SIAM J. Sci. Computing, Vol.26, No.6, pp1955-1988.
   !!              This is based in turn on an algorithm
   !!              by Knuth (1969, seminumerical algorithms).
   !!
   !! History :
   !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS.
   !!----------------------------------------------------------------------
   !! * Function return
   REAL(wp) :: mpp_sum_indep
   !! * Arguments
   INTEGER, INTENT(IN) :: &
      & kn
   REAL(wp), DIMENSION(kn), INTENT(IN) :: &
      & pval
   !! * Local declarations
   REAL(wp), DIMENSION(3) :: &
      & zbuffl           ! Local (partial sum, error, correction) triple
   REAL(wp), DIMENSION(:), ALLOCATABLE :: &
      & zpsums, &        ! Partial sums of all processors
      & zperrs, &        ! Error sums of all processors
      & zpcors, &        ! Correction sums of all processors
      & zbuffg, &        ! Gathered triples of all processors
      & zp               ! Working copy of pval
   REAL(wp) :: &
      & zcorr,   &
      & zerr,    &
      & zolderr, &
      & zbeta,   &
      & zres,    &
      & zneps            ! 2 * (global number of elements) * machine epsilon
   INTEGER, DIMENSION(:), ALLOCATABLE :: &
      & irecv, &
      & istart
   INTEGER :: &
      & ing,   &         ! Global number of elements
      & ilast            ! Index of the slot of zp holding the partial sum
   INTEGER :: &
      & jj

   ! Get global number of elements
   ing = kn
#ifdef key_mpp
   CALL mpp_sum( ing )
#endif

   ! Check that the algorithm can work.  The product is formed in
   ! REAL(wp) so that 2*ing cannot overflow the integer range.

   zneps = 2.0_wp * REAL( ing, wp ) * EPSILON( zres )

   IF ( zneps >= 1.0_wp ) THEN

      CALL ctl_stop('mpp_sum_indep:', &
         &          'kn is too large to guarantee error bounds')

   ENDIF

   ! A processor without data still needs one slot to carry its share
   ! of the transformed partial sums between iterations.
   ilast = MAX( kn, 1 )

   ALLOCATE( &
      & zp(ilast),                   &
      & zbuffg(jpnij*SIZE(zbuffl)),  &
      & zpsums(jpnij),               &
      & zperrs(jpnij),               &
      & zpcors(jpnij)                &
      & )

   ! Work arrays for the gather: allocated once, reused on every pass
   IF ( jpnij > 1 ) THEN

      ALLOCATE( &
         & irecv(jpnij), &
         & istart(jpnij) &
         & )

   ENDIF

   zolderr = HUGE(zerr)

   ! Copy the input array. This avoids some tricky indexing, at the
   ! expense of some inefficiency.

   IF ( kn > 0 ) THEN

      zp(:) = pval(:)

   ELSE

      zp(1) = 0.0_wp

   ENDIF

   k_loop: DO

      ! Transform local arrays

      IF ( kn > 0 ) THEN

         CALL comp_sum ( zp, kn, zcorr, zerr )

         zbuffl(2) = zerr
         zbuffl(3) = zcorr

      ELSE

         zbuffl(2) = 0.0_wp
         zbuffl(3) = 0.0_wp

      ENDIF

      zbuffl(1) = zp(ilast)

      ! Gather partial sums and error bounds to all processors

      IF ( jpnij > 1 ) THEN

         CALL mpp_allgatherv( zbuffl, SIZE(zbuffl), &
            &                 zbuffg, jpnij * SIZE(zbuffl), irecv, istart )

         DO jj = 1, jpnij

            zpsums(jj) = zbuffg(1+(jj-1)*SIZE(zbuffl))
            zperrs(jj) = zbuffg(2+(jj-1)*SIZE(zbuffl))
            zpcors(jj) = zbuffg(3+(jj-1)*SIZE(zbuffl))

         END DO

      ELSE

         zpsums(1) = zbuffl(1)
         zperrs(1) = zbuffl(2)
         zpcors(1) = zbuffl(3)

      ENDIF

      ! Transform partial sums

      CALL comp_sum( zpsums, jpnij, zcorr, zerr )
      zerr  = zerr  + SUM(zperrs)
      zcorr = zcorr + SUM(zpcors)

      ! Calculate final result

      zres = zpsums(jpnij) + zcorr

      ! Calculate error bound. This is corollary 4.7 from Ogita et al.
      ! (2005)

      zbeta = zerr * zneps / ( 1.0_wp - zneps )

      zerr = EPSILON(zres) * ABS(zres) &
         & +(zbeta + ( 2.0_wp * EPSILON(zres) * EPSILON(zres) * ABS(zres) &
         &            +3.0_wp * TINY(zres) ) )

      ! Update the last element of the local array with this processor's
      ! entry of the transformed partial sums

      zp(ilast) = zpsums(nproc+1)

      ! Exit if the global error is small enough

      IF ( zerr < 4.0_wp * SPACING(zres) ) EXIT k_loop

      ! Take appropriate action if ZRES cannot be sufficiently refined.
      ! The test is written as .NOT.( < ) so that a NaN error bound is
      ! caught as well.  The loop is left explicitly: whether ctl_stop
      ! returns is not visible from this module, and iterating again
      ! could never terminate once the bound has stopped improving.

      IF ( .NOT. ( zerr < zolderr ) ) THEN

         CALL ctl_stop('Failed to refine sum', &
            &          'Warning: Possiblity of non-reproducible results')

         EXIT k_loop

      ENDIF

      zolderr = zerr

   ENDDO k_loop

   ! On a normal exit ZRES is guaranteed to be less than 4*EPS away
   ! from the exact sum. There are only four floating point numbers in
   ! this range. So, if we find the nearest number that has its last
   ! three bits zero, then we have a reproducible result.

   mpp_sum_indep = fround(zres)

   IF ( ALLOCATED( irecv  ) ) DEALLOCATE( irecv  )
   IF ( ALLOCATED( istart ) ) DEALLOCATE( istart )

   DEALLOCATE( &
      & zpcors, &
      & zperrs, &
      & zpsums, &
      & zbuffg, &
      & zp      &
      & )

END FUNCTION mpp_sum_indep
---|
| 240 | |
---|
SUBROUTINE comp_sum( pval, kn, pcorr, perr )
   !!----------------------------------------------------------------------
   !!               ***  ROUTINE comp_sum ***
   !!
   !! ** Purpose : To perform compensated (i.e. accurate) summation.
   !!
   !! ** Method  : The elements of the array pval are transformed in
   !!              place, such that:
   !!              1) pval(kn) contains the floating-point sum of pval
   !!              2) pval(1)...pval(kn-1) contain the rounding errors
   !!                 that were made in calculating that sum.
   !!              3) The exact sum of the elements of pval is unmodified.
   !!              On return, pcorr contains the sum of the rounding errors,
   !!              perr contains the sum of their absolute values.
   !!              After calling this routine, an accurate sum of the
   !!              elements of pval can be calculated as
   !!              res = pval(kn) + pcorr.
   !!
   !! ** Action  : For kn < 1 the array is not touched and both pcorr
   !!              and perr are returned as zero.
   !!
   !! References : M. Fisher (ECMWF) IFS code + personal communications
   !!
   !! History :
   !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS
   !!----------------------------------------------------------------------
   !! * Arguments
   INTEGER, INTENT(IN) :: &
      & kn               ! Number of elements in input array
   REAL(wp), DIMENSION(kn), INTENT(INOUT) :: &
      & pval             ! Input array to be summed on input
                         ! pval(kn) = sum (pval) on output
                         ! pval(1)...pval(kn-1) = rounding errors on output
   REAL(wp), INTENT(OUT) :: &
      & pcorr, &         ! Sum of rounding errors
      & perr             ! Sum of absolute rounding errors
   !! * Local declarations
   REAL(wp) :: &
      & zx, &
      & zz, &
      & zpsum            ! Running floating-point sum
   INTEGER :: &
      & jj

   pcorr = 0.0_wp
   perr  = 0.0_wp

   ! An empty array has nothing to transform; returning here avoids
   ! touching pval(1) and pval(kn), which do not exist.
   IF ( kn < 1 ) RETURN

   zpsum = pval(1)

   DO jj = 2, kn

      ! It is vital that these 4 lines are not optimized in any way that
      ! changes the results: together they recover the exact rounding
      ! error of the addition pval(jj) + zpsum and store it in pval(jj-1).

      zx         = pval(jj) + zpsum
      zz         = zx - pval(jj)
      pval(jj-1) = ( pval(jj) - ( zx - zz ) ) + ( zpsum - zz )
      zpsum      = zx

      ! Accumulate the correction and the error

      pcorr = pcorr + pval(jj-1)
      perr  = perr  + ABS( pval(jj-1) )

   END DO

   pval(kn) = zpsum

END SUBROUTINE comp_sum
---|
| 308 | |
---|
FUNCTION fround(pres)
   !!----------------------------------------------------------------------
   !!               ***  ROUTINE fround ***
   !!
   !! ** Purpose : Rounding of floating-point number
   !!
   !! ** Method  : Returns the value of PRES rounded to the nearest
   !!              floating-point number that has its last three bits zero.
   !!              The bit pattern of the real is inspected by TRANSFERring
   !!              it to an array of default integers; the candidates are
   !!              generated with NEAREST, stepping alternately upwards and
   !!              downwards from PRES (the upward candidate is tested
   !!              first at each distance).
   !!              This works on big-endian and little-endian machines.
   !!
   !! ** Action  : The number of integer words per real is derived from
   !!              the storage of REAL(wp) itself, so the routine follows
   !!              whatever precision wp selects.
   !!
   !! References : M. Fisher (ECMWF) IFS code + personal communication
   !!
   !! History :
   !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS.
   !!----------------------------------------------------------------------
   !! * Function return
   REAL(wp) :: fround
   !! * Arguments
   REAL(wp), INTENT(IN) :: &
      & pres             ! Value to be rounded
   !! * Local declarations
   REAL(wp) :: &
      & zup, &           ! Candidate above pres
      & zdown            ! Candidate below pres
   INTEGER :: &
      & ii(2),         & ! Mold only: fixes the integer kind for TRANSFER
      & iequiv(8),     & ! Integer image of a REAL(wp) value
      & ints_per_real, & ! Number of default integers covering one REAL(wp)
      & i_low_word       ! Index in iequiv of the lowest-order word
   INTEGER :: &
      & jj

   ii(:) = 1

   ! TRANSFER to a rank-one integer mold without SIZE returns the
   ! smallest integer array whose storage is not shorter than that of
   ! the source, i.e. the number of default integers per REAL(wp).

   ints_per_real = SIZE( TRANSFER( pres, ii ) )

   ! Test whether big-endian or little-endian: the low-order bits of
   ! -1.0 are all zero, so a zero first word marks it as the low word.

   zup = -1.0_wp
   iequiv(1:ints_per_real) = TRANSFER( zup, iequiv(1:ints_per_real) )

   IF ( iequiv(1) == 0 ) THEN
      i_low_word = 1                ! Little-endian
   ELSE
      i_low_word = ints_per_real    ! Big-endian
   ENDIF

   ! Find the nearest number with all 3 lowest-order bits zeroed

   iequiv(1:ints_per_real) = TRANSFER( pres, iequiv(1:ints_per_real) )
   zup   = pres
   zdown = pres

   IF ( IBITS( iequiv(i_low_word), 0, 3 ) /= 0 ) THEN

      ! A suitable number lies at most 4 representable steps away in
      ! one of the two directions.

      DO jj = 1, 4

         zup = NEAREST( zup, 1.0_wp )
         iequiv(1:ints_per_real) = TRANSFER( zup, iequiv(1:ints_per_real) )

         IF ( IBITS( iequiv(i_low_word), 0, 3 ) == 0 ) EXIT

         zdown = NEAREST( zdown, -1.0_wp )
         iequiv(1:ints_per_real) = TRANSFER( zdown, iequiv(1:ints_per_real) )

         IF ( IBITS( iequiv(i_low_word), 0, 3 ) == 0 ) EXIT

      END DO

      IF ( IBITS( iequiv(i_low_word), 0, 3 ) /= 0 ) THEN

         CALL ctl_stop('Fround:','This is not possible')

      ENDIF

   ENDIF

   ! iequiv holds the image of the last candidate examined (or of pres
   ! itself when no rounding was needed)

   fround = TRANSFER( iequiv(1:ints_per_real), pres )

END FUNCTION fround
---|
| 400 | |
---|
| 401 | END MODULE mppsum |
---|