1 | MODULE mppsum |
---|
2 | !!====================================================================== |
---|
3 | !! *** MODULE mppsum *** |
---|
4 | !! NEMO: Summation of arrays across processors |
---|
5 | !!====================================================================== |
---|
6 | |
---|
7 | !!---------------------------------------------------------------------- |
---|
8 | !! mppsum : Order independent MPP reproducible sum |
---|
9 | !!---------------------------------------------------------------------- |
---|
10 | !! * Modules used |
---|
11 | USE par_kind, ONLY : & ! Precision variables |
---|
12 | & wp |
---|
13 | USE dom_oce, ONLY : & ! Ocean space and time domain variables |
---|
14 | & nproc |
---|
15 | USE par_oce, ONLY : & ! Ocean parameters |
---|
16 | & jpnij |
---|
17 | USE lib_mpp |
---|
18 | USE mppallgatherv |
---|
19 | USE in_out_manager |
---|
20 | |
---|
21 | IMPLICIT NONE |
---|
22 | |
---|
23 | !! * Routine accessibility |
---|
24 | PRIVATE |
---|
25 | |
---|
26 | PUBLIC & |
---|
27 | & mpp_sum_indep, & ! Order independent MPP reproducible sum |
---|
28 | & comp_sum, & ! Perform compensated (i.e. accurate) summation. |
---|
29 | & fround ! Rounding of floating-point number |
---|
30 | |
---|
31 | CONTAINS |
---|
32 | |
---|
FUNCTION mpp_sum_indep( pval, kn )
   !!----------------------------------------------------------------------
   !!               ***  ROUTINE mpp_sum_indep ***
   !!
   !! ** Purpose : Sum all elements in the pval array in
   !!              an accurate order-independent way.
   !!
   !! ** Method  : The code iterates the compensated summation until the
   !!              result is guaranteed to be within 4*eps of the true sum.
   !!              It then rounds the result to the nearest floating-point
   !!              number whose last three bits are zero, thereby
   !!              guaranteeing an order-independent result.
   !!
   !!              Each pass (1) applies comp_sum to the local copy of the
   !!              data, (2) gathers the local sum, error and correction
   !!              from all jpnij processors, (3) applies comp_sum to the
   !!              gathered partial sums and (4) evaluates the error bound.
   !!              If the bound stops decreasing, ctl_stop is called and
   !!              the iteration is abandoned; the value returned in that
   !!              case is NOT guaranteed to be reproducible.
   !!
   !! ** Arguments :
   !!              pval : local values to be summed (not modified)
   !!              kn   : number of local values; may be zero
   !!
   !! ** Action  : This does only work for MPI.
   !!              It does not work for SHMEM.
   !!
   !! References : M. Fisher (ECMWF): IFS code + personal communication
   !!              The algorithm is based on Ogita et al. (2005)
   !!              SIAM J. Sci. Computing, Vol.26, No.6, pp1955-1988.
   !!              This is based in turn on an algorithm
   !!              by Knuth (1969, seminumerical algorithms).
   !!
   !! History :
   !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS.
   !!----------------------------------------------------------------------
   !! * Function return
   REAL(wp) :: mpp_sum_indep
   !! * Arguments
   INTEGER, INTENT(IN) :: &
      & kn                   ! Number of local elements
   REAL(wp), DIMENSION(kn), INTENT(IN) :: &
      & pval                 ! Local elements to be summed
   !! * Local declarations
   REAL(wp), DIMENSION(3) :: &
      & zbuffl               ! Local (sum, error, correction) triplet
   REAL(wp), DIMENSION(:), ALLOCATABLE :: &
      & zpsums, &            ! Partial sums, one per processor
      & zperrs, &            ! Partial error sums, one per processor
      & zpcors, &            ! Partial corrections, one per processor
      & zbuffg, &            ! Gathered triplets of all processors
      & zp                   ! Working copy of pval
   REAL(wp) :: &
      & zcorr,   &
      & zerr,    &
      & zolderr, &
      & zbeta,   &
      & zres
   INTEGER, DIMENSION(:), ALLOCATABLE :: &
      & irecv, &
      & istart
   INTEGER :: &
      & ing                  ! Global number of elements
   INTEGER :: &
      & jj

   ! Initialise so that compilers trapping uninitialised variables do
   ! not complain.
   zres      = 0.0_wp
   zerr      = 0.0_wp
   zcorr     = 0.0_wp
   zbuffl(:) = 0.0_wp

   ! Get global number of elements
   ing = kn
#ifdef key_mpp
   CALL mpp_sum( ing )
#endif

   ! Check that the algorithm can work: the error bound below divides
   ! by ( 1 - 2*ing*eps ), which must stay positive.
   IF ( ( REAL( 2 * ing, wp ) * EPSILON( zres ) ) >= 1.0_wp ) THEN
      CALL ctl_stop('mpp_sum_indep:', &
         &          'kn is too large to guarantee error bounds')
   ENDIF

   ALLOCATE( &
      & zp(MAX(kn,1)),              &
      & zbuffg(jpnij*SIZE(zbuffl)), &
      & zpsums(jpnij),              &
      & zperrs(jpnij),              &
      & zpcors(jpnij)               &
      & )

   ! Work arrays for the gather are only needed with several processors;
   ! allocate them once instead of on every pass of the loop.
   IF ( jpnij > 1 ) THEN
      ALLOCATE( &
         & irecv(jpnij),  &
         & istart(jpnij)  &
         & )
   ENDIF

   zolderr = HUGE(zerr)

   ! Copy the input array. This avoids some tricky indexing, at the
   ! expense of some inefficiency. A processor without data contributes
   ! a single zero.
   IF ( kn > 0 ) THEN
      zp(:) = pval(:)
   ELSE
      zp(1) = 0.0_wp
   ENDIF

   k_loop: DO

      ! Transform local arrays and build the local triplet
      IF ( kn > 0 ) THEN
         CALL comp_sum ( zp, kn, zcorr, zerr )
         zbuffl(2) = zerr
         zbuffl(3) = zcorr
      ELSE
         zbuffl(2) = 0.0_wp
         zbuffl(3) = 0.0_wp
      ENDIF
      zbuffl(1) = zp(MAX(kn,1))

      ! Gather partial sums and error bounds to all processors
      IF ( jpnij > 1 ) THEN

         CALL mpp_allgatherv( zbuffl, SIZE(zbuffl), &
            &                 zbuffg, jpnij * SIZE(zbuffl), irecv, istart )

         DO jj = 1, jpnij
            zpsums(jj) = zbuffg(1+(jj-1)*SIZE(zbuffl))
            zperrs(jj) = zbuffg(2+(jj-1)*SIZE(zbuffl))
            zpcors(jj) = zbuffg(3+(jj-1)*SIZE(zbuffl))
         END DO

      ELSE

         zpsums(1) = zbuffl(1)
         zperrs(1) = zbuffl(2)
         zpcors(1) = zbuffl(3)

      ENDIF

      ! Transform partial sums
      CALL comp_sum( zpsums, jpnij, zcorr, zerr )
      zerr  = zerr  + SUM(zperrs)
      zcorr = zcorr + SUM(zpcors)

      ! Calculate final result
      zres = zpsums(jpnij) + zcorr

      ! Calculate error bound. This is corollary 4.7 from Ogita et al.
      ! (2005)
      zbeta = zerr *( REAL( 2*ing, wp ) * EPSILON(zres) ) &
         &   /(1.0_wp - REAL( 2*ing, wp ) * EPSILON(zres) )

      zerr = EPSILON(zres) * ABS(zres) &
         & +(zbeta + ( 2.0_wp * EPSILON(zres) * EPSILON(zres) * ABS(zres) &
         &            +3.0_wp * TINY(zres) ) )

      ! Update the last element of the local array with this processor's
      ! entry of the transformed partial sums
      zp(MAX(kn,1)) = zpsums(nproc+1)

      ! Exit if the global error is small enough
      IF ( zerr < 4.0_wp * SPACING(zres) ) EXIT k_loop

      ! Take appropriate action if ZRES cannot be sufficiently refined.
      ! ctl_stop is defined outside this module; should it return to the
      ! caller, the explicit EXIT guarantees that the loop terminates
      ! instead of iterating forever on an error that no longer shrinks.
      IF ( zerr >= zolderr ) THEN
         CALL ctl_stop('Failed to refine sum', &
            &          'Warning: Possiblity of non-reproducible results')
         EXIT k_loop
      ENDIF

      zolderr = zerr

   ENDDO k_loop

   ! On normal exit ZRES is less than 4*EPS away from the exact sum.
   ! There are only four floating point numbers in this range. So, if we
   ! find the nearest number that has its last three bits zero, then we
   ! have a reproducible result. (After a failed refinement this
   ! guarantee does not hold.)
   mpp_sum_indep = fround(zres)

   IF ( jpnij > 1 ) THEN
      DEALLOCATE( &
         & irecv,  &
         & istart  &
         & )
   ENDIF

   DEALLOCATE( &
      & zpcors, &
      & zperrs, &
      & zpsums, &
      & zbuffg, &
      & zp      &
      & )

END FUNCTION mpp_sum_indep
241 | |
---|
SUBROUTINE comp_sum( pval, kn, pcorr, perr )
   !!----------------------------------------------------------------------
   !!               ***  ROUTINE comp_sum ***
   !!
   !! ** Purpose : To perform compensated (i.e. accurate) summation.
   !!
   !! ** Method  : The elements of the array pval are transformed in
   !!              place, such that:
   !!              1) pval(kn) contains sum(pval)
   !!              2) pval(1)...pval(kn-1) contain the rounding errors
   !!                 that were made in calculating sum(pval).
   !!              3) The exact sum of the elements of pval is unmodified.
   !!              On return, pcorr contains the sum of the rounding errors,
   !!              perr contains the sum of their absolute values.
   !!              After calling this routine, an accurate sum of the
   !!              elements of pval can be calculated as res=pval(kn)+pcorr.
   !!
   !!              If kn < 1 there is nothing to sum: pcorr and perr are
   !!              returned as zero and pval is not referenced.
   !!
   !! ** Action  :
   !!
   !! References : M. Fisher (ECMWF) IFS code + personal communications
   !!
   !! History :
   !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS
   !!----------------------------------------------------------------------
   !! * Arguments
   INTEGER, INTENT(IN) :: &
      & kn                   ! Number of elements in input array
   REAL(wp), DIMENSION(kn), INTENT(INOUT) :: &
      & pval                 ! Input array to be summed on input
                             ! pval(kn) = sum (pval) on output
                             ! pval(1)...pval(kn-1) = rounding errors on output
   REAL(wp), INTENT(OUT) :: &
      & pcorr, &             ! Sum of rounding errors
      & perr                 ! Sum of absolute rounding errors
   !! * Local declarations
   REAL(wp) :: &
      & zx, &
      & zz, &
      & zpsum                ! Running (rounded) sum
   INTEGER :: &
      & jj

   pcorr = 0.0_wp
   perr  = 0.0_wp

   ! Nothing to do for an empty array; this also avoids the out-of-bounds
   ! references pval(1) and pval(kn) below.
   IF ( kn < 1 ) RETURN

   zpsum = pval(1)

   DO jj = 2, kn

      ! It is vital that these 4 lines are not optimized in any way that
      ! changes the results: they compute the rounded sum zx and recover
      ! the exact rounding error of that addition into pval(jj-1).

      zx         = pval(jj) + zpsum
      zz         = zx - pval(jj)
      pval(jj-1) = ( pval(jj) - ( zx - zz ) ) + ( zpsum - zz )
      zpsum      = zx

      ! Accumulate the correction and the error

      pcorr = pcorr + pval(jj-1)
      perr  = perr  + ABS( pval(jj-1) )

   END DO

   pval(kn) = zpsum

END SUBROUTINE comp_sum
309 | |
---|
FUNCTION fround( pres )
   !!----------------------------------------------------------------------
   !!               ***  ROUTINE fround ***
   !!
   !! ** Purpose : Rounding of floating-point number
   !!
   !! ** Method  : Returns the value of PRES rounded to the nearest
   !!              floating-point number that has its last three bits zero.
   !!              The bit pattern of the real is inspected through an
   !!              integer array obtained with TRANSFER. Candidates are
   !!              generated with NEAREST, alternately one step above and
   !!              one step below PRES (the upper candidate is tested
   !!              first), for at most four steps in each direction.
   !!              This works on big-endian and little-endian machines.
   !!              The number of integers occupied by a REAL(wp) is taken
   !!              from the size of the TRANSFER result, so no assumption
   !!              is made here about the width of wp.
   !!
   !! ** Action  :
   !!
   !! References : M. Fisher (ECMWF) IFS code + personal communication
   !!
   !! History :
   !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS.
   !!----------------------------------------------------------------------
   !! * Function return
   REAL(wp) :: fround
   !! * Arguments
   REAL(wp), INTENT(IN) :: &
      & pres                 ! Value to be rounded
   !! * Local declarations
   REAL(wp) :: &
      & zup, &               ! Candidate above pres
      & zdown                ! Candidate below pres
   INTEGER :: &
      & iequiv(8),     &     ! Integer image of a REAL(wp)
      & ints_per_real, &     ! Number of integers holding one REAL(wp)
      & i_low_word           ! Index of the word holding the lowest bits
   INTEGER :: &
      & jj

   ! Number of default integers needed to hold one REAL(wp). TRANSFER
   ! with an array mold returns the smallest array able to hold the
   ! source, so its size is exactly that number.

   ints_per_real = SIZE( TRANSFER( pres, iequiv ) )

   ! Test whether big-endian or little-endian: the low-order bits of
   ! -1.0 are all zero, so a zero first word means that the low-order
   ! word is stored first.

   zup = -1.0_wp
   iequiv(1:ints_per_real) = TRANSFER( zup, iequiv(1:ints_per_real) )

   IF ( iequiv(1) == 0 ) THEN
      i_low_word = 1                ! Little-endian
   ELSE
      i_low_word = ints_per_real    ! Big-endian
   ENDIF

   ! Find the nearest number with all 3 lowest-order bits zeroed

   iequiv(1:ints_per_real) = TRANSFER( pres, iequiv(1:ints_per_real) )
   zup   = pres
   zdown = pres

   IF ( IBITS( iequiv(i_low_word), 0, 3 ) /= 0 ) THEN

      DO jj = 1, 4

         ! One step up
         zup = NEAREST( zup, 1.0_wp )
         iequiv(1:ints_per_real) = TRANSFER( zup, iequiv(1:ints_per_real) )

         IF ( IBITS( iequiv(i_low_word), 0, 3 ) == 0 ) EXIT

         ! One step down
         zdown = NEAREST( zdown, -1.0_wp )
         iequiv(1:ints_per_real) = TRANSFER( zdown, iequiv(1:ints_per_real) )

         IF ( IBITS( iequiv(i_low_word), 0, 3 ) == 0 ) EXIT

      END DO

      ! iequiv holds the last candidate tested; it must now qualify
      IF ( IBITS( iequiv(i_low_word), 0, 3 ) /= 0 ) THEN
         CALL ctl_stop('Fround:','This is not possible')
      ENDIF

   ENDIF

   ! Convert the selected bit pattern back to a real
   fround = TRANSFER( iequiv(1:ints_per_real), pres )

END FUNCTION fround
401 | |
---|
402 | END MODULE mppsum |
---|