MODULE mppsum
   !!======================================================================
   !!                       ***  MODULE mppsum  ***
   !! NEMO: Summation of arrays across processors
   !!======================================================================

   !!----------------------------------------------------------------------
   !!   mpp_sum_indep : Order independent MPP reproducible sum
   !!   comp_sum      : Compensated (i.e. accurate) summation
   !!   fround        : Rounding of a floating-point number
   !!----------------------------------------------------------------------
   !! * Modules used
   USE par_kind, ONLY : &       ! Precision variables
      & wp
   USE dom_oce, ONLY : &        ! Ocean space and time domain variables
      & nproc
   USE par_oce, ONLY : &        ! Ocean parameters
      & jpnij
   USE lib_mpp
   USE mppallgatherv
   USE in_out_manager

   IMPLICIT NONE

   !! * Routine accessibility
   PRIVATE

   PUBLIC &
      & mpp_sum_indep, &        ! Order independent MPP reproducible sum
      & comp_sum,      &        ! Perform compensated (i.e. accurate) summation.
      & fround                  ! Rounding of floating-point number

CONTAINS

   FUNCTION mpp_sum_indep( pval, kn )
      !!----------------------------------------------------------------------
      !!               ***  ROUTINE mpp_sum_indep ***
      !!
      !! ** Purpose : Sum all elements in the pval array in
      !!              an accurate order-independent way.
      !!
      !! ** Method  : The code iterates the compensated summation until the
      !!              result is guaranteed to be within 4*eps of the true sum.
      !!              It then rounds the result to the nearest floating-point
      !!              number whose last three bits are zero, thereby
      !!              guaranteeing an order-independent result.
      !!
      !!              Each pass applies comp_sum to the local array, gathers
      !!              the (partial sum, absolute error, correction) triplet of
      !!              every processor, applies comp_sum to the gathered
      !!              partial sums and evaluates the error bound. The
      !!              transformed partial sum of this processor is written
      !!              back into the last local element for the next pass.
      !!
      !! ** Action  : This does only work for MPI.
      !!              It does not work for SHMEM.
      !!
      !! ** Arguments :
      !!              pval : local values to be summed (not modified)
      !!              kn   : number of local values; may be zero, in which
      !!                     case this processor contributes 0.0
      !!
      !! ** Returns : the rounded global sum, see fround.
      !!
      !! References : M. Fisher (ECMWF): IFS code + personal communication
      !!              The algorithm is based on Ogita et al. (2005)
      !!              SIAM J. Sci. Computing, Vol.26, No.6, pp1955-1988.
      !!              This is based in turn on an algorithm
      !!              by Knuth (1969, seminumerical algorithms).
      !!
      !! History :
      !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS.
      !!----------------------------------------------------------------------
      !! * Function return
      REAL(wp) :: mpp_sum_indep
      !! * Arguments
      INTEGER, INTENT(IN) :: &
         & kn                   ! Number of local elements
      REAL(wp), DIMENSION(kn), INTENT(IN) :: &
         & pval                 ! Local elements to be summed
      !! * Local declarations
      REAL(wp), DIMENSION(3) :: &
         & zbuffl               ! Local (partial sum, abs. error, correction)
      REAL(wp), DIMENSION(:), ALLOCATABLE :: &
         & zpsums, &            ! Partial sums of all processors
         & zperrs, &            ! Absolute rounding errors of all processors
         & zpcors, &            ! Corrections of all processors
         & zbuffg, &            ! Gathered zbuffl triplets
         & zp                   ! Working copy of pval
      REAL(wp) :: &
         & zcorr,   &
         & zerr,    &
         & zolderr, &
         & zbeta,   &
         & zres,    &
         & zneps                ! 2 * (global number of elements) * epsilon
      INTEGER, DIMENSION(:), ALLOCATABLE :: &
         & irecv, &
         & istart
      INTEGER :: &
         & ing,   &             ! Global number of elements
         & ilast                ! Index of the last element of zp
      INTEGER :: &
         & jj

      ! Get global number of elements
      ing = kn
#ifdef key_mpp
      CALL mpp_sum( ing )
#endif

      ! Check that the algorithm can work. The factor is formed in working
      ! precision from REAL(ing) so that 2*ing cannot overflow as an integer.

      zneps = 2.0_wp * REAL( ing, wp ) * EPSILON( zres )

      IF ( zneps >= 1.0_wp ) THEN

         CALL ctl_stop('mpp_sum_indep:', &
            &          'kn is too large to guarantee error bounds')

      ENDIF

      ! zp always holds at least one element so that a processor without
      ! data still has a slot carrying its share of the global sum.

      ilast = MAX( kn, 1 )

      ALLOCATE( &
         & zp(ilast),                   &
         & zbuffg(jpnij*SIZE(zbuffl)),  &
         & zpsums(jpnij),               &
         & zperrs(jpnij),               &
         & zpcors(jpnij)                &
         & )

      ! Work arrays for the gather; only needed with more than one processor

      IF ( jpnij > 1 ) THEN

         ALLOCATE( &
            & irecv(jpnij),  &
            & istart(jpnij)  &
            & )

      ENDIF

      zolderr = HUGE(zerr)

      ! Copy the input array. This avoids some tricky indexing, at the
      ! expense of some inefficiency.

      IF ( kn > 0 ) THEN

         zp(:) = pval(:)

      ELSE

         zp(1) = 0.0_wp

      ENDIF

      k_loop: DO

         ! Transform local arrays and set up the local contribution

         IF ( kn > 0 ) THEN

            CALL comp_sum ( zp, kn, zcorr, zerr )

            zbuffl(2) = zerr
            zbuffl(3) = zcorr

         ELSE

            zbuffl(2) = 0.0_wp
            zbuffl(3) = 0.0_wp

         ENDIF

         zbuffl(1) = zp(ilast)

         ! Gather partial sums and error bounds to all processors

         IF ( jpnij > 1 ) THEN

            CALL mpp_allgatherv( zbuffl, SIZE(zbuffl), &
               &                 zbuffg, jpnij * SIZE(zbuffl), irecv, istart )

            DO jj = 1, jpnij

               zpsums(jj) = zbuffg(1+(jj-1)*SIZE(zbuffl))
               zperrs(jj) = zbuffg(2+(jj-1)*SIZE(zbuffl))
               zpcors(jj) = zbuffg(3+(jj-1)*SIZE(zbuffl))

            END DO

         ELSE

            zpsums(1) = zbuffl(1)
            zperrs(1) = zbuffl(2)
            zpcors(1) = zbuffl(3)

         ENDIF

         ! Transform partial sums

         CALL comp_sum( zpsums, jpnij, zcorr, zerr )
         zerr  = zerr  + SUM(zperrs)
         zcorr = zcorr + SUM(zpcors)

         ! Calculate final result

         zres = zpsums(jpnij) + zcorr

         ! Calculate error bound. This is corollary 4.7 from Ogita et al.
         ! (2005)

         zbeta = zerr * zneps / ( 1.0_wp - zneps )

         zerr = EPSILON(zres) * ABS(zres) &
            & +(zbeta + ( 2.0_wp * EPSILON(zres) * EPSILON(zres) * ABS(zres) &
            & +3.0_wp * TINY(zres) ) )

         ! Update the last element of the local array

         zp(ilast) = zpsums(nproc+1)

         ! Exit if the global error is small enough

         IF ( zerr < 4.0_wp * SPACING(zres) ) EXIT k_loop

         ! Take appropriate action if ZRES cannot be sufficiently refined.
         ! The test is written as .NOT.( < ) so that a NaN error bound is
         ! also caught. The loop is left after the report: if ctl_stop
         ! returns to the caller, a stagnating bound would otherwise make
         ! this loop run forever. zerr is built from gathered data only,
         ! so this branch depends on nothing local to one processor.

         IF ( .NOT. ( zerr < zolderr ) ) THEN

            CALL ctl_stop('Failed to refine sum', &
               &          'Warning: Possiblity of non-reproducible results')

            EXIT k_loop

         ENDIF

         zolderr = zerr

      ENDDO k_loop

      ! At this stage, we have guaranteed that ZRES is less than 4*EPS
      ! away from the exact sum (unless the refinement failed above).
      ! There are only four floating point numbers in this range. So, if
      ! we find the nearest number that has its last three bits zero,
      ! then we have a reproducible result.

      mpp_sum_indep = fround(zres)

      IF ( jpnij > 1 ) THEN

         DEALLOCATE( &
            & irecv,  &
            & istart  &
            & )

      ENDIF

      DEALLOCATE( &
         & zpcors, &
         & zperrs, &
         & zpsums, &
         & zbuffg, &
         & zp      &
         & )

   END FUNCTION mpp_sum_indep

   SUBROUTINE comp_sum( pval, kn, pcorr, perr )
      !!----------------------------------------------------------------------
      !!               ***  ROUTINE comp_sum ***
      !!
      !! ** Purpose : To perform compensated (i.e. accurate) summation.
      !!
      !! ** Method  : This routine transforms the elements of the array pval,
      !!              such that:
      !!              1) pval(kn) contains sum(pval)
      !!              2) pval(1)...pval(kn-1) contain the rounding errors
      !!                 that were made in calculating sum(pval).
      !!              3) The exact sum of the elements of pval is unmodified.
      !!              On return, pcorr contains the sum of the rounding errors,
      !!              perr contains the sum of their absolute values.
      !!              After calling this routine, an accurate sum of the
      !!              elements of pval can be calculated as
      !!              res = pval(kn) + pcorr.
      !!
      !! ** Action  : pval is overwritten as described above. If kn < 1
      !!              there is nothing to sum: pval is not touched and
      !!              pcorr = perr = 0 are returned.
      !!
      !! References : M. Fisher (ECMWF) IFS code + personal communications
      !!
      !! History :
      !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS
      !!----------------------------------------------------------------------
      !! * Arguments
      INTEGER, INTENT(IN) :: &
         & kn                   ! Number of elements in input array
      REAL(wp), DIMENSION(kn), INTENT(INOUT) :: &
         & pval                 ! Input array to be summed on input
                                ! pval(kn) = sum (pval) on output
                                ! pval(1)...pval(kn-1) = rounding errors on output
      REAL(wp), INTENT(OUT) :: &
         & pcorr, &             ! Sum of rounding errors
         & perr                 ! Sum of absolute rounding errors
      !! * Local declarations
      REAL(wp) :: &
         & zx, &
         & zz, &
         & zpsum                ! Running floating-point sum
      INTEGER :: &
         & jj

      pcorr = 0.0_wp
      perr  = 0.0_wp

      ! Nothing to do for an empty array (also avoids touching pval(1))

      IF ( kn < 1 ) RETURN

      zpsum = pval(1)

      DO jj = 2, kn

         ! It is vital that these 4 lines are not optimized in any way that
         ! changes the results.
         ! zx is the rounded sum, pval(jj-1) receives its rounding error.

         zx         = pval(jj) + zpsum
         zz         = zx - pval(jj)
         pval(jj-1) = ( pval(jj) - ( zx - zz ) ) + ( zpsum - zz )
         zpsum      = zx

         ! Accumulate the correction and the error

         pcorr = pcorr + pval(jj-1)
         perr  = perr  + ABS( pval(jj-1) )

      END DO

      pval(kn) = zpsum

   END SUBROUTINE comp_sum

   FUNCTION fround(pres)
      !!----------------------------------------------------------------------
      !!               ***  ROUTINE fround ***
      !!
      !! ** Purpose : Rounding of floating-point number
      !!
      !! ** Method  : Returns the value of PRES rounded to the nearest
      !!              floating-point number that has its last three bits zero
      !!              This works on big-endian and little-endian machines.
      !!
      !!              The bit pattern of the value is inspected through
      !!              default integers. Neighbouring floating-point numbers
      !!              are visited alternately upwards and downwards (up
      !!              first) until one with three zero low-order bits is
      !!              found.
      !!
      !! ** Action  : ctl_stop is called if no such neighbour is found within
      !!              four steps in each direction.
      !!
      !! References : M. Fisher (ECMWF) IFS code + personal communication
      !!
      !! History :
      !!        !  07-07  (K. Mogensen)  Original code heavily based on IFS.
      !!----------------------------------------------------------------------
      !! * Function return
      REAL(wp) :: fround
      !! * Arguments
      REAL(wp), INTENT(IN) :: &
         & pres                 ! Value to be rounded
      !! * Local declarations
      REAL(wp) :: &
         & zup,  &              ! Candidate above pres
         & zdown                ! Candidate below pres
      INTEGER :: &
         & iequiv(8),     &     ! Integer image of a real value
         & ints_per_real, &     ! Number of default integers per real
         & i_low_word           ! Index of the word holding the lowest bits
      INTEGER :: &
         & jj

      ! Warning: this assumes that wp is 64 bits wide (32 bits with key_sp).
      ! It will not work for any other width.

#if defined key_sp
      ints_per_real = 32 / BIT_SIZE(iequiv)
#else
      ints_per_real = 64 / BIT_SIZE(iequiv)
#endif

      ! Test whether big-endian or little-endian: the first integer word of
      ! the image of -1.0 is tested against zero.

      zup = -1.0_wp
      iequiv(1:ints_per_real) = TRANSFER( zup, iequiv(1:ints_per_real) )

      IF ( iequiv(1) == 0 ) THEN
         i_low_word = 1                ! Little-endian
      ELSE
         i_low_word = ints_per_real    ! Big-endian
      ENDIF

      ! Find the nearest number with all 3 lowest-order bits zeroed

      iequiv(1:ints_per_real) = TRANSFER( pres, iequiv(1:ints_per_real) )
      zup   = pres
      zdown = pres

      IF ( IBITS( iequiv(i_low_word), 0, 3 ) /= 0 ) THEN

         DO jj = 1, 4

            ! One step upwards ...

            zup = NEAREST( zup, 1.0_wp )
            iequiv(1:ints_per_real) = TRANSFER( zup, iequiv(1:ints_per_real) )

            IF ( IBITS( iequiv(i_low_word), 0, 3 ) == 0 ) EXIT

            ! ... then one step downwards

            zdown = NEAREST( zdown, -1.0_wp )
            iequiv(1:ints_per_real) = TRANSFER( zdown, iequiv(1:ints_per_real) )

            IF ( IBITS( iequiv(i_low_word), 0, 3 ) == 0 ) EXIT

         END DO

         IF ( IBITS( iequiv(i_low_word), 0, 3 ) /= 0 ) THEN

            CALL ctl_stop('Fround:','This is not possible')

         ENDIF

      ENDIF

      ! iequiv holds the image of the last candidate examined

      fround = TRANSFER( iequiv(1:ints_per_real), pres )

   END FUNCTION fround

END MODULE mppsum