New URL for NEMO forge! http://forge.nemo-ocean.eu

Since March 2022 along with NEMO 4.2 release, the code development moved to a self-hosted GitLab.
This present forge is now archived and remained online for history.

mppsum.F90 in branches/TAM_V3_0/NEMO/OPA_SRC – NEMO

Context Navigation

source: branches/TAM_V3_0/NEMO/OPA_SRC/mppsum.F90 @ 1945

Last change on this file since 1945 was 1945, checked in by rblod, 14 years ago
adjustement for TAM branch
Property svn:executable set to ``*
File size: 11.6 KB

Line
1	MODULE mppsum
2	!!======================================================================
3	!! * MODULE mpp_sum *
4	!! NEMO: Summation of arrays across processors
5	!!======================================================================
6
7	!!----------------------------------------------------------------------
8	!! mppsum : Order independent MPP reproducible sum
9	!!----------------------------------------------------------------------
10	!! * Modules used
11	USE par_kind, ONLY : & ! Precision variables
12	& wp
13	USE dom_oce, ONLY : & ! Ocean space and time domain variables
14	& nproc
15	USE par_oce, ONLY : & ! Ocean parameters
16	& jpnij
17	USE lib_mpp
18	USE mppallgatherv
19	USE in_out_manager
20
21	IMPLICIT NONE
22
23	!! * Routine accessibility
24	PRIVATE
25
26	PUBLIC &
27	& mpp_sum_indep, & ! Order independent MPP reproducible sum
28	& comp_sum, & ! Perform compensated (i.e. accurate) summation.
29	& fround ! Rounding of floating-point number
30
31	CONTAINS
32
33	FUNCTION mpp_sum_indep( pval, kn )
34	!!----------------------------------------------------------------------
35	!! * ROUTINE mpp_sum_indep *
36	!!
37	!! ** Purpose : Sum all elements in the pval array in
38	!! an accurate order-independent way.
39	!!
40	!! ** Method : The code iterates the compensated summation until the
41	!! result is guaranteed to be within 4*eps of the true sum.
42	!! It then rounds the result to the nearest floating-point
43	!! number whose last three bits are zero, thereby
44	!! guaranteeing an order-independent result.
45	!!
46	!! ** Action : This does only work for MPI.
47	!! It does not work for SHMEM.
48	!!
49	!! References : M. Fisher (ECMWF): IFS code + personal communication
50	!! The algorithm is based on Ogita et al. (2005)
51	!! SIAM J. Sci. Computing, Vol.26, No.6, pp1955-1988.
52	!! This is based in turn on an algorithm
53	!! by Knuth (1969, seminumerical algorithms).
54	!!
55	!! History :
56	!! ! 07-07 (K. Mogensen) Original code heavily based on IFS.
57	!!----------------------------------------------------------------------
58	!! * Function return
59	REAL(wp) mpp_sum_indep
60	!! * Arguments
61	INTEGER, INTENT(IN) :: &
62	& kn
63	REAL(wp), DIMENSION(kn), INTENT(IN) :: &
64	& pval
65	!! * Local declarations
66	REAL(wp), DIMENSION(3) ::&
67	& zbuffl
68	REAL(wp), DIMENSION(:), ALLOCATABLE :: &
69	& zpsums, &
70	& zperrs, &
71	& zpcors, &
72	& zbuffg, &
73	& zp
74	REAL(wp) :: &
75	& zcorr, &
76	& zerr, &
77	& zolderr, &
78	& zbeta, &
79	& zres
80	INTEGER, DIMENSION(:), allocatable :: &
81	& irecv, &
82	& istart
83	INTEGER :: &
84	& ing
85	INTEGER :: &
86	& jj
87
88
89	! Get global number of elements
90	ing = kn
91	#ifdef key_mpp
92	CALL mpp_sum( ing )
93	#endif
94	! Check that the the algorithm can work
95
96	IF ( ( REAL( 2 * ing ) * EPSILON( zres ) ) >= 1.0 ) THEN
97
98	CALL ctl_stop('mpp_sum_indep:', &
99	& 'kn is too large to guarantee error bounds')
100
101	ENDIF
102
103	ALLOCATE( &
104	& zp(MAX(kn,1)), &
105	& zbuffg(jpnij*SIZE(zbuffl)), &
106	& zpsums(jpnij), &
107	& zperrs(jpnij), &
108	& zpcors(jpnij) &
109	& )
110
111	zolderr = HUGE(zerr)
112
113	! Copy the input array. This avoids some tricky indexing, at the
114	! expense of some inefficency.
115
116	IF ( kn > 0 ) THEN
117
118	zp(:) = pval(:)
119
120	ELSE
121
122	zp(1) = 0.0_wp
123
124	ENDIF
125
126	k_loop: DO
127
128	! Transform local arrays
129
130	IF ( kn > 0 ) THEN
131
132	CALL comp_sum ( zp, kn, zcorr, zerr )
133
134	ENDIF
135
136	! Gather partial sums and error bounds to all processors
137
138	zbuffl(1) = zp(MAX(kn,1))
139
140	IF ( kn > 0 ) THEN
141
142	zbuffl(2) = zerr
143	zbuffl(3) = zcorr
144
145	ELSE
146
147	zbuffl(2) = 0.0_wp
148	zbuffl(3) = 0.0_wp
149
150	ENDIF
151
152	IF ( jpnij > 1 ) THEN
153
154	ALLOCATE( &
155	& irecv(jpnij), &
156	& istart(jpnij) &
157	& )
158
159	CALL mpp_allgatherv( zbuffl, SIZE(zbuffl), &
160	& zbuffg, jpnij * SIZE(zbuffl), irecv, istart )
161
162	DEALLOCATE( &
163	& irecv, &
164	& istart &
165	& )
166
167	DO jj = 1, jpnij
168
169	zpsums(jj) = zbuffg(1+(jj-1)*SIZE(zbuffl))
170	zperrs(jj) = zbuffg(2+(jj-1)*SIZE(zbuffl))
171	zpcors(jj) = zbuffg(3+(jj-1)*SIZE(zbuffl))
172
173	END DO
174
175	ELSE
176
177	zpsums(1) = zbuffl(1)
178	zperrs(1) = zbuffl(2)
179	zpcors(1) = zbuffl(3)
180
181	ENDIF
182
183	! Transform partial sums
184
185	CALL comp_sum( zpsums, jpnij, zcorr, zerr )
186	zerr = zerr + SUM(zperrs)
187	zcorr = zcorr + SUM(zpcors)
188
189	! Calculate final result
190
191	zres = zpsums(jpnij) + zcorr
192
193	! Calculate error bound. This is corollary 4.7 from Ogita et al.
194	! (2005)
195
196	zbeta = zerr ( REAL( 2ing, wp ) * EPSILON(zres) ) &
197	& /(1.0_wp - REAL( 2ing, wp ) EPSILON(zres) )
198
199	zerr = EPSILON(zres) * ABS(zres) &
200	& +(zbeta + ( 2.0_wp * EPSILON(zres) * EPSILON(zres) * ABS(zres) &
201	& +3.0_wp * TINY(zres) ) )
202
203	! Update the last element of the local array
204
205	zp(MAX(kn,1)) = zpsums(nproc+1)
206
207	! Exit if the global error is small enough
208
209	IF ( zerr < 4.0_wp * SPACING(zres) ) EXIT k_loop
210
211	! Take appropriate action if ZRES cannot be sufficiently refined.
212
213	IF (zerr >= zolderr) THEN
214
215	CALL ctl_stop('Failed to refine sum', &
216	& 'Warning: Possiblity of non-reproducible results')
217
218	ENDIF
219
220	zolderr = zerr
221
222	ENDDO k_loop
223
224	! At this stage, we have guaranteed that ZRES less than 4*EPS
225	! away from the exact sum. There are only four floating point
226	! numbers in this range. So, if we find the nearest number that
227	! has its last three bits zero, then we have a reproducible result.
228
229	mpp_sum_indep = fround(zres)
230
231	DEALLOCATE( &
232	& zpcors, &
233	& zperrs, &
234	& zpsums, &
235	& zbuffg, &
236	& zp &
237	& )
238
239	END FUNCTION mpp_sum_indep
240
241	SUBROUTINE comp_sum( pval, kn, pcorr, perr )
242	!!----------------------------------------------------------------------
243	!! * ROUTINE comp_sum *
244	!!
245	!! ** Purpose : To perform compensated (i.e. accurate) summation.
246	!!
247	!! ** Method : These routines transform the elements of the array P,
248	!! such that:
249	!! 1) pval(kn) contains sum(pval)
250	!! 2) pval(1)...pval(kn-1) contain the rounding errors
251	!! that were made in calculating sum(pval).
252	!! 3) The exact sum of the elements of pval is unmodified.
253	!! On return, pcorr contains the sum of the rounding errors,
254	!! perr contains the sum of their absolute values.
255	!! After calling this routine, an accurate sum of the
256	!! elements of pval can be calculated as res=pval(n)+pcorr.
257	!!
258	!! ** Action :
259	!!
260	!! References : M. Fisher (ECMWF) IFS code + personal communications
261	!!
262	!! History :
263	!! ! 07-07 (K. Mogensen) Original code heavily based on IFS
264	!!----------------------------------------------------------------------
265	!! * Arguments
266	INTEGER, INTENT(IN) :: &
267	& kn ! Number of elements in input array
268	REAL(wp), DIMENSION(kn), INTENT(INOUT) :: &
269	& pval ! Input array to be sum on input
270	! pval(kn) = sum (pval) on output
271	! pval(1)...pval(kn-1) = rounding errors on output
272	REAL(wp) :: &
273	& pcorr, & ! Sum of rounding errors
274	& perr ! Sum of absolute rounding errors
275	!! * Local declarations
276	REAL(wp) :: &
277	& zx, &
278	& zz, &
279	& zpsum
280	integer :: &
281	& jj
282
283	pcorr = 0.0_wp
284	perr = 0.0_wp
285
286	zpsum = pval(1)
287
288	DO jj = 2, kn
289
290	! It is vital that these 4 lines are not optimized in any way that
291	! changes the results.
292
293	zx = pval(jj) + zpsum
294	zz = zx - pval(jj)
295	pval(jj-1) = ( pval(jj) - ( zx - zz ) ) + ( zpsum - zz )
296	zpsum = zx
297
298	! Accumulate the correction and the error
299
300	pcorr = pcorr + pval(jj-1)
301	perr = perr + ABS( pval(jj-1) )
302
303	END DO
304
305	pval(kn) = zpsum
306
307	END SUBROUTINE comp_sum
308
309	FUNCTION fround(pres)
310	!!----------------------------------------------------------------------
311	!! * ROUTINE fround *
312	!!
313	!! ** Purpose : Rounding of floating-point number
314	!!
315	!! ** Method : Returns the value of PRES rounded to the nearest
316	!! floating-point number that has its last three bits zero
317	!! This works on big-endian and little-endian machines.
318	!!
319	!! ** Action :
320	!!
321	!! References : M. Fisher (ECMWF) IFS code + personal communication
322	!!
323	!! History :
324	!! ! 07-07 (K. Mogensen) Original code heavily based on IFS.
325	!!----------------------------------------------------------------------
326	!! * Function return
327	REAL(wp) fround
328	!! * Arguments
329	REAL(wp), INTENT(IN) :: &
330	& pres ! Value to be rounded
331	!! * Local declarations
332	REAL(wp) :: &
333	& zz(2), &
334	& zup, &
335	& zdown
336	INTEGER :: &
337	& ii(2), &
338	& iequiv(8), &
339	& ints_per_real, &
340	& i_low_word
341	INTEGER :: &
342	& jj
343
344	ii(:) = 1
345	zz(:) = 1.0_wp
346
347	! Warning: If wp = 64 bits (or 32 bits for key_sp) this will not work.
348
349	#if defined key_sp
350	ints_per_real = 32 / BIT_SIZE(ii)
351	#else
352	ints_per_real = 64 / BIT_SIZE(ii)
353	#endif
354
355	! Test whether big-endian or little-endian
356
357	zup = -1.0_wp
358	iequiv(1:ints_per_real) = TRANSFER(zup,iequiv(1:ints_per_real))
359
360	IF ( iequiv(1) == 0 ) THEN
361	i_low_word = 1 ! Little-endian
362	ELSE
363	i_low_word = ints_per_real ! Big-endian
364	ENDIF
365
366	! Find the nearest number with all 3 lowest-order bits zeroed
367
368	iequiv(1:ints_per_real) = transfer(pres,iequiv(1:ints_per_real))
369	zup = pres
370	zdown = pres
371
372	IF (IBITS(iequiv(i_low_word),0,3)/=0) THEN
373
374	DO jj = 1, 4
375
376	zup = NEAREST( zup, 1.0_wp )
377	iequiv(1:ints_per_real) = TRANSFER( zup, iequiv(1:ints_per_real) )
378
379	IF ( IBITS( iequiv(i_low_word), 0, 3 ) == 0 ) EXIT
380
381	zdown = NEAREST( zdown, -1.0 )
382
383	iequiv(1:ints_per_real) = TRANSFER( zdown, iequiv(1:ints_per_real))
384
385	IF ( IBITS( iequiv(i_low_word),0,3) == 0 ) EXIT
386
387	END DO
388
389	IF ( IBITS( iequiv( i_low_word ), 0, 3) /= 0 ) THEN
390
391	CALL ctl_stop('Fround:','This is not possible')
392
393	ENDIF
394
395	ENDIF
396
397	fround = TRANSFER( iequiv(1:ints_per_real), pres )
398
399	END FUNCTION fround
400
401	END MODULE mppsum

Note: See TracBrowser for help on using the repository browser.

Download in other formats: