New URL for NEMO forge! http://forge.nemo-ocean.eu

Since March 2022 along with NEMO 4.2 release, the code development moved to a self-hosted GitLab.
This forge is now archived and remains online for historical reference.

trusting_func.sh in branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/inc – NEMO

Context Navigation

source: branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/inc/trusting_func.sh @ 8797

Last change on this file since 8797 was 8797, checked in by nicolasmartin, 6 years ago
Modifications to get it working on Curie for all trusting tests
Property eol-style set to `native`; property svn:executable set to `*`; property svn:keywords set to `Author Date Id Rev URL`
File size: 11.9 KB

Line
#!/bin/bash


## Messenger filenames
## One text file per column of the trusting digest: each one starts with a
## header line (see init_files) and all 'mesg_*.txt' files are pasted side by
## side by log_make, so the numeric prefix decides the column order.
file_date=mesg_01_date.txt
file_rslt=mesg_02_result.txt
file_stat=mesg_03_status.txt
file_nemo=mesg_04_nemo.txt
file_xios=mesg_05_xios.txt
file_cmpf=mesg_06_compiler.txt
file_lmpi=mesg_07_mpi.txt
file_ncdf=mesg_08_netcdf.txt
file_inpt=mesg_09_inputs.txt
file_time=mesg_10_time.txt
file_memy=mesg_11_memory.txt
file_note=mesg_12_comments.txt
11
12
13	## Functions in order of use
## Print a step banner: an empty line, 'Step.....', the title and a dashed
## underline of the same length as the title.
## Arguments: $1 - step title
## Outputs:   banner on stdout
print_step() {
  local title=$1
  local outline

  ## Blank string as wide as the title (in characters), turned into dashes below
  printf -v outline '%*s' "${#title}" ''

  printf "\nStep.....\n%s\n%s\n" "${title}" "${outline// /-}"
}
20
## Create (or truncate) the twelve messenger files with their header line and
## preset line 2 of the result and status files.
## Globals:   file_* (read), TRUS_RSLT (read)
## Outputs:   writes the messenger files in the current directory
init_files() {
  ## Column headers
  printf '%s\n' 'Date'               > "${file_date}"
  printf '%s\n' 'Result'             > "${file_rslt}"
  printf '%s\n' 'Status'             > "${file_stat}"
  printf '%s\n' 'NEMOGCM rev.'       > "${file_nemo}"
  printf '%s\n' 'XIOS rev.'          > "${file_xios}"
  printf '%s\n' 'Fortran compiler'   > "${file_cmpf}"
  printf '%s\n' 'MPI libs'           > "${file_lmpi}"
  printf '%s\n' 'NetCDF libs'        > "${file_ncdf}"
  printf '%s\n' 'Input files'        > "${file_inpt}"
  printf '%s\n' 'Elapsed time'       > "${file_time}"
  printf '%s\n' 'Memory usage (P/V)' > "${file_memy}"
  printf '%s\n' 'Comments'           > "${file_note}"

  ## Current ${TRUS_RSLT} with 'Unknown error' status by default;
  ## get_out rewrites line 2 of both files at the end of the test
  printf '%s\n' "${TRUS_RSLT}"  >> "${file_rslt}"
  printf '%s\n' 'Unknown error' >> "${file_stat}"
}
35
## Append the test date, converted to UTC, to the date messenger file.
## Globals:   TRUS_DATE (read, any string understood by `date -d`),
##            file_date (read)
## Outputs:   one line 'YYYY-MM-DD HH:MM ZONE' appended to ${file_date};
##            a warning on stderr (and an empty line) when the date is invalid
get_date() {
  local dat

  ## UTC time zone for timestamping
  if ! dat=$( date -ud "${TRUS_DATE}" +"%F %R %Z" ); then
    echo "get_date: cannot interpret date '${TRUS_DATE}'" >&2
  fi

  echo "${dat}" >> "${file_date}"
}
43
## Record the svn revisions of XIOS and of the NEMO working copy.
## Every directory of ${TRUS_CKOT} (relative to ${TRUS_NGCM}) is first passed
## to the ${TRUS_SVNA} command, then its 'Last Changed Rev' is read; the
## highest one is reported as the NEMOGCM revision. The XIOS directory is only
## queried.
## Globals:   TRUS_CKOT, TRUS_XIOS, TRUS_NGCM, TRUS_SVNA, file_xios, file_nemo (read)
## Outputs:   'XIOS <rev>' and 'NEMOGCM <rev>' appended to model.log,
##            HTML changeset links appended to ${file_xios} and ${file_nemo}
get_nemo_rev() {
  local dir rev_loc
  local rev=0

  ## Loop on essential NEMO directories (both lists are word-split on purpose)
  for dir in ${TRUS_CKOT} ${TRUS_XIOS}; do

    ## For time being, just get revision from XIOS with no action on directory
    if [[ "${dir}" == "${TRUS_XIOS}" ]]; then
      rev_loc=$( svn info "${dir}" | awk '/Last Changed Rev/ {print $NF}' )
      echo "XIOS ${rev_loc}" \
        >> model.log
      echo "<a href=\"https://forge.ipsl.jussieu.fr/ioserver/changeset/${rev_loc}\" target=\"_blank\">${rev_loc}</a>" \
        >> "${file_xios}"
      continue
    fi

    ## ${TRUS_SVNA} holds a command with its options: left unquoted on purpose
    echo "${dir}" && ${TRUS_SVNA} "${TRUS_NGCM}/${dir}"
    rev_loc=$( svn info "${TRUS_NGCM}/${dir}" | awk '/Last Changed Rev/ {print $NF}' )

    ## Keep last rev. nb; ignore a directory for which svn gave no number
    if [[ "${rev_loc}" =~ ^[0-9]+$ ]] && [ "${rev_loc}" -gt "${rev}" ]; then
      rev=${rev_loc}
    fi
  done

  echo "NEMOGCM ${rev}" \
    >> model.log
  echo "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/${rev}\" target=\"_blank\">${rev}</a>" \
    >> "${file_nemo}"
}
73
## Load the computing environment and log the release of the compiler and of
## the MPI / NetCDF / CDO software.
## Globals:   TRUS_ENVI (read, basename of the environment file),
##            TRUS_HPCC (read), WRAPPER_LDFLAGS (written on X64_ADA),
##            TRUS_CMPV, TRUS_MPIR, TRUS_CDFR, TRUS_CDOR (read; sed regex
##            fragments looked for in $PATH, TRUS_CMPV is also run with
##            --version), file_cmpf, file_lmpi, file_ncdf (read)
## Outputs:   one '<name> <release>' line per software appended to model.log;
##            lines 3, 4 and 5 of model.log appended to the compiler, MPI and
##            NetCDF messenger files
get_soft_rel() {
  local soft_rel str

  ## Sourcing environment: exactly one file is sourced
  if [[ -n "${TRUS_ENVI}" ]]; then
    if [[ -e "${TRUS_ENVI}.env" && -n "$( declare -F | grep ' module' )" ]]; then
      ## .env file if module function is available
      . "${TRUS_ENVI}.env"
    elif [[ -e "${TRUS_ENVI}.path" ]]; then
      ## .path file if existing
      . "${TRUS_ENVI}.path"
    else
      ## if not the given file
      . "${TRUS_ENVI}"
    fi
  fi

  ## Problem with `prepend-path` of modulefile that use ':' instead of ' ' as delimiter
  if [[ "${TRUS_HPCC}" == 'X64_ADA' ]]; then
    WRAPPER_LDFLAGS='-L/smplocal/pub/IdrMemMPI/1.4/lib -lidrmem '${WRAPPER_LDFLAGS}
  fi

  ## Lists are word-split on purpose, empty variables simply vanish
  for str in ${TRUS_CMPV} ${TRUS_MPIR} ${TRUS_CDFR} ${TRUS_CDOR}; do

    ## Software release: next word after "$str" in $PATH (case-insensitive);
    ## stays empty when "$str" is not found in $PATH
    soft_rel=$( printf '%s\n' "${PATH}" | sed -n "s#.*${str}\([0-9.a-z_]*\).*#\1#ip" )

    ## option --version would work for main compilers (gfortran, intel, pgfortran, ...)
    if [[ "${str}" == "${TRUS_CMPV}" ]]; then
      soft_rel=$( ${str} --version | grep -m1 -oe '\<[0-9. ]*\>' )
    fi

    ## Cleaning characters string to display proper soft name
    str=$( printf '%s\n' "${str}" | sed 's#\\##g; s#[/-]$##' )

    ## Unquoted on purpose: flattens the release to a single blank-separated line
    echo ${str} ${soft_rel} \
      >> model.log
  done

  sed -n 3p model.log \
    >> "${file_cmpf}"
  sed -n 4p model.log \
    >> "${file_lmpi}"
  sed -n 5p model.log \
    >> "${file_ncdf}"
}
115
## Bring the input files into the current directory.
## With archives (${TRUS_TARF} not empty) their content is listed into
## inputs_list.txt, then extracted; otherwise the files found in ${TRUS_FORC}
## are listed on stdout and copied. Any '*.gz' file is finally decompressed.
## Globals:   TRUS_TARF (read, blank-separated archive names),
##            TRUS_FORC (read, directory holding archives or personal inputs)
## Outputs:   inputs_list.txt (archive mode), input files in the current directory
get_inputs() {
  local archive
  local -a archives=()

  if [[ -n "${TRUS_TARF}" ]]; then
    ## NOTE(review): eval is kept so that ${TRUS_TARF} is expanded with full
    ## shell syntax as before; it must only hold trusted configuration values
    eval "archives=( ${TRUS_TARF} )"

    # List archive content & extract it by default
    for archive in "${archives[@]}"; do
      tar -tvf "${TRUS_FORC}/${archive}" >> inputs_list.txt
    done
    for archive in "${archives[@]}"; do
      tar -vxf "${TRUS_FORC}/${archive}" > /dev/null
    done
  else
    ## List & copy files in case of personal inputs
    ls "${TRUS_FORC}"/*
    ## 'command' bypasses any cp alias or function
    command cp "${TRUS_FORC}"/* .
  fi

  if [[ -n "$( find . -name '*.gz' -print -quit )" ]]; then
    find . -name '*.gz' -exec gzip -d {} \;
  fi
}
141
## Compare the input files of the run with the ones kept in the benchmark
## directory: archive listing, namelists, XML files and cpp keys file.
## Globals:   TRUS_STOR (read, benchmark directory), file_inpt, file_note (read)
## Outputs:   diff messages or 'Same' on stdout; 'Same' or 'Different'
##            appended to ${file_inpt}; the list of differing files appended
##            to temp_${file_note}
diff_inputs() {
  local dif file
  local files_list='' mesg='Same'

  ## Simple diff (a pattern without match stays literal and is skipped below)
  for file in 'inputs_list.txt' namelist_* *.xml cpp_*; do
    dif=''

    ## Pass over useless file omission in benchmark directory
    if [[ -e "${TRUS_STOR}/${file}" ]]; then
      dif=$( diff -q "${file}" "${TRUS_STOR}/${file}" )
    fi

    if [[ -n "${dif}" ]]; then
      mesg='Different'
      echo "${dif}"
      files_list+="${file} "
    fi
  done

  [[ "${mesg}" == 'Same' ]] && echo "${mesg}"
  echo "${mesg}" \
    >> "${file_inpt}"

  ## List different files for web comment
  if [[ -n "${files_list}" ]]; then
    echo "Inputs : ${files_list}differ<br>" \
      >> "temp_${file_note}"
  fi
}
165
## Wait for the end of the submitted job, polling every 30 seconds, and give
## up once the allowed time is spent.
## Globals:   TRUS_JSTA (read, command printing something while the job is
##            still listed), TRUS_JINF (read, optional command printing job
##            information), TRUS_JKIL (read, command killing the job),
##            TRUS_TOUT (read, time limit in seconds)
## Outputs:   job information appended to computation.log while pending
## Returns:   through get_out 6 (which exits) when the time limit is reached
job_pending() {
  local outline time_elapsed=0 time_increment=30

  ## Separator line made of 100 '#'
  printf -v outline '%100s' ''
  outline=${outline// /#}

  sleep "${time_increment}"

  ## Append a log file while pending
  ## (site commands are evaluated unquoted, exactly as they always were)
  while [[ -n "$( eval ${TRUS_JSTA} )" && ${time_elapsed} -lt ${TRUS_TOUT} ]]; do
    printf "\n%s\n" "${outline}" \
      >> computation.log
    if [[ -n "${TRUS_JINF}" ]]; then
      eval ${TRUS_JINF} \
        >> computation.log
    fi
    sleep "${time_increment}"
    time_elapsed=$(( time_elapsed + time_increment ))
  done

  sleep "${time_increment}"

  ## Kill remaining job & stop the test if it's too long.
  ## '-ge' and not '-eq': the counter steps over a limit that is not a
  ## multiple of the polling period
  if [[ ${time_elapsed} -ge ${TRUS_TOUT} ]]; then
    eval ${TRUS_JKIL} &> /dev/null
    get_out 6
  fi
}
186
## Compare ocean.output and the '*.stat' files of the run with the benchmark.
## Globals:   TRUS_STOR (read, benchmark directory), file_note (read),
##            TRUS_RSLT (set to 'FAILED' on any difference or missing benchmark)
## Outputs:   diff messages or 'Same' on stdout; the list of differing files
##            appended to temp_${file_note}
## Returns:   through get_out 7 (which exits) when a benchmark file is missing
diff_results() {
  local file
  local files_list='' mesg='Same'

  ## Simple diff
  for file in 'ocean.output' *.stat; do
    ## Without any stat file in the run the pattern stays literal: nothing to compare
    [[ "${file}" == '*.stat' && ! -e "${file}" ]] && continue

    ## Stop if no benchmark files (ocean.output, eventual stat files)
    if [[ ! -e "${TRUS_STOR}/${file}" ]]; then
      TRUS_RSLT='FAILED'
      get_out 7
    fi

    ## Continue even if it differs
    if ! diff -q "${file}" "${TRUS_STOR}/${file}"; then
      TRUS_RSLT='FAILED'
      mesg='Different'
      files_list+="${file} "
    fi
  done

  [[ "${mesg}" == 'Same' ]] && echo "${mesg}"

  ## List different files for web comment
  if [[ -n "${files_list}" ]]; then
    echo "Results : ${files_list}differ<br>" \
      >> "temp_${file_note}"
  fi
}
208
## Rebuild the decomposed NetCDF files of the run and compare each rebuilt
## file with its benchmark counterpart.
## Globals:   TRUS_NGCM (read, root holding TOOLS/REBUILD_NEMO/rebuild_nemo),
##            TRUS_NPRO (read, value given to 'rebuild_nemo -t'),
##            TRUS_CDOD (read, comparison command with its options),
##            TRUS_STOR (read, benchmark directory), file_note (read),
##            TRUS_RSLT (set to 'FAILED', and exported, on any difference)
## Outputs:   '<file>.nc: <difference>' lines or 'Same' on stdout; summary
##            appended to temp_${file_note}
## Returns:   through get_out 8 when no piece is found for a file, through
##            get_out 7 when no file at all has a benchmark counterpart
## NOTE(review): a '0 of N records' summary is taken as "no difference";
## confirm against the output of the ${TRUS_CDOD} command actually configured.
diff_restart() {
  local dif filebase filebases ndomain nrec out
  local files_list='' dif_sum='undef'

  ## Without any restart file the run cannot be validated
  if [[ -z "$( find . -regex '.*_restart.*[0-9]\.nc' -print -quit )" ]]; then
    TRUS_RSLT='FAILED'
    return
  fi

  ## Find all restart files to rebuild: names without '_<4 digits>.nc'
  filebases=$( find . -regextype sed -regex '.*_[0-9]\{4\}\.nc' \
    | sed 's/\(.*\)_.*/\1/' | sort -u )

  for filebase in ${filebases}; do
    filebase=${filebase#./}

    ndomain=$( find . -regex ".*${filebase}_[0-9]*.nc" | wc -l | awk '{print $1}' )

    if [[ "${ndomain}" -eq 0 ]]; then
      TRUS_RSLT='FAILED'
      get_out 8
    fi

    ## Possibility of remaining decomposed restarts (even after rebuild)
    if "${TRUS_NGCM}/TOOLS/REBUILD_NEMO/rebuild_nemo" \
         -t "${TRUS_NPRO}" "${filebase}" "${ndomain}" \
         > /dev/null; then
      rm -f "${filebase}"_[0-9]*.nc
    fi

    ## Nothing to compare without benchmark file
    [[ -e "${TRUS_STOR}/${filebase}.nc" ]] || continue

    ## From here at least one file has been compared
    [[ "${dif_sum}" == 'undef' ]] && dif_sum=0

    ## ${TRUS_CDOD} holds a command with its options: left unquoted on purpose
    out=$( ${TRUS_CDOD} "${filebase}.nc" "${TRUS_STOR}/${filebase}.nc" 2>&1 )

    nrec=0
    if [[ "${out}" == *Abort* ]]; then
      ## Fix for cdo aborting on restarts with different inputs:
      ## report the end of its message instead of a record count
      dif=$( echo ${out} | awk -F: '{print $NF}' )
    else
      dif=$( echo ${out} | grep -o '[0-9]* of [0-9]* records' | head -n 1 )
      nrec=${dif%% *}
      [[ "${nrec}" =~ ^[0-9]+$ ]] || nrec=0
      ## No differing record means identical files
      [[ $(( 10#${nrec} )) -eq 0 ]] && dif=''
    fi

    if [[ -n "${dif}" ]]; then
      export TRUS_RSLT='FAILED'
      files_list+="${filebase} "
      echo "${filebase}.nc: ${dif}"
      dif_sum=$(( dif_sum + 10#${nrec} ))
    fi
  done

  if [[ "${dif_sum}" == 'undef' ]]; then
    ## Stop if no benchmark files (ie time.step)
    TRUS_RSLT='FAILED'
    get_out 7
  elif [[ -n "${files_list}" ]]; then
    ## List different files for web comment with sum of different records
    echo "Restarts: ${files_list}${dif_sum} record(s) differ<br>" \
      >> "temp_${file_note}"
  else
    echo 'Same'
  fi
}
270
## Report the elapsed time of the job.
## Globals:   TRUS_JTIM (read, command printing the elapsed time; nothing is
##            done when empty), file_time (read)
## Outputs:   'Elapsed time: <value>' on stdout, <value> appended to ${file_time}
get_time() {
  [[ -z "${TRUS_JTIM}" ]] && return

  ## Interest for checking unusual time computation
  ## (site command evaluated unquoted, exactly as it always was)
  local time_cpu
  time_cpu=$( eval ${TRUS_JTIM} )

  printf "Elapsed time: "
  ## Unquoted on purpose: keeps the value on a single line for the digest
  echo ${time_cpu} | tee -a "${file_time}"
}
280
## Report the maximum physical and virtual memory used by the job.
## Globals:   TRUS_JPME, TRUS_JVME (read, commands printing the physical and
##            virtual memory; nothing is done when both are empty),
##            file_memy (read)
## Outputs:   'Memory max usage (physical/virtual): <P> / <V>' on stdout,
##            '<P> / <V>' appended to ${file_memy}
get_memy() {
  [[ -z "${TRUS_JPME}" && -z "${TRUS_JVME}" ]] && return

  ## Interest for checking unusual memory usage
  ## (site commands evaluated unquoted, exactly as they always were)
  local memory_pmax memory_vmax
  memory_pmax=$( eval ${TRUS_JPME} )
  memory_vmax=$( eval ${TRUS_JVME} )

  printf "Memory max usage (physical/virtual): "
  ## Values unquoted on purpose: keeps them on a single line for the digest
  echo ${memory_pmax}' / '${memory_vmax} | tee -a "${file_memy}"
}
290
## Pick the first message of the given kind in ocean.output and keep it as a
## comment for the web page.
## Arguments: $1 - 'E R R O R' (matched anywhere in a line, 4 following lines
##                 kept) or any other keyword such as 'W A R N I N G' (matched
##                 after one leading blank, 2 following lines kept)
## Globals:   file_note (read)
## Outputs:   the message on stdout (blanks squeezed) and, followed by '<br>',
##            appended as is to temp_${file_note}; nothing when ocean.output
##            is missing or holds no such message
comments() {
  local state=$1 line=''
  local -a grep_opts words

  if [[ -e ocean.output ]]; then
    ## 'W A R N I N G' pattern by default
    grep_opts=( -A2 -e "^ ${state}" )
    [[ "${state}" == 'E R R O R' ]] && grep_opts=( -A4 -e "${state}" )

    ## Select first occurence for web comment, joined on a single line
    line=$( grep -m1 "${grep_opts[@]}" ocean.output | tr -d '\n' )
  fi

  if [[ -n "${line}" ]]; then
    ## Squeeze blanks for the terminal without exposing the text to globbing
    read -r -a words <<< "${line}"
    printf '%s\n' "${words[*]}"

    ## The message is data, never a printf format
    printf '%s<br>' "${line}" \
      >> "temp_${file_note}"
  fi
}
307
## Build the trusting digest from the messenger files.
## Globals:   file_note (read), TRUS_FILE (read, digest file to write)
## Outputs:   gathered comments appended to ${file_note}; all 'mesg_*.txt'
##            files pasted side by side with ';' on stdout and in ${TRUS_FILE}
log_make() {
  ## Format comments for web: one single line without the trailing '<br>'
  if [[ -e "temp_${file_note}" ]]; then
    tr -d '\n' < "temp_${file_note}" | sed 's/<br>$//' \
      >> "${file_note}"
  fi

  ## Construct txt file with all messenger files
  paste -d ';' mesg_*.txt | tee "${TRUS_FILE}"
}
316
## In production mode, append the digest to the trusting history and send a
## notification mail when the test failed.
## Globals:   TRUS_PROD (read, production mode when equal to 1), TRUS_FILE,
##            TRUS_HIST (read), TRUS_MAIL (read, recipients; no mail when
##            empty), TRUS_RSLT, TRUS_RORR, TRUS_BRAN, TRUS_REFE, TRUS_CONF,
##            TRUS_USER, TRUS_HPCC, TRUS_SCRA, TRUS_STOR, TRUS_ARCH (read)
## Outputs:   ${TRUS_HIST} created or extended, trusting.mail written and sent
prod_publish() {
  local rev

  ## Production mode (-p|--prod); an unset or empty TRUS_PROD counts as off
  [[ "${TRUS_PROD}" -eq 1 ]] || return 0

  rev=$( awk '/NEMOGCM/ {print $NF}' model.log )

  ## Create or append trusting logfile: a new history gets the whole digest
  ## (header included), an existing one only its last line
  if [[ -f "${TRUS_HIST}" ]]; then
    tail -n 1 "${TRUS_FILE}" \
      >> "${TRUS_HIST}"
  else
    cat "${TRUS_FILE}" \
      >> "${TRUS_HIST}"
  fi

  ## Send mail only when FAILED
  if [[ -n "${TRUS_MAIL}" && "${TRUS_RSLT}" == 'FAILED' ]]; then

    ## Content
    cat <<END_MAIL \
      > trusting.mail
Dear all,


The trusting sequence has not completed successfully on new configuration ${TRUS_CONF} based on ${TRUS_REFE}.

Here is the model summary:
$(cat model.log)

First checking would be on the trusting environment files:
${TRUS_USER}.cfg & ${TRUS_HPCC}.cfg

For more details, look into the testing folder at:
${TRUS_SCRA}

An archive has been created to share the questionable configuration for further studies:
${TRUS_STOR}/${TRUS_ARCH}

END_MAIL

    ## Send with detailed subject (recipients list is word-split on purpose)
    mail -s "[NEMO Trusting][$rev][${TRUS_BRAN}][${TRUS_REFE}] ${TRUS_RSLT} ${TRUS_RORR}" ${TRUS_MAIL} \
      < trusting.mail
  fi
}
362
## End the test: set the final status, fill the last messenger files, archive
## a failed configuration in production mode, print the digest, then exit.
## Arguments: $1 - error code, only looked at when ${TRUS_RSLT} is 'FAILED':
##                 1 XIOS compilation, 2 NEMO compilation, 3 missing inputs,
##                 4 job submission, 5 crash, 6 time limit, 7 missing previous
##                 outputs, 8 new outputs differ, anything else unknown error
## Globals:   TRUS_RSLT, TRUS_SCRA, TRUS_TOUT, TRUS_PROD, TRUS_STOR, TRUS_ARCH,
##            TRUS_NGCM, TRUS_CONF, file_rslt, file_stat (read),
##            TRUS_RORR (written, final status message)
## Outputs:   line 2 of ${file_rslt} and ${file_stat} rewritten, archive under
##            ${TRUS_STOR} when failed in production mode, digest on stdout
## Returns:   never, the shell always exits with status 0
get_out() {
  local time_step=0 rslt_esc rorr_esc

  TRUS_RORR=$1

  printf "\n\nEnd of test\n"

  ## In case of compilation error
  cd "${TRUS_SCRA}" || echo "get_out: cannot change directory to '${TRUS_SCRA}'" >&2

  if [[ "${TRUS_RSLT}" == 'FAILED' ]]; then
    echo 'Failure'

    ## Error identification
    case "${TRUS_RORR}" in
      ## Compilation
      1) TRUS_RORR='XIOS compilation failed'  ;;
      2) TRUS_RORR='NEMO compilation failed'  ;;
      ## Submission
      3) TRUS_RORR='Missing input files'      ;;
      4) TRUS_RORR='Job submission error'     ;;
      ## Computation
      5) TRUS_RORR='Crashed at time step'     ;;
      6) TRUS_RORR='Exceeded time limit'      ;;
      ## Results
      7) TRUS_RORR='Missing previous outputs' ;;
      8) TRUS_RORR='New outputs differ'       ;;
      ## Other (unquoted pattern so that it really catches everything else)
      *) TRUS_RORR='Unknown error'            ;;
    esac

  else
    echo 'Success' && TRUS_RORR='Code is reliable'
  fi

  ## Eventual comments from ocean.output
  if [[ "${TRUS_RORR}" == 'Crashed at time step' ]]; then
    comments 'E R R O R'
    [[ -e time.step ]] && time_step=$( grep -o '[0-9]*' time.step | head -n 1 )
    TRUS_RORR+=" ${time_step}"
  else
    comments 'W A R N I N G'
    [[ "${TRUS_RORR}" == 'Exceeded time limit' ]] && TRUS_RORR+=" $(( TRUS_TOUT / 3600 ))h"
  fi

  ## Last messenger files: replace the whole default line 2.
  ## '\', '/' and '&' are escaped so that they stay literal in the sed replacement
  rslt_esc=$( printf '%s' "${TRUS_RSLT}" | sed 's#[\\/&]#\\&#g' )
  rorr_esc=$( printf '%s' "${TRUS_RORR}" | sed 's#[\\/&]#\\&#g' )
  sed -i "2 s/.*/${rslt_esc}/" "${file_rslt}"
  sed -i "2 s/.*/${rorr_esc}/" "${file_stat}"

  ## Save tested configuration if trusting failed in production mode (-p|--prod)
  if [[ "${TRUS_RSLT}" == 'FAILED' && "${TRUS_PROD}" -eq 1 ]]; then
    echo "Creating archive ${TRUS_ARCH} under ${TRUS_STOR}"
    tar -czf "${TRUS_STOR}/${TRUS_ARCH}" * \
      -C "${TRUS_NGCM}/CONFIG/${TRUS_CONF}/MY_SRC" . \
      -C "${TRUS_NGCM}/CONFIG/${TRUS_CONF}" "cpp_${TRUS_CONF}.fcm"
  fi

  ## Logfile construct & eventual sending of notification email
  printf "\nTrusting digest:\n----------------\n"
  log_make
  prod_publish

  exit 0
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: