New URL for NEMO forge! http://forge.nemo-ocean.eu

In March 2022, along with the NEMO 4.2 release, code development moved to a self-hosted GitLab.
This forge is now archived and remains online for historical reference.

trusting_func.sh in branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST – NEMO

Context Navigation

source: branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/trusting_func.sh @ 5788

Last change on this file since 5788 was 5788, checked in by nicolasmartin, 9 years ago
dev_r5092_CNRS18_TRUST Several enhancements (global variables rename, notification mail object, add job performances, templates description, script to install new minimal branch to test) to improve accessibility for NEMO users & bugfixes (tar creation, modulefiles)
Property eol-style set to `native`; property svn:executable set to `*`; property svn:keywords set to `Author Date Id Rev URL`
File size: 12.0 KB

Line
#!/bin/bash


## Messenger filenames: one text file per reported item, all suffixed with
## the configuration pattern name taken from $PATTERNAME
FILE_DATE="mesg_01_date_${PATTERNAME}.txt"
FILE_TRUS_RSLT="mesg_02_result_${PATTERNAME}.txt"
FILE_STAT="mesg_03_state_${PATTERNAME}.txt"
FILE_NEMO="mesg_04_nemo_${PATTERNAME}.txt"
FILE_XIOS="mesg_05_xios_${PATTERNAME}.txt"
FILE_CMPF="mesg_06_compiler_${PATTERNAME}.txt"
FILE_LMPI="mesg_07_mpi_${PATTERNAME}.txt"
FILE_NCDF="mesg_08_netcdf_${PATTERNAME}.txt"
FILE_INPT="mesg_09_inputs_${PATTERNAME}.txt"
FILE_TIME="mesg_10_time_${PATTERNAME}.txt"
FILE_MEMY="mesg_11_memory_${PATTERNAME}.txt"
FILE_NOTE="mesg_12_comments_${PATTERNAME}.txt"

## Timestamped logfile & archive filenames (timestamp taken from $DATE)
FILE_TRUS="trusting_${DATE}_${PATTERNAME}.txt"
FILE_ARCH="trusting_${DATE}_${PATTERNAME}.tgz"
14
15
16	## Functions in order of use
## Print a step banner: a blank line, "Step.....", the title, then a dashed
## underline of the same width as the title.
## Arguments: $1 - step title
## Outputs:   banner on stdout
print_step() {
  local title=$1 outline

  ## Pad to the title length with parameter expansion; `echo | wc -c` also
  ## counted the trailing newline and made the underline one dash too long
  printf -v outline '%*s' "${#title}" ''

  printf "\nStep.....\n%s\n%s\n" "$title" "${outline// /-}"
}
23
## (Re)create the twelve messenger files with their column title as first
## line, then preset line 2 of the result and state files.
## Globals: FILE_DATE FILE_TRUS_RSLT FILE_STAT FILE_NEMO FILE_XIOS FILE_CMPF
##          FILE_LMPI FILE_NCDF FILE_INPT FILE_TIME FILE_MEMY FILE_NOTE
##          (paths, truncated and written), TRUS_RSLT (read)
init_files() {
  ## Paths are quoted so that a pattern name holding blanks cannot turn the
  ## redirection into an "ambiguous redirect"
  echo 'Date'               > "${FILE_DATE}"
  echo 'Result'             > "${FILE_TRUS_RSLT}"
  echo 'State'              > "${FILE_STAT}"
  echo 'NEMOGCM rev.'       > "${FILE_NEMO}"
  echo 'XIOS rev.'          > "${FILE_XIOS}"
  echo 'Fortran compiler'   > "${FILE_CMPF}"
  echo 'MPI libs'           > "${FILE_LMPI}"
  echo 'NetCDF libs'        > "${FILE_NCDF}"
  echo 'Input files'        > "${FILE_INPT}"
  echo 'Elapsed time'       > "${FILE_TIME}"
  echo 'Memory usage (P/V)' > "${FILE_MEMY}"
  echo 'Comments'           > "${FILE_NOTE}"

  ## Default second line: current result status and 'Unknown error' as state
  printf '%s\n' "${TRUS_RSLT}" >> "${FILE_TRUS_RSLT}"
  echo 'Unknown error' >> "${FILE_STAT}"
}
38
## Append the test timestamp, converted to UTC, to the date messenger file.
## Globals: DATE (read, any string accepted by `date -d`), FILE_DATE (appended)
## Note:    `date -d` is the GNU option already used by the original code.
get_date() {
  local dat

  ## UTC time zone for timestamping
  ## Declared apart from the assignment so `local` does not mask date's status
  dat=$( date -ud "${DATE}" +"%F %R %Z" )

  ## A line is appended even when the conversion failed (empty value)
  printf '%s\n' "$dat" >> "${FILE_DATE}"
}
46
## Run $SVN_CMD on the essential NEMO directories of the working copy, keep the
## highest revision number found, and record NEMOGCM and XIOS revisions.
## Globals: DIR_XIOS, TRUS_WKCY, SVN_CMD (read), FILE_XIOS, FILE_NEMO (appended)
## Outputs: directory names on stdout; 'XIOS <rev>' and 'NEMOGCM <rev>' lines
##          appended to model.log; changeset links appended to messenger files
get_nemo_rev() {
  local dir rev_loc
  local rev=0
  ## Real array instead of a single string relying on word splitting
  local -a list=( ARCH CONFIG NEMO EXTERNAL/AGRIF EXTERNAL/IOIPSL EXTERNAL/fcm
                  TOOLS/COMPILE TOOLS/REBUILD_NEMO )

  ## Loop on essential NEMO directories, then on XIOS directory when it is set
  for dir in "${list[@]}" ${DIR_XIOS:+"${DIR_XIOS}"}; do

    ## For time being, just get revision from XIOS with no action on directory
    if [[ "$dir" == "${DIR_XIOS}" ]]; then
      ## NOTE(review): the revision is read from line 9 of `svn info`, whose
      ## layout depends on the svn client version and locale; confirm
      rev_loc=$( svn info "$dir" | awk '(NR == 9) {print $NF}' )
      echo "XIOS ${rev_loc}" \
        >> model.log
      echo "<a href=\"https://forge.ipsl.jussieu.fr/ioserver/changeset/${rev_loc}\" target=\"_blank\">${rev_loc}</a>" \
        >> "${FILE_XIOS}"
      continue
    fi

    ## SVN_CMD is a command line (program + options): split on purpose
    echo "$dir" && ${SVN_CMD} "${TRUS_WKCY}/$dir"
    rev_loc=$( svn info "${TRUS_WKCY}/$dir" | awk '(NR == 9) {print $NF}' )

    ## Keep last rev. nb; skip values that are not plain integers, which
    ## made the former unguarded `-gt` test print an error
    if [[ "${rev_loc}" =~ ^[0-9]+$ ]] && [ "${rev_loc}" -gt "$rev" ]; then
      rev=${rev_loc}
    fi
  done

  echo "NEMOGCM $rev" \
    >> model.log
  echo "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/$rev\" target=\"_blank\">$rev</a>" \
    >> "${FILE_NEMO}"
}
76
## Record name and release of the compiler and main libraries in model.log,
## then copy lines 4-6 of model.log to the compiler/MPI/NetCDF messenger files.
## Globals: ARCH_ENV, CMPF, COMPILER, STR_CDOD, STR_LMPI, STR_NCDF, PATH (read)
##          FILE_CMPF, FILE_LMPI, FILE_NCDF (appended)
## NOTE(review): which software ends up on lines 4-6 depends on how many lines
## model.log already holds when this function is called; confirm with caller
get_soft_rel() {
  local soft soft_rel
  local -a words

  ## Sourcing environment modulefile only if module function is set
  if [[ -e "${ARCH_ENV}" ]] && declare -F | grep -q ' module'; then
    . "${ARCH_ENV}"
  fi

  ## Unquoted on purpose: an empty variable must not add an iteration
  for soft in $CMPF ${STR_CDOD} ${STR_LMPI} ${STR_NCDF}; do
    soft_rel=''

    ## Software release: next word after "$soft" in $PATH (case-insensitive)
    ## $soft is used as a sed pattern; -n/p leaves the release empty when
    ## nothing matches, instead of returning the whole $PATH
    soft_rel=$( printf '%s\n' "$PATH" | sed -n "s#.*${soft}\([0-9.a-z_]*\).*#\1#ip" )

    ## option --version would work for main compilers (gfortran, intel, pgfortran, ...)
    [[ "$soft" == "$COMPILER" ]] \
      && soft_rel=$( "$soft" --version | grep -m1 -oe '\<[0-9. ]*\>' )

    ## Cleaning characters string to display proper soft name
    soft=$( printf '%s\n' "$soft" | sed 's#\\##g; s#[/-]$##' )

    ## Collapse blanks/newlines so each software takes exactly one line,
    ## without exposing the text to pathname expansion
    words=()
    read -r -d '' -a words <<< "$soft ${soft_rel}"
    printf '%s\n' "${words[*]}" \
      >> model.log
  done

  sed -n 4p model.log \
    >> "${FILE_CMPF}"
  sed -n 5p model.log \
    >> "${FILE_LMPI}"
  sed -n 6p model.log \
    >> "${FILE_NCDF}"
}
106
## Bring the input files into the current directory and list them in
## inputs_list.txt.
## Globals: TRUS_FORC (read, inputs directory),
##          TRUS_TARF (read, archive name inside TRUS_FORC; empty when the
##          inputs are plain files)
## Returns: status of the extraction/copy command
get_inputs() {
  if [[ -z "${TRUS_TARF}" ]]; then
    ## List & copy files in case of personal inputs
    ## The former command string started with a literal `\cp`, which is not
    ## a command once stored in a variable, and lacked the recursive option
    ## needed to copy from a directory
    ls "${TRUS_FORC}/" > inputs_list.txt
    command cp -R -- "${TRUS_FORC}/." . > /dev/null
  else
    ## List archive content & extract it by default
    tar -tvf "${TRUS_FORC}/${TRUS_TARF}" > inputs_list.txt
    tar -vxf "${TRUS_FORC}/${TRUS_TARF}" > /dev/null
  fi
}
117
## Compare the input list, namelists, XML files and cpp keys of the current
## directory with their benchmark counterparts.
## Globals: TRUS_BHMK (read, benchmark directory), FILE_INPT (appended),
##          FILE_NOTE (read, names the temp_ comment file that is appended)
## Outputs: diff messages or 'Same' on stdout; 'Same'/'Different' appended to
##          $FILE_INPT; list of differing files appended to temp_$FILE_NOTE
## Returns: status of the last test (non-zero when no file differs)
diff_inputs() {
  local dif file
  local files_list='' mesg='Same'

  ## Simple diff
  for file in 'inputs_list.txt' namelist_* *.xml cpp_*; do
    dif=''

    ## Continue even if input file is not in here (see after)
    if [[ -e "${TRUS_BHMK}/$file" ]]; then
      dif=$( diff -q "$file" "${TRUS_BHMK}/$file" )
    else
      dif=0
    fi

    ## Pass over useless file omission in benchmark directory
    ## Braces, not parentheses: assignments made in a subshell were lost, so
    ## the message stayed 'Same' and the list of files stayed empty
    if [[ -n "$dif" && "$dif" != '0' ]]; then
      mesg='Different'
      echo "$dif"
      files_list+="$file "
    fi
  done

  [[ "$mesg" == 'Same' ]] && echo "$mesg"
  echo "$mesg" \
    >> "${FILE_INPT}"

  ## List different files for web comment
  [[ -n "${files_list}" ]] && echo "Inputs : ${files_list}differ<br>" \
    >> "temp_${FILE_NOTE}"
}
141
## Wait for the submitted job, polling every 30 s and logging job information
## to computation.log; kill the job and leave through `get_out 6` on timeout.
## Globals: JOB_STAT (command string; non-empty output = job still there),
##          JOB_INFO (command string whose output is logged),
##          JOB_DELE (command string deleting the job), TIMEOUT (seconds)
job_pending() {
  local outline time_elapsed=0 time_increment=30
  printf -v outline '%100s' ''

  sleep "${time_increment}"

  ## Append a log file while pending
  while [[ -n $( eval "${JOB_STAT}" ) && ${time_elapsed} -lt ${TIMEOUT} ]]; do
    printf "\n%s\n" "${outline// /#}" \
      >> computation.log
    eval "${JOB_INFO}" \
      >> computation.log
    sleep "${time_increment}"
    time_elapsed=$(( time_elapsed + time_increment ))
  done

  sleep "${time_increment}"

  ## Kill remaining job & stop the test if it's too long
  ## `-ge`, not `-eq`: the counter steps by 30 and jumps over any timeout that
  ## is not a multiple of it, which left the job running
  [ "${time_elapsed}" -ge "${TIMEOUT}" ] && { eval "${JOB_DELE}" &> /dev/null; get_out 6; }
}
162
## Compare ocean.output and the *.stat files of the current directory with
## the benchmark ones.
## Globals: TRUS_BHMK (read), TRUS_RSLT (exported as 'FAILED' on difference),
##          FILE_NOTE (read, names the temp_ comment file that is appended)
## Outputs: diff messages or 'Same' on stdout; list of differing files
##          appended to temp_$FILE_NOTE
## Returns: status of the last test (non-zero when no file differs)
diff_results() {
  local file
  local files_list='' mesg='Same'

  ## Simple diff
  for file in 'ocean.output' *.stat; do
    ## Stop if no benchmark files (ocean.output, eventual stat files)
    [[ ! -e "${TRUS_BHMK}/$file" ]] && { export TRUS_RSLT='FAILED'; get_out 7; }

    ## Continue even if it differs
    if ! diff -q "$file" "${TRUS_BHMK}/$file"; then
      export TRUS_RSLT='FAILED'
      mesg='Different'
      files_list+="$file "
    fi
  done

  [[ "$mesg" == 'Same' ]] && echo "$mesg"

  ## List different files for web comment
  [[ -n "${files_list}" ]] && echo "Results : ${files_list}differ<br>" \
    >> "temp_${FILE_NOTE}"
}
184
## Rebuild the restart files split by domain, then compare those written at
## the benchmark time step with the benchmark restart files.
## Globals: TRUS_BHMK, TRUS_WKCY, TRUS_NPRO (read),
##          CDOD (read, comparison command line taking two .nc files),
##          TRUS_RSLT (exported as 'FAILED' on any problem),
##          FILE_NOTE (read, names the temp_ comment file that is appended)
## Outputs: one line per differing restart file, or 'Same', on stdout
## Restart files are expected as <base>_<8-digit step>_restart[_comp]_<dom>.nc
diff_restart() {
  local base_name comp dif dif_nb file list_comp list_tmsp nb_dom
  local restart_files time_step tmsp
  ## dif_sum is now initialised here; the old code declared `dift` instead
  ## and left the counter unset
  local files_list='' dif_sum=0
  local -a dif_words

  ## Stop if no benchmark files (ie time.step)
  [[ ! -e "${TRUS_BHMK}/time.step" ]] && { export TRUS_RSLT='FAILED'; get_out 7; }
  time_step=$( tr -d '[:space:]' < "${TRUS_BHMK}/time.step" )

  ## Find all restart files to rebuild; nothing to compare without them
  restart_files=$( find . -regex '.*_restart.*[0-9]\.nc' )
  if [[ -z "${restart_files}" ]]; then
    export TRUS_RSLT='FAILED'
    return
  fi

  base_name=$( sed 's#^\./\(.*\)_[0-9]*_restart.*#\1#'       <<< "${restart_files}" | sort -u )
  list_comp=$( sed 's#^.*\(restart[a-z_]*\)_[0-9].*\.nc#\1#' <<< "${restart_files}" | sort -u )
  list_tmsp=$( sed 's#^.*\([0-9]\{8\}\)_restart.*#\1#'       <<< "${restart_files}" | sort -u )

  ## Loop on each time step
  for tmsp in ${list_tmsp}; do

    for comp in ${list_comp}; do
      file=${base_name}_${tmsp}_${comp}
      nb_dom=$( find . -name "${file}_[0-9]*.nc" | wc -l | awk '{ print $1 }' )

      if [ "${nb_dom}" -gt 1 ]; then
        "${TRUS_WKCY}/TOOLS/REBUILD_NEMO/rebuild_nemo" -t "${TRUS_NPRO}" "$file" "${nb_dom}" > /dev/null \
          && rm -f "${file}"_[0-9]*.nc > /dev/null
      elif [ "${nb_dom}" -eq 0 ]; then
        export TRUS_RSLT='FAILED' && get_out 8
      fi

      ## Compare restart files at same time step only
      ## `[ ]` is kept on purpose: it reads the zero-padded step as decimal,
      ## whereas `[[ ]]` and `(( ))` would read it as octal
      [ "$tmsp" -eq "${time_step}" ] || continue

      ## Stop if no benchmark files (restart file)
      if [[ -e "${TRUS_BHMK}/$file.nc" ]]; then

        ## UNIX `cmp` not suitable (timestamp in .nc file)
        ## CDOD is a command line (program + operator): split on purpose
        dif=$( $CDOD "$file.nc" "${TRUS_BHMK}/$file.nc" 2> /dev/null \
          | awk '/records/ {print $0}' | sed '2 s/^/,/' | tr -d '\n' )

        ## First word of the report is the number of differing records
        dif_words=()
        read -r -a dif_words <<< "$dif"
        dif_nb=${dif_words[0]:-0}

        ## CDO can return void stdout with no difference
        if [[ -n "$dif" && "${dif_nb}" -ne 0 ]]; then
          export TRUS_RSLT='FAILED'
          files_list+="$comp "
          dif_sum=$(( dif_sum + dif_nb ))
          echo "$file.nc: ${dif_words[*]}"
        fi

      else
        export TRUS_RSLT='FAILED' && get_out 7
      fi

    done

  done

  ## List different files for web comment with sum of different parameters
  if [ "${dif_sum}" -ne 0 ]; then
    echo "Restarts: ${files_list}${dif_sum} record(s) differ<br>" \
      >> "temp_${FILE_NOTE}"
  else
    echo 'Same'
  fi
}
258
## Report the job elapsed time on stdout and in the time messenger file.
## Globals: JOB_TIME (read, command string printing the elapsed time),
##          FILE_TIME (appended)
get_time() {
  ## Interest for checking unusual time computation
  local time_cpu
  local -a words=()

  time_cpu=$( eval "${JOB_TIME}" )

  ## Collapse blanks/newlines so the messenger file gets exactly one line,
  ## without exposing the command output to pathname expansion
  read -r -d '' -a words <<< "${time_cpu}"

  printf "Elapsed time: "
  printf '%s\n' "${words[*]}" | tee -a "${FILE_TIME}"
}
266
## Report the job peak memory usage (physical / virtual) on stdout and in the
## memory messenger file.
## Globals: JOB_PMEM, JOB_VMEM (read, command strings printing each figure),
##          FILE_MEMY (appended)
get_memy() {
  ## Interest for checking unusual memory usage
  local memory_pmax memory_vmax
  local -a pwords=() vwords=()

  memory_pmax=$( eval "${JOB_PMEM}" )
  memory_vmax=$( eval "${JOB_VMEM}" )

  ## Collapse blanks/newlines so the messenger file gets exactly one line,
  ## without exposing the command output to pathname expansion
  read -r -d '' -a pwords <<< "${memory_pmax}"
  read -r -d '' -a vwords <<< "${memory_vmax}"

  printf "Memory max usage (physical/virtual): "
  printf '%s\n' "${pwords[*]} / ${vwords[*]}" | tee -a "${FILE_MEMY}"
}
274
## Extract the first message of the given kind from ocean.output, print it
## and add it to the web comments.
## Arguments: $1 - spaced-out keyword to look for ('W A R N I N G', 'E R R O R')
## Globals:   FILE_NOTE (read, names the temp_ comment file that is appended)
## Outputs:   the message on one line on stdout; '<message><br>' appended to
##            temp_$FILE_NOTE
## Returns:   non-zero when no message was found
comments() {
  local line='' state=$1
  local -a opat words

  if [[ -e ocean.output ]]; then
    ## 'W A R N I N G' pattern by default: keyword after one leading blank,
    ## plus the 2 following lines
    opat=( -A2 -e "^ $state" )
    ## Errors: keyword anywhere in the line, plus the 4 following lines
    [[ "$state" == 'E R R O R' ]] && opat=( -A4 -e "$state" )

    ## Select first occurrence for web comment
    ## Options are held in an array: no `eval` of a string built from $1
    line=$( grep -m1 "${opat[@]}" ocean.output | tr -d '\n' )
  fi

  [[ -n "$line" ]] && {
    ## Collapse blanks for the console, without pathname expansion
    read -r -a words <<< "$line"
    printf '%s\n' "${words[*]}"
    ## Message passed as data: a '%' in it is no longer read as a format
    printf '%s<br>' "$line" \
      >> "temp_${FILE_NOTE}"
  }
}
291
## Finalise the comments messenger file, then merge every messenger file of
## the current directory into the trusting logfile.
## Globals: FILE_NOTE (appended; also names the temp_ comment file read),
##          FILE_TRUS (overwritten)
## Outputs: the merged ';'-separated lines on stdout and in $FILE_TRUS
log_make() {
  ## Format comments for web: a single line without the closing <br>
  if [[ -e "temp_${FILE_NOTE}" ]]; then
    tr -d '\n' < "temp_${FILE_NOTE}" | sed 's/<br>$//' \
      >> "${FILE_NOTE}"
  fi

  ## Construct txt file with all messenger files
  paste -d ';' mesg_*.txt | tee "${FILE_TRUS}"
}
300
301	prod_publish() {
302	local cmd
303	local rev=$( awk '/NEMOGCM/ {print $NF}' model.log )
304
305	## Production mode (-p\|--prod)
306	if [ $PROD -eq 1 ]; then
307
308	## Create or append trusting logfile
309	if [ -f ${TRUS_BHMK}/trusting_$PATTERNAME.txt ]; then cmd='tail -1'; else cmd='cat'; fi
310
311	$cmd ${FILE_TRUS} \
312	>> ${TRUS_BHMK}/trusting_$PATTERNAME.txt
313
314	## Send mail only when FAILED
315	if [[ ! -z "$TRUS_MAIL" && $TRUS_RSLT == 'FAILED' ]]; then
316
317	## Content
318	cat <<END_MAIL \
319	> trusting.mail
320	Dear all,
321
322
323	The trusting sequence has not completed successfully on new configuration ${TRUS_TEST} based on ${TRUS_REFE}.
324
325	Here is the model summary:
326	`cat model.log`
327
328	First checking would be on the trusting environment files:
329	${TRUS_USER}.cfg & ${TRUS_HPCC}.cfg
330
331	For more details, look into the testing directory at:
332	${TEST_DIR}
333
334	An archive has been created to share the questionable configuration for further studies:
335	${TRUS_BHMK}/${FILE_ARCH}
336
337	END_MAIL
338
339	## Send with detailed subject
340	mail -s "[NEMO Trusting][$rev][${TRUS_WKCY}][${TRUS_REFE}] $TRUS_RSLT $ERR" $TRUS_MAIL \
341	< trusting.mail
342	fi
343
344	fi
345	}
346
## End the test: translate the error code into a state message, update the
## result/state messenger files, archive the configuration when a production
## test failed, build the logfile, publish it, and exit.
## Arguments: $1 - error code (1-8, see the table below); may be omitted
## Globals:   TEST_DIR, TRUS_RSLT, TIMEOUT, PROD, TRUS_BHMK, TRUS_WKCY,
##            TRUS_TEST, FILE_ARCH, FILE_TRUS_RSLT, FILE_STAT (read),
##            ERR (set and exported)
## Calls:     comments, log_make, prod_publish
## Returns:   never; the shell exits with status 0
get_out() {
  local time_step=0

  ERR=$1

  printf "\n\nEnd of test\n"

  ## In case of compilation error, go back to the testing directory
  cd "${TEST_DIR}" || printf 'Cannot change directory to %s\n' "${TEST_DIR}" >&2

  if [[ "$TRUS_RSLT" == 'FAILED' ]]; then
    echo 'Failure'

    ## Error identification
    case "$ERR" in
      ## Compilation
      '1') ERR='XIOS compilation failed'  ;;
      '2') ERR='NEMO compilation failed'  ;;
      ## Submission
      '3') ERR='Missing input files'      ;;
      '4') ERR='Job submission error'     ;;
      ## Computation
      '5') ERR='Crashed at time step'     ;;
      '6') ERR='Exceeded time limit'      ;;
      ## Results
      '7') ERR='Missing previous outputs' ;;
      '8') ERR='New outputs differ'       ;;
    esac

  else
    echo 'Success' && ERR='Code is reliable'
  fi

  ## Eventual comments from ocean.output
  if [[ "$ERR" == 'Crashed at time step' ]]; then
    comments 'E R R O R'
    ## Pattern quoted so the shell cannot expand it against file names
    [[ -e time.step ]] && time_step=$( grep -o '[0-9]*' time.step )
    ERR+=" $time_step"
  else
    comments 'W A R N I N G'
    ## TIMEOUT (seconds) shown in hours; the misspelt ${TIMEOUTT} was always
    ## empty and broke the arithmetic expansion
    [[ "$ERR" == 'Exceeded time limit' ]] && ERR+=" $(( TIMEOUT / 3600 ))h"
  fi

  ## Last messenger files: replace the whole second line
  export ERR
  sed -i "2 s/.*/$TRUS_RSLT/" "${FILE_TRUS_RSLT}"
  sed -i "2 s/.*/$ERR/" "${FILE_STAT}"

  ## Save tested configuration if trusting failed in production mode (-p|--prod)
  if [[ "$TRUS_RSLT" == 'FAILED' && "$PROD" -eq 1 ]]; then
    echo "Creating archive ${FILE_ARCH} under ${TRUS_BHMK}"
    tar -czf "${TRUS_BHMK}/${FILE_ARCH}" * \
      -C "${TRUS_WKCY}/CONFIG/${TRUS_TEST}/MY_SRC" . \
      -C "${TRUS_WKCY}/CONFIG/${TRUS_TEST}" "cpp_${TRUS_TEST}.fcm"
  fi

  ## Logfile construct & eventual sending of notification email
  printf "\nTrusting digest:\n----------------\n"
  log_make
  prod_publish

  exit 0
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: