New URL for NEMO forge! http://forge.nemo-ocean.eu

Since March 2022 along with NEMO 4.2 release, the code development moved to a self-hosted GitLab.
This forge is now archived and remains online for historical reference.

trusting_func.sh in branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST – NEMO

Context Navigation

source: branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/trusting_func.sh @ 5799

Last change on this file since 5799 was 5799, checked in by nicolasmartin, 9 years ago
dev_r5092_CNRS18_TRUST Establishing common environment for installing & running trusting, consolidation of scripts & continuation of templates & help section improvements
Property eol-style set to `native` Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Rev URL`
File size: 12.6 KB

Line
#!/bin/bash


## Messenger filenames: one text file per reported field.
## The numeric infix (01..12) fixes the order in which a 'mesg_*.txt' glob lists them.
## ${PATTERNAME} is expanded at the time these lines run.
FILE_DATE=mesg_01_date_${PATTERNAME}.txt
FILE_RSLT=mesg_02_result_${PATTERNAME}.txt
FILE_STAT=mesg_03_state_${PATTERNAME}.txt
FILE_NEMO=mesg_04_nemo_${PATTERNAME}.txt
FILE_XIOS=mesg_05_xios_${PATTERNAME}.txt
FILE_CMPF=mesg_06_compiler_${PATTERNAME}.txt
FILE_LMPI=mesg_07_mpi_${PATTERNAME}.txt
FILE_NCDF=mesg_08_netcdf_${PATTERNAME}.txt
FILE_INPT=mesg_09_inputs_${PATTERNAME}.txt
FILE_TIME=mesg_10_time_${PATTERNAME}.txt
FILE_MEMY=mesg_11_memory_${PATTERNAME}.txt
FILE_NOTE=mesg_12_comments_${PATTERNAME}.txt

## Trusting timestamped logfile & archive (${DATE} is expanded at the time these lines run)
TRUS_FILE=trusting_${DATE}_${PATTERNAME}.txt
TRUS_ARCH=trusting_${DATE}_${PATTERNAME}.tgz


## Functions in order of use
## Print a step banner: blank line, 'Step.....', the title, then a dash underline.
## Arguments: $1 - step title
## Outputs:   banner on stdout
print_step() {
  local title=$1
  local underline

  ## Blank string exactly as long as the title (no extra cell for a trailing newline);
  ## each space is then turned into a dash
  printf -v underline '%*s' "${#title}" ''

  printf '\nStep.....\n%s\n%s\n' "$title" "${underline// /-}"
}
23
## (Re)create every messenger file with its column title as first line.
## Globals:   FILE_* (read, used as output paths), TRUS_RSLT (read)
## Outputs:   overwrites the twelve messenger files; appends a second line to
##            the result and state files
init_files() {
  echo 'Date'               > "${FILE_DATE}"
  echo 'Result'             > "${FILE_RSLT}"
  echo 'State'              > "${FILE_STAT}"
  echo 'NEMOGCM rev.'       > "${FILE_NEMO}"
  echo 'XIOS rev.'          > "${FILE_XIOS}"
  echo 'Fortran compiler'   > "${FILE_CMPF}"
  echo 'MPI libs'           > "${FILE_LMPI}"
  echo 'NetCDF libs'        > "${FILE_NCDF}"
  echo 'Input files'        > "${FILE_INPT}"
  echo 'Elapsed time'       > "${FILE_TIME}"
  echo 'Memory usage (P/V)' > "${FILE_MEMY}"
  echo 'Comments'           > "${FILE_NOTE}"

  ## Current value of TRUS_RSLT with 'Unknown error' as default state
  printf '%s\n' "${TRUS_RSLT}" >> "${FILE_RSLT}"
  echo 'Unknown error'         >> "${FILE_STAT}"
}
38
## Append the test date, converted to UTC, to the date messenger file.
## Globals:   DATE (read, any string accepted by `date -d`), FILE_DATE (read)
## Outputs:   one line 'YYYY-MM-DD HH:MM UTC' appended to ${FILE_DATE}
##            (an empty line if `date` cannot parse ${DATE})
get_date() {
  local stamp

  ## UTC time zone for timestamping
  stamp=$( date -ud "${DATE}" +"%F %R %Z" )

  printf '%s\n' "$stamp" >> "${FILE_DATE}"
}
46
## Run the configured svn action on each NEMO directory and record the highest
## 'Last Changed Rev' found, plus the XIOS revision.
## Globals:   TRUS_CKOT (read, whitespace-separated directory list),
##            TRUS_XIOS (read, XIOS working copy), TRUS_NGCM (read, root of the
##            NEMO directories), TRUS_SVNA (read, svn command line to run on each
##            NEMO directory), FILE_XIOS, FILE_NEMO (read)
## Outputs:   directory names on stdout; 'XIOS <rev>' and 'NEMOGCM <rev>' appended
##            to model.log; HTML changeset links appended to the messenger files
get_nemo_rev() {
  local dir rev_loc
  local rev=0

  ## Loop on essential NEMO directories (TRUS_CKOT is split on whitespace on purpose)
  for dir in ${TRUS_CKOT} ${TRUS_XIOS:+"${TRUS_XIOS}"}; do

    ## For time being, just get revision from XIOS with no action on directory
    if [[ "$dir" == "${TRUS_XIOS}" ]]; then
      rev_loc=$( svn info "$dir" | awk '/Last Changed Rev/ {print $NF}' )
      echo "XIOS ${rev_loc}" \
        >> model.log
      echo "<a href=\"https://forge.ipsl.jussieu.fr/ioserver/changeset/${rev_loc}\" target=\"_blank\">${rev_loc}</a>" \
        >> "${FILE_XIOS}"
      continue
    fi

    ## TRUS_SVNA holds a command plus its options: left unquoted to be split into words
    echo "$dir" && ${TRUS_SVNA} "${TRUS_NGCM}/$dir"
    rev_loc=$( svn info "${TRUS_NGCM}/$dir" | awk '/Last Changed Rev/ {print $NF}' )

    ## svn gave no usable revision: skip instead of feeding garbage to the comparison
    if [[ ! "${rev_loc}" =~ ^[0-9]+$ ]]; then
      echo "get_nemo_rev: no revision found for ${TRUS_NGCM}/$dir" >&2
      continue
    fi

    ## Keep last rev. nb
    if [ "${rev_loc}" -gt "$rev" ]; then rev=${rev_loc}; fi
  done

  echo "NEMOGCM $rev" \
    >> model.log
  echo "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/$rev\" target=\"_blank\">$rev</a>" \
    >> "${FILE_NEMO}"
}
76
## Load the computing environment, then log the release of each piece of software
## (compiler, MPI, NetCDF, CDO) to model.log.
## Globals:   TRUS_ENVI (read, basename of the environment file), TRUS_HPCC (read),
##            TRUS_CMPF, TRUS_MPIR, TRUS_CDFR, TRUS_CDOR (read, patterns looked up
##            in $PATH; TRUS_CMPF is also run with --version),
##            WRAPPER_LDFLAGS (written on X64_ADA), FILE_CMPF, FILE_LMPI, FILE_NCDF (read)
## Outputs:   one '<name> <release>' line per software appended to model.log;
##            lines 3, 4 and 5 of model.log appended to the compiler, MPI and
##            NetCDF messenger files
get_soft_rel() {
  local soft_rel str

  if [[ -n "${TRUS_ENVI}" ]]; then
    ## Sourcing environment modulefile (.env) only if module function is set
    if [[ -e "${TRUS_ENVI}.env" ]] && declare -F | grep -q ' module'; then
      . "${TRUS_ENVI}.env"
    ## Exactly one fallback is sourced, whatever the status returned by the file itself
    elif [[ -e "${TRUS_ENVI}.path" ]]; then
      . "${TRUS_ENVI}.path"
    else
      . "${TRUS_ENVI}"
    fi
  fi

  ## Problem with `prepend-path` of modulefile that use ':' instead of ' ' as delimiter on LDFLAGS variables
  if [[ "${TRUS_HPCC}" == 'X64_ADA' ]]; then
    WRAPPER_LDFLAGS='-L/smplocal/pub/IdrMemMPI/1.4/lib -lidrmem '${WRAPPER_LDFLAGS}
  fi

  ## Patterns are split on whitespace on purpose; empty ones vanish from the list
  for str in ${TRUS_CMPF} ${TRUS_MPIR} ${TRUS_CDFR} ${TRUS_CDOR}; do

    ## Software release: next word after "$str" in $PATH (case-insensitive).
    ## $str is used as a regex on purpose; -n/p gives an empty release,
    ## not the whole $PATH, when the pattern is not found.
    soft_rel=$( printf '%s\n' "$PATH" | sed -n "s#.*$str\([0-9.a-z_]*\).*#\1#ip" )

    ## option --version would work for main compilers (gfortran, intel, pgfortran, ...)
    if [[ "$str" == "${TRUS_CMPF}" ]]; then
      soft_rel=$( $str --version | grep -m1 -oe '\<[0-9. ]*\>' )
    fi

    ## Cleaning characters string to display proper soft name
    str=$( printf '%s\n' "$str" | sed 's#\\##g; s#[/-]$##' )

    ## Unquoted on purpose: flattens a multi-line release into a single log line
    echo $str ${soft_rel} \
      >> model.log
  done

  sed -n 3p model.log \
    >> "${FILE_CMPF}"
  sed -n 4p model.log \
    >> "${FILE_LMPI}"
  sed -n 5p model.log \
    >> "${FILE_NCDF}"
}
116
## List the input files into inputs_list.txt and bring them into the current directory.
## Globals:   TRUS_FORC (read, directory holding the inputs),
##            TRUS_TARF (read, archive name inside TRUS_FORC; empty for personal inputs)
## Outputs:   inputs_list.txt plus the extracted/copied inputs in the current directory
## Returns:   status of the extraction/copy command
get_inputs() {
  if [[ -z "${TRUS_TARF}" ]]; then
    ## List & copy files in case of personal inputs.
    ## Commands are run directly: kept in a string, a leading '\cp' would be
    ## looked up as a command literally named '\cp'.
    ls "${TRUS_FORC}/" > inputs_list.txt
    command cp -R -- "${TRUS_FORC}"/* . > /dev/null
  else
    ## List archive content & extract it by default
    tar -tvf "${TRUS_FORC}/${TRUS_TARF}" > inputs_list.txt
    tar -vxf "${TRUS_FORC}/${TRUS_TARF}" > /dev/null
  fi
}
129
## Compare the input files of the current directory with their benchmark copies.
## Globals:   TRUS_BHMK (read, benchmark directory), FILE_INPT, FILE_NOTE (read)
## Outputs:   diff messages or 'Same' on stdout; 'Same'/'Different' appended to
##            ${FILE_INPT}; list of differing files appended to temp_${FILE_NOTE}
diff_inputs() {
  local dif file
  local files_list='' mesg='Same'

  ## Simple diff on the input list, namelists, XML files and CPP keys file
  for file in 'inputs_list.txt' namelist_* *.xml cpp_*; do

    ## Pass over useless file omission in benchmark directory
    ## (also skips a glob left as-is because nothing matched it)
    [[ -e "${TRUS_BHMK}/$file" ]] || continue

    ## Rely on the exit status: a file missing here but present in the
    ## benchmark counts as a difference too
    if ! dif=$( diff -q "$file" "${TRUS_BHMK}/$file" ); then
      mesg='Different'
      [[ -n "$dif" ]] && printf '%s\n' "$dif"
      files_list+="$file "
    fi
  done

  [[ "$mesg" == 'Same' ]] && echo "$mesg"
  echo "$mesg" \
    >> "${FILE_INPT}"

  ## List different files for web comment
  if [[ -n "${files_list}" ]]; then
    echo "Inputs : ${files_list}differ<br>" \
      >> "temp_${FILE_NOTE}"
  fi
}
153
154	job_pending() {
155	local outline=$( printf "%100s" ) time_elapsed=0 time_increment=30
156
157	sleep ${time_increment}
158
159	## Append a log file while pending
160	while [[ $( eval ${TRUS_JSTA} ) && ${time_elapsed} -lt ${TRUS_TOUT} ]]; do
161	printf "\n%s\n" ${outline// /#} \
162	>> computation.log
163	[ -n "${TRUS_JINF}" ] && eval ${JOB_INFO} \
164	>> computation.log
165	sleep ${time_increment}
166	time_elapsed=$(( ${time_elapsed} + ${time_increment} ))
167	done
168
169	sleep ${time_increment}
170
171	## Kill remaining job & stop the test if it's too long
172	[ ${time_elapsed} -eq ${TRUS_TOUT} ] && { eval ${JOB_DELE} &> /dev/null; get_out 6; }
173	}
174
## Compare ocean.output and the stat files with their benchmark copies.
## Globals:   TRUS_BHMK (read, benchmark directory), FILE_NOTE (read),
##            TRUS_RSLT (set to 'FAILED' on any difference or missing benchmark)
## Outputs:   diff messages or 'Same' on stdout; list of differing files
##            appended to temp_${FILE_NOTE}
## Calls:     get_out 7 when a benchmark file is missing
diff_results() {
  local file
  local files_list='' mesg='Same'

  ## Simple diff
  for file in 'ocean.output' *.stat; do

    ## Glob left as-is: this run produced no stat file, nothing to compare
    [[ "$file" == '*.stat' && ! -e "$file" ]] && continue

    ## Stop if no benchmark files (ocean.output, eventual stat files)
    if [[ ! -e "${TRUS_BHMK}/$file" ]]; then
      TRUS_RSLT='FAILED'
      get_out 7
    fi

    ## Continue even if it differs
    if ! diff -q "$file" "${TRUS_BHMK}/$file"; then
      TRUS_RSLT='FAILED'
      mesg='Different'
      files_list+="$file "
    fi
  done

  [[ "$mesg" == 'Same' ]] && echo "$mesg"

  ## List different files for web comment
  if [[ -n "${files_list}" ]]; then
    echo "Results : ${files_list}differ<br>" \
      >> "temp_${FILE_NOTE}"
  fi
}
196
## Rebuild the per-domain restart files and compare, at the benchmark time step,
## each restart with its benchmark copy.
## Globals:   TRUS_BHMK (read, benchmark directory), TRUS_NGCM (read, root holding
##            TOOLS/REBUILD_NEMO/rebuild_nemo), TRUS_NPRO (read, value given to
##            rebuild_nemo -t), TRUS_CDOD (read, comparison command line),
##            FILE_NOTE (read), TRUS_RSLT (set to 'FAILED' on difference/missing file)
## Outputs:   differing files or 'Same' on stdout; summary appended to temp_${FILE_NOTE}
## Calls:     get_out 7 (missing benchmark file), get_out 8 (missing restart file)
diff_restart() {
  local base_name comp dif file list_comp list_tmsp nb_dom nb_rec time_step tmsp
  ## dif_sum is local and starts at 0 so that one call cannot inherit the
  ## counter of a previous one
  local files_list='' dif_sum=0
  ## Per-domain restart files: '<base>_<8-digit step>_<restart...>_<domain>.nc'
  local restart_regex='.*_restart.*[0-9]\.nc'

  ## Stop if no benchmark files (ie time.step)
  if [[ ! -e "${TRUS_BHMK}/time.step" ]]; then
    TRUS_RSLT='FAILED'
    get_out 7
  fi
  time_step=$( tr -d '[:space:]' < "${TRUS_BHMK}/time.step" )

  ## No restart file at all
  if [[ -z $( find . -regex "${restart_regex}" -print -quit ) ]]; then
    TRUS_RSLT='FAILED'
    return
  fi

  ## Find all restart files to rebuild
  base_name=$( find . -regex "${restart_regex}" \
    | sed "s#^\./\(.*\)_[0-9]*_restart.*#\1#" | sort -u )
  list_comp=$( find . -regex "${restart_regex}" \
    | sed "s#^.*\(restart[a-z_]*\)_[0-9].*\.nc#\1#" | sort -u )
  list_tmsp=$( find . -regex "${restart_regex}" \
    | sed "s#^.*\([0-9]\{8\}\)_restart.*#\1#" | sort -u )

  ## Loop on each time step
  for tmsp in ${list_tmsp}; do

    for comp in ${list_comp}; do
      file=${base_name}_${tmsp}_${comp}
      nb_dom=$( find . -name "${file}_[0-9]*.nc" | wc -l | awk '{ print $1 }' )

      if [[ ${nb_dom} -gt 1 ]]; then
        if "${TRUS_NGCM}/TOOLS/REBUILD_NEMO/rebuild_nemo" -t "${TRUS_NPRO}" "$file" "${nb_dom}" \
          > /dev/null; then
          rm -f -- "${file}"_[0-9]*.nc \
            > /dev/null
        fi
      elif [[ ${nb_dom} -eq 0 ]]; then
        TRUS_RSLT='FAILED'
        get_out 8
      fi

      ## Compare restart files at same time step.
      ## Plain `[` on purpose: it reads zero-padded steps as decimal, whereas
      ## `[[`/`((` would take them for octal numbers.
      [ "$tmsp" -eq "${time_step}" ] || continue

      ## Stop if no benchmark files (restart file)
      if [[ ! -e "${TRUS_BHMK}/$file.nc" ]]; then
        TRUS_RSLT='FAILED'
        get_out 7
        continue
      fi

      ## UNIX `cmp` not suitable (timestamp in .nc file).
      ## TRUS_CDOD holds a command plus its options: left unquoted to be split.
      dif=$( $TRUS_CDOD "$file.nc" "${TRUS_BHMK}/$file.nc" 2> /dev/null \
        | awk '/records/ {print $0}' | sed '2 s/^/,/' | tr -d '\n' )

      ## CDO can return void stdout with no difference
      nb_rec=$( awk '{print $1}' <<< "$dif" )
      if [[ -n "$dif" && ${nb_rec} -ne 0 ]]; then
        TRUS_RSLT='FAILED'
        files_list+="$comp "
        dif_sum=$(( dif_sum + nb_rec ))
        ## $dif unquoted on purpose: squeezes the padding of the CDO report
        echo "$file.nc:" $dif
      fi

    done

  done

  ## List different files for web comment with sum of different parameters
  if [[ ${dif_sum} -ne 0 ]]; then
    echo "Restarts: ${files_list}${dif_sum} record(s) differ<br>" \
      >> "temp_${FILE_NOTE}"
  else
    echo 'Same'
  fi
}
272
## Report the elapsed time of the job.
## Globals:   TRUS_JTIM (read, command printing the elapsed time; nothing is done
##            when empty), FILE_TIME (read)
## Outputs:   'Elapsed time: <value>' on stdout; <value> appended to ${FILE_TIME}
get_time() {
  [[ -z "${TRUS_JTIM}" ]] && return

  ## Interest for checking unusual time computation
  local time_cpu
  time_cpu=$( eval "${TRUS_JTIM}" )

  printf 'Elapsed time: '
  ## Unquoted on purpose: the value is flattened into a single line
  echo ${time_cpu} | tee -a "${FILE_TIME}"
}
282
## Report the maximum memory usage of the job.
## Globals:   TRUS_JPME, TRUS_JVME (read, commands printing the physical and virtual
##            memory peaks; nothing is done when both are empty), FILE_MEMY (read)
## Outputs:   'Memory max usage (physical/virtual): <P> / <V>' on stdout;
##            '<P> / <V>' appended to ${FILE_MEMY}
get_memy() {
  [[ -z "${TRUS_JPME}" && -z "${TRUS_JVME}" ]] && return

  ## Interest for checking unusual memory usage
  local memory_pmax memory_vmax
  memory_pmax=$( eval "${TRUS_JPME}" )
  memory_vmax=$( eval "${TRUS_JVME}" )

  printf 'Memory max usage (physical/virtual): '
  ## Unquoted on purpose: the values are flattened into a single line
  echo ${memory_pmax}' / '${memory_vmax} | tee -a "${FILE_MEMY}"
}
292
## Extract the first message of a given kind from ocean.output for the web comment.
## Arguments: $1 - spaced keyword to look for ('E R R O R' or 'W A R N I N G')
## Globals:   FILE_NOTE (read)
## Outputs:   the message (matching line joined with its following lines) on stdout
##            and, followed by '<br>', appended to temp_${FILE_NOTE}
comments() {
  local line='' state=$1
  local -a grep_opts

  if [[ -e ocean.output ]]; then
    ## 'W A R N I N G' pattern by default: keyword after one leading space, 2 lines of context
    grep_opts=( -A2 "^ $state" )
    ## Errors: keyword anywhere in the line, 4 lines of context
    [[ "$state" == 'E R R O R' ]] && grep_opts=( -A4 "$state" )

    ## Select first occurence for web comment
    line=$( grep -m1 "${grep_opts[@]}" ocean.output | tr -d '\n' )
  fi

  if [[ -n "$line" ]]; then
    printf '%s\n' "$line"
    ## The message is data, never a format: a '%' in ocean.output must stay as is
    printf '%s<br>' "$line" \
      >> "temp_${FILE_NOTE}"
  fi
}
309
## Assemble the trusting logfile from the messenger files.
## Globals:   FILE_NOTE (read), TRUS_FILE (read, output path)
## Outputs:   collected comments appended to ${FILE_NOTE}; the ';'-separated table
##            built from every mesg_*.txt on stdout and in ${TRUS_FILE}
log_make() {
  ## Format comments for web: one single line, without the final '<br>'
  if [[ -e "temp_${FILE_NOTE}" ]]; then
    tr -d '\n' < "temp_${FILE_NOTE}" | sed 's/<br>$//' \
      >> "${FILE_NOTE}"
  fi

  ## Construct txt file with all messenger files
  paste -d ';' mesg_*.txt | tee "${TRUS_FILE}"
}
318
## In production mode, feed the benchmark logfile and mail a report on failure.
## Globals:   TRUS_PROD (read, 1 enables production mode), TRUS_BHMK, PATTERNAME,
##            TRUS_FILE (read), TRUS_MAIL (read, whitespace-separated recipients;
##            no mail when empty), TRUS_RSLT, TRUS_RORR, TRUS_BRAN, TRUS_REFE,
##            TRUS_CONF, TRUS_USER, TRUS_HPCC, TRUS_TEST, TRUS_ARCH (read, mail content)
## Outputs:   ${TRUS_BHMK}/trusting_${PATTERNAME}.txt created or appended;
##            trusting.mail written and sent when the result is 'FAILED'
prod_publish() {
  local rev
  local -a cmd recipients

  rev=$( awk '/NEMOGCM/ {print $NF}' model.log )

  ## Production mode (-p|--prod)
  [[ "${TRUS_PROD}" -eq 1 ]] || return 0

  ## Create or append trusting logfile: whole table (with titles) the first
  ## time, last row only afterwards
  cmd=( cat )
  [[ -f "${TRUS_BHMK}/trusting_${PATTERNAME}.txt" ]] && cmd=( tail -n 1 )

  "${cmd[@]}" "${TRUS_FILE}" \
    >> "${TRUS_BHMK}/trusting_${PATTERNAME}.txt"

  ## Send mail only when FAILED
  if [[ -n "${TRUS_MAIL}" && "${TRUS_RSLT}" == 'FAILED' ]]; then

    ## Content
    cat > trusting.mail <<END_MAIL
Dear all,


The trusting sequence has not completed successfully on new configuration ${TRUS_CONF} based on ${TRUS_REFE}.

Here is the model summary:
$( cat model.log )

First checking would be on the trusting environment files:
${TRUS_USER}.cfg & ${TRUS_HPCC}.cfg

For more details, look into the testing directory at:
${TRUS_TEST}

An archive has been created to share the questionable configuration for further studies:
${TRUS_BHMK}/${TRUS_ARCH}

END_MAIL

    ## Send with detailed subject, one argument per recipient
    read -r -a recipients <<< "${TRUS_MAIL}"
    mail -s "[NEMO Trusting][$rev][${TRUS_BRAN}][${TRUS_REFE}] ${TRUS_RSLT} ${TRUS_RORR}" "${recipients[@]}" \
      < trusting.mail
  fi
}
364
## Close the test: translate the error code, finalize the messenger files,
## archive the configuration on failure in production mode, publish, then exit.
## Arguments: $1 - error code (1-8; anything else means 'Unknown error')
## Globals:   TRUS_TEST (read, testing directory), TRUS_RSLT (read),
##            TRUS_RORR (written, final state message), TRUS_TOUT (read, seconds),
##            TRUS_PROD, TRUS_BHMK, TRUS_ARCH, TRUS_NGCM, TRUS_CONF (read),
##            FILE_RSLT, FILE_STAT (read, their 2nd line is rewritten)
## Calls:     comments, log_make, prod_publish
## Returns:   never; the shell exits with status 0
get_out() {
  local time_step=0
  local rslt_esc rorr_esc

  TRUS_RORR=$1

  printf '\n\nEnd of test\n'

  ## In case of compilation error
  cd "${TRUS_TEST}" || echo "get_out: cannot change to '${TRUS_TEST}'" >&2

  if [[ "${TRUS_RSLT}" == 'FAILED' ]]; then
    echo 'Failure'

    ## Error identification
    case "${TRUS_RORR}" in
      ## Compilation
      1) TRUS_RORR='XIOS compilation failed' ;;
      2) TRUS_RORR='NEMO compilation failed' ;;
      ## Submission
      3) TRUS_RORR='Missing input files' ;;
      4) TRUS_RORR='Job submission error' ;;
      ## Computation
      5) TRUS_RORR='Crashed at time step' ;;
      6) TRUS_RORR='Exceeded time limit' ;;
      ## Results
      7) TRUS_RORR='Missing previous outputs' ;;
      8) TRUS_RORR='New outputs differ' ;;
      ## Other (unquoted pattern: a quoted '*' would only match a literal asterisk)
      *) TRUS_RORR='Unknown error' ;;
    esac

  else
    echo 'Success'
    TRUS_RORR='Code is reliable'
  fi

  ## Eventual comments from ocean.output
  if [[ "${TRUS_RORR}" == 'Crashed at time step' ]]; then
    comments 'E R R O R'
    ## Quoted pattern: must reach grep untouched, whatever the files around
    [[ -e time.step ]] && time_step=$( grep -o '[0-9]*' time.step )
    TRUS_RORR+=" $time_step"
  else
    comments 'W A R N I N G'
    [[ "${TRUS_RORR}" == 'Exceeded time limit' ]] && TRUS_RORR+=" $(( TRUS_TOUT / 3600 ))h"
  fi

  ## Last messenger files: replace the whole 2nd line (default value).
  ## '\', '/' and '&' are escaped so the text is taken literally by sed.
  rslt_esc=$( printf '%s' "${TRUS_RSLT}" | sed 's#[\\/&]#\\&#g' )
  rorr_esc=$( printf '%s' "${TRUS_RORR}" | sed 's#[\\/&]#\\&#g' )
  sed -i "2 s/.*/${rslt_esc}/" "${FILE_RSLT}"
  sed -i "2 s/.*/${rorr_esc}/" "${FILE_STAT}"

  ## Save tested configuration if trusting failed in production mode (-p|--prod)
  if [[ "${TRUS_RSLT}" == 'FAILED' && "${TRUS_PROD}" -eq 1 ]]; then
    echo "Creating archive ${TRUS_ARCH} under ${TRUS_BHMK}"
    tar -czf "${TRUS_BHMK}/${TRUS_ARCH}" * \
      -C "${TRUS_NGCM}/CONFIG/${TRUS_CONF}/MY_SRC" . \
      -C "${TRUS_NGCM}/CONFIG/${TRUS_CONF}" "cpp_${TRUS_CONF}.fcm"
  fi

  ## Logfile construct & eventual sending of notification email
  printf '\nTrusting digest:\n----------------\n'
  log_make
  prod_publish

  exit 0
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: