New URL for NEMO forge! http://forge.nemo-ocean.eu

Since March 2022 along with NEMO 4.2 release, the code development moved to a self-hosted GitLab.
This forge is now archived and remains online for historical reference.

trusting_func.sh in branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/inc – NEMO

Context Navigation

source: branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/inc/trusting_func.sh @ 5988

Last change on this file since 5988 was 5988, checked in by nicolasmartin, 8 years ago
dev_r5092_CNRS18_TRUST Bug correction & updating trusting namelists
Property eol-style set to `native`; property svn:executable set to `*`; property svn:keywords set to `Author Date Id Rev URL`
File size: 12.7 KB

Rev	Line
#!/bin/bash
##
## Helper functions of the NEMO trusting sequence.
## NOTE(review): ${PATTERNAME} and ${DATE} are read below but never set in this file;
## presumably the calling script defines them beforehand -- confirm.


## Messenger filenames (one file per item of the trusting digest, see init_files)
FILE_DATE="mesg_01_date_${PATTERNAME}.txt"
FILE_RSLT="mesg_02_result_${PATTERNAME}.txt"
FILE_STAT="mesg_03_state_${PATTERNAME}.txt"
FILE_NEMO="mesg_04_nemo_${PATTERNAME}.txt"
FILE_XIOS="mesg_05_xios_${PATTERNAME}.txt"
FILE_CMPF="mesg_06_compiler_${PATTERNAME}.txt"
FILE_LMPI="mesg_07_mpi_${PATTERNAME}.txt"
FILE_NCDF="mesg_08_netcdf_${PATTERNAME}.txt"
FILE_INPT="mesg_09_inputs_${PATTERNAME}.txt"
FILE_TIME="mesg_10_time_${PATTERNAME}.txt"
FILE_MEMY="mesg_11_memory_${PATTERNAME}.txt"
FILE_NOTE="mesg_12_comments_${PATTERNAME}.txt"

## Trusting timestamped logfile & archive
TRUS_FILE="trusting_${DATE}_${PATTERNAME}.txt"
TRUS_ARCH="trusting_${DATE}_${PATTERNAME}.tgz"


## Functions in order of use
	17	print_step() {
	18	local char_nb=$( echo "$1" \| wc -c )
	19	local outline=$( printf "%${char_nb}s" )
[5644]	20
[5788]	21	printf "\nStep.....\n%s\n%s\n" "$1" ${outline// /-}
	22	}
[5681]	23
## (Re)create every messenger file with its column header as first line.
## Globals: FILE_* (read, files overwritten), TRUS_RSLT (read)
init_files() {
    printf '%s\n' 'Date'               > "${FILE_DATE}"
    printf '%s\n' 'Result'             > "${FILE_RSLT}"
    printf '%s\n' 'State'              > "${FILE_STAT}"
    printf '%s\n' 'NEMOGCM rev.'       > "${FILE_NEMO}"
    printf '%s\n' 'XIOS rev.'          > "${FILE_XIOS}"
    printf '%s\n' 'Fortran compiler'   > "${FILE_CMPF}"
    printf '%s\n' 'MPI libs'           > "${FILE_LMPI}"
    printf '%s\n' 'NetCDF libs'        > "${FILE_NCDF}"
    printf '%s\n' 'Input files'        > "${FILE_INPT}"
    printf '%s\n' 'Elapsed time'       > "${FILE_TIME}"
    printf '%s\n' 'Memory usage (P/V)' > "${FILE_MEMY}"
    printf '%s\n' 'Comments'           > "${FILE_NOTE}"

    ## Default second line: current ${TRUS_RSLT} and 'Unknown error'
    ## (get_out rewrites line 2 of both files afterwards)
    printf '%s\n' "${TRUS_RSLT}"  >> "${FILE_RSLT}"
    printf '%s\n' 'Unknown error' >> "${FILE_STAT}"
}
	38
	39	get_date() {
[5690]	40	## UTC time zone for timestamping
	41	local dat=$( date -ud "${DATE}" +"%F %R %Z" )
[5644]	42
[5788]	43	echo $dat \
[5689]	44	>> ${FILE_DATE}
[5644]	45	}
	46
	47	get_nemo_rev() {
[5695]	48	local dir rev_loc
[5799]	49	local rev=0
[5644]	50
[5788]	51	## Loop on essential NEMO directories
[5799]	52	for dir in ${TRUS_CKOT} ${TRUS_XIOS}; do
[5455]	53
[5690]	54	## For time being, just get revision from XIOS with no action on directory
[5799]	55	if [ $dir == ${TRUS_XIOS} ]; then
	56	rev_loc=$( svn info $dir \| awk '/Last Changed Rev/ {print $NF}' )
[5689]	57	echo 'XIOS '${rev_loc} \
	58	>> model.log
[5644]	59	echo "<a href=\"https://forge.ipsl.jussieu.fr/ioserver/changeset/${rev_loc}\" target=\"_blank\">${rev_loc}</a>" \
	60	>> ${FILE_XIOS}
	61	continue
	62	fi
	63
[5799]	64	echo $dir && ${TRUS_SVNA} ${TRUS_NGCM}/$dir
	65	rev_loc=$( svn info ${TRUS_NGCM}/$dir \| awk '/Last Changed Rev/ {print $NF}' )
[5690]	66
	67	## Keep last rev. nb
[5689]	68	[ ${rev_loc} -gt $rev ] && rev=${rev_loc}
[5644]	69	done
	70
[5689]	71	echo 'NEMOGCM '$rev \
	72	>> model.log
[5644]	73	echo "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/$rev\" target=\"_blank\">$rev</a>" \
	74	>> ${FILE_NEMO}
[5438]	75	}
	76
## Load the computing environment and log the release of the main software.
## Globals: TRUS_ENVI (environment file, optional), TRUS_HPCC, WRAPPER_LDFLAGS (written),
##          TRUS_CMPV / TRUS_MPIR / TRUS_CDFR / TRUS_CDOR (patterns looked up in $PATH,
##          TRUS_CMPV being also run with --version), FILE_CMPF, FILE_LMPI, FILE_NCDF
## Outputs: one '<software> <release>' line per pattern appended to model.log;
##          lines 3, 4 and 5 of model.log copied to the compiler, MPI and NetCDF files
get_soft_rel() {
    local soft_rel str

    ## Sourcing environment
    if [[ -n "${TRUS_ENVI}" ]]; then
        if [[ -e "${TRUS_ENVI}.env" ]] && declare -F | grep -q ' module'; then
            ## .env file if module function is available
            . "${TRUS_ENVI}.env"
        elif [[ -e "${TRUS_ENVI}.path" ]]; then
            ## .path file if existing (the given file is no more sourced on top of it
            ## when the .path file ends with a non-zero status)
            . "${TRUS_ENVI}.path"
        else
            ## if not the given file
            . "${TRUS_ENVI}"
        fi
    fi

    ## Problem with `prepend-path` of modulefile that use ':' instead of ' ' as delimiter
    if [[ "${TRUS_HPCC}" == 'X64_ADA' ]]; then
        WRAPPER_LDFLAGS="-L/smplocal/pub/IdrMemMPI/1.4/lib -lidrmem ${WRAPPER_LDFLAGS}"
    fi

    ## Unquoted on purpose: empty settings vanish from the list
    for str in ${TRUS_CMPV} ${TRUS_MPIR} ${TRUS_CDFR} ${TRUS_CDOR}; do
        [[ -z "$str" ]] && continue

        ## Software release: next word after "$str" in $PATH (case-insensitive);
        ## nothing is kept when "$str" is not found (the whole $PATH was logged before)
        soft_rel=$( printf '%s\n' "$PATH" | sed -n "s#.*${str}\([0-9.a-z_]*\).*#\1#ip" )

        ## option --version would work for main compilers (gfortran, intel, pgfortran, ...)
        if [[ "$str" == "${TRUS_CMPV}" ]]; then
            soft_rel=$( "$str" --version | grep -m1 -oe '\<[0-9. ]*\>' )
        fi

        ## Cleaning characters string to display proper soft name
        str=$( printf '%s\n' "$str" | sed 's#\\##g; s#[/-]$##' )

        ## Unquoted on purpose: folds a multi-line release on one single line,
        ## the line number in model.log being meaningful just below
        # shellcheck disable=SC2086
        echo $str ${soft_rel} \
            >> model.log
    done

    sed -n 3p model.log \
        >> "${FILE_CMPF}"
    sed -n 4p model.log \
        >> "${FILE_LMPI}"
    sed -n 5p model.log \
        >> "${FILE_NCDF}"
}
	118
	119	get_inputs() {
[5929]	120	## Extract archive or copy files in case of personal inputs
	121	[ -z "${TRUS_TARF}" ] && get_io="cp ${TRUS_FORC}/* ." \|\| get_io="tar -vxf ${TRUS_FORC}/${TRUS_TARF}"
[5644]	122
[5929]	123	${get_io} > /dev/null
	124	[ $? -ne 0 ] && get_out 3 \|\| echo 'Success'
[5988]	125	[ $( find -name '.gz' -print -quit ) ] && find . -name '.gz' -exec gzip -d {} \;
[5690]	126
[5929]	127	ls -lh > inputs_list.txt
[5644]	128	}
	129
## Compare the input files with the ones kept in the benchmark directory.
## Globals: TRUS_STOR (benchmark directory), FILE_INPT, FILE_NOTE
## Outputs: `diff -q` messages or 'Same' on stdout, 'Same'/'Different' appended to
##          ${FILE_INPT}, list of differing files appended to temp_${FILE_NOTE}
diff_inputs() {
    local dif file
    local files_list='' mesg='Same'

    ## Simple diff
    for file in 'inputs_list.txt' namelist_* *.xml cpp_*; do
        ## A glob with no match stays literal: nothing to compare then
        [[ -e "$file" ]] || continue
        dif=''

        ## Pass over useless file omission in benchmark directory
        if [[ -e "${TRUS_STOR}/$file" ]]; then
            dif=$( diff -q "$file" "${TRUS_STOR}/$file" )
        fi

        if [[ -n "$dif" ]]; then
            mesg='Different'
            printf '%s\n' "$dif"
            files_list+="$file "
        fi
    done

    [[ "$mesg" == 'Same' ]] && echo "$mesg"
    printf '%s\n' "$mesg" \
        >> "${FILE_INPT}"

    ## List different files for web comment
    if [[ -n "${files_list}" ]]; then
        printf '%s\n' "Inputs : ${files_list}differ<br>" \
            >> "temp_${FILE_NOTE}"
    fi
}
	153
## Wait for the end of the job, logging its state every 30 seconds.
## Globals: TRUS_JSTA (command printing something as long as the job is alive),
##          TRUS_TOUT (time limit in seconds), TRUS_JINF, JOB_INFO, JOB_DELE
## Outputs: separators and job information appended to computation.log;
##          once the time limit is reached, ${JOB_DELE} is run and `get_out 6` called
## NOTE(review): ${JOB_INFO} and ${JOB_DELE} are not set in this file; presumably
## built by the caller from the TRUS_J* settings -- confirm.
job_pending() {
    local outline time_elapsed=0 time_increment=30

    printf -v outline '%100s' ''
    outline=${outline// /#}

    sleep "${time_increment}"

    ## Append a log file while pending
    while [[ -n $( eval "${TRUS_JSTA}" ) ]] && (( time_elapsed < TRUS_TOUT )); do
        printf '\n%s\n' "${outline}" \
            >> computation.log
        if [[ -n "${TRUS_JINF}" ]]; then
            eval "${JOB_INFO}" \
                >> computation.log
        fi
        sleep "${time_increment}"
        time_elapsed=$(( time_elapsed + time_increment ))
    done

    sleep "${time_increment}"

    ## Kill remaining job & stop the test if it's too long
    ## (>= and not ==: the limit may not be a multiple of the increment)
    if (( time_elapsed >= TRUS_TOUT )); then
        eval "${JOB_DELE}" &> /dev/null
        get_out 6
    fi
}
	174
## Compare ocean.output and the stat files with the benchmark ones.
## Globals: TRUS_STOR (benchmark directory), TRUS_RSLT (set to 'FAILED' on any
##          difference), FILE_NOTE
## Outputs: `diff -q` messages or 'Same' on stdout, list of differing files appended
##          to temp_${FILE_NOTE}; calls `get_out 7` if a benchmark file is missing
diff_results() {
    local file
    local files_list='' mesg='Same'

    ## Simple diff
    for file in 'ocean.output' *.stat; do
        ## No stat file in here: the glob stays literal and is not a file to compare
        [[ "$file" == '*.stat' && ! -e "$file" ]] && continue

        ## Stop if no benchmark files (ocean.output, eventual stat files)
        if [[ ! -e "${TRUS_STOR}/$file" ]]; then
            TRUS_RSLT='FAILED'
            get_out 7
        fi

        ## Continue even if it differs
        if ! diff -q "$file" "${TRUS_STOR}/$file"; then
            TRUS_RSLT='FAILED'
            mesg='Different'
            files_list+="$file "
        fi
    done

    [[ "$mesg" == 'Same' ]] && echo "$mesg"

    ## List different files for web comment
    if [[ -n "${files_list}" ]]; then
        printf '%s\n' "Results : ${files_list}differ<br>" \
            >> "temp_${FILE_NOTE}"
    fi
}
	196
## Rebuild the decomposed restart files and compare them with the benchmark ones.
## Globals: TRUS_STOR (benchmark directory holding time.step and restart files),
##          TRUS_NGCM (rebuild_nemo under TOOLS/REBUILD_NEMO), TRUS_NPRO (passed to
##          rebuild_nemo -t), TRUS_CDOD (comparison command, a line holding 'records'
##          being expected from it), TRUS_RSLT (set to 'FAILED' on any problem), FILE_NOTE
## Outputs: differences or 'Same' on stdout, summary appended to temp_${FILE_NOTE};
##          calls `get_out 7` if a benchmark file is missing, `get_out 8` if no
##          restart file is found for a given time step and component
diff_restart() {
    local base_name comp dif dif_nb file list_comp list_tmsp nb_dom restarts time_step tmsp
    local files_list='' dif_sum=0

    ## Stop if no benchmark files (ie time.step)
    if [[ ! -e "${TRUS_STOR}/time.step" ]]; then
        TRUS_RSLT='FAILED'
        get_out 7
    fi
    time_step=$( tr -d '[:space:]' < "${TRUS_STOR}/time.step" )

    ## Find all restart files to rebuild (names ending with a digit before '.nc')
    restarts=$( find . -regex '.*_restart.*[0-9]\.nc' )
    if [[ -z "${restarts}" ]]; then
        TRUS_RSLT='FAILED'
        return
    fi

    ## Split names like ./<base>_<8 digits time step>_<restart...>_<domain>.nc
    base_name=$( sed 's#^\./\(.*\)_[0-9]*_restart.*#\1#'         <<< "${restarts}" | sort -u )
    list_comp=$( sed 's#^.*\(restart[a-z_]*\)_[0-9].*\.nc#\1#'   <<< "${restarts}" | sort -u )
    list_tmsp=$( sed 's#^.*\([0-9]\{8\}\)_restart.*#\1#'         <<< "${restarts}" | sort -u )

    ## Loop on each time step
    for tmsp in ${list_tmsp}; do

        for comp in ${list_comp}; do
            file="${base_name}_${tmsp}_${comp}"
            nb_dom=$( find . -name "${file}_[0-9]*.nc" | wc -l | awk '{ print $1 }' )

            if [[ "${nb_dom}" -gt 1 ]]; then
                ## Possibility of remaining decomposed restarts (even after rebuild)
                if "${TRUS_NGCM}/TOOLS/REBUILD_NEMO/rebuild_nemo" -t "${TRUS_NPRO}" "$file" "${nb_dom}" \
                        > /dev/null; then
                    rm -f "${file}"_[0-9]*.nc \
                        > /dev/null
                fi
            elif [[ "${nb_dom}" -eq 0 ]]; then
                TRUS_RSLT='FAILED'
                get_out 8
            fi

            ## Compare restart files at same time step only
            ## (`[` on purpose: it reads zero-padded numbers in base 10, not in octal)
            [ "$tmsp" -eq "${time_step}" ] || continue

            ## Stop if no benchmark files (restart file)
            if [[ -e "${TRUS_STOR}/$file.nc" ]]; then

                ## UNIX `cmp` not suitable (timestamp in .nc file)
                ## (${TRUS_CDOD} unquoted on purpose: command with its own options)
                dif=$( ${TRUS_CDOD} "$file.nc" "${TRUS_STOR}/$file.nc" 2> /dev/null \
                           | awk '/records/ {print $0}' | sed '2 s/^/,/' | tr -d '\n' )
                dif_nb=$( awk '{print $1}' <<< "$dif" )

                ## CDO can return void stdout with no difference
                if [[ "${dif_nb}" =~ ^[0-9]+$ ]] && (( 10#${dif_nb} != 0 )); then
                    TRUS_RSLT='FAILED'
                    files_list+="$comp "
                    dif_sum=$(( dif_sum + 10#${dif_nb} ))
                    ## $dif unquoted on purpose: leading blanks are dropped
                    # shellcheck disable=SC2086
                    echo "$file.nc:" $dif
                fi

            else
                TRUS_RSLT='FAILED'
                get_out 7
            fi

        done

    done

    ## List different files for web comment with sum of different records
    if (( dif_sum != 0 )); then
        printf '%s\n' "Restarts: ${files_list}${dif_sum} record(s) differ<br>" \
            >> "temp_${FILE_NOTE}"
    else
        echo 'Same'
    fi
}
	275
## Print and record the elapsed time of the job.
## Globals: TRUS_JTIM (command printing the elapsed time; nothing done when empty),
##          FILE_TIME (appended)
get_time() {
    [[ -z "${TRUS_JTIM}" ]] && return

    ## Interest for checking unusual time computation
    local time_cpu
    time_cpu=$( eval "${TRUS_JTIM}" )

    printf 'Elapsed time: '
    ## Unquoted on purpose: the output of the command is folded on one single line
    # shellcheck disable=SC2086
    echo ${time_cpu} | tee -a "${FILE_TIME}"
}
[5472]	285
## Print and record the maximum memory usage of the job.
## Globals: TRUS_JPME / TRUS_JVME (commands printing the physical / virtual memory;
##          nothing done when both are empty), FILE_MEMY (appended)
get_memy() {
    [[ -z "${TRUS_JPME}" && -z "${TRUS_JVME}" ]] && return

    ## Interest for checking unusual memory usage
    local memory_pmax memory_vmax
    memory_pmax=$( eval "${TRUS_JPME}" )
    memory_vmax=$( eval "${TRUS_JVME}" )

    printf 'Memory max usage (physical/virtual): '
    ## Unquoted on purpose: the outputs of the commands are folded on one single line
    # shellcheck disable=SC2086
    echo ${memory_pmax} / ${memory_vmax} | tee -a "${FILE_MEMY}"
}
	295
## Pick up the first message of a given kind in ocean.output.
## Arguments: $1 - pattern of the message ('E R R O R' or 'W A R N I N G')
## Globals:   FILE_NOTE
## Outputs:   the message on stdout and, followed by '<br>', appended to
##            temp_${FILE_NOTE}; nothing if ocean.output is missing or has no match
comments() {
    local line='' state=$1
    local -a opts

    if [[ -e ocean.output ]]; then
        ## Pattern at beginning of line with 2 following lines by default,
        ## anywhere with 4 following lines for errors
        opts=( -A2 "^ $state" )
        [[ "$state" == 'E R R O R' ]] && opts=( -A4 "$state" )

        ## Select first occurrence for web comment
        line=$( grep -m1 "${opts[@]}" ocean.output | tr -d '\n' )
    fi

    if [[ -n "$line" ]]; then
        ## Unquoted on purpose: blanks are squeezed for the display
        # shellcheck disable=SC2086
        echo $line
        ## The message is data, never a printf format ('%' or '\' may be part of it)
        printf '%s<br>' "$line" \
            >> "temp_${FILE_NOTE}"
    fi
}
	312
## Build the trusting logfile from all the messenger files.
## Globals: FILE_NOTE, TRUS_FILE (overwritten)
## Outputs: content of mesg_*.txt files of current directory joined side by side
##          with ';' on stdout and in ${TRUS_FILE}
log_make() {
    ## Format comments for web: on one single line, without the last '<br>'
    if [[ -e "temp_${FILE_NOTE}" ]]; then
        tr -d '\n' < "temp_${FILE_NOTE}" | sed 's/<br>$//' \
            >> "${FILE_NOTE}"
    fi

    ## Construct txt file with all messenger files
    paste -d ';' mesg_*.txt | tee "${TRUS_FILE}"
}
	321
## Publish the result of the test in production mode.
## Globals: TRUS_PROD (1 for production mode, nothing done otherwise), TRUS_STOR,
##          PATTERNAME, TRUS_FILE, TRUS_MAIL (recipients, optional), TRUS_RSLT,
##          TRUS_RORR, TRUS_CONF, TRUS_REFE, TRUS_BRAN, TRUS_USER, TRUS_HPCC,
##          TRUS_SCRA, TRUS_ARCH
## Outputs: ${TRUS_FILE} (only its last line if the logfile already exists) appended
##          to ${TRUS_STOR}/trusting_${PATTERNAME}.txt; trusting.mail written and
##          sent with `mail` when the test has failed and recipients are given
prod_publish() {
    local rev store_log

    rev=$( awk '/NEMOGCM/ {print $NF}' model.log )

    ## Production mode (-p|--prod)
    [[ "${TRUS_PROD}" -eq 1 ]] || return 0

    ## Create or append trusting logfile (header kept at creation only)
    store_log="${TRUS_STOR}/trusting_${PATTERNAME}.txt"
    if [[ -f "${store_log}" ]]; then
        tail -n 1 "${TRUS_FILE}" \
            >> "${store_log}"
    else
        cat "${TRUS_FILE}" \
            >> "${store_log}"
    fi

    ## Send mail only when FAILED
    if [[ -n "${TRUS_MAIL}" && "${TRUS_RSLT}" == 'FAILED' ]]; then

        ## Content
        cat <<END_MAIL \
            > trusting.mail
Dear all,


The trusting sequence has not completed successfully on new configuration ${TRUS_CONF} based on ${TRUS_REFE}.

Here is the model summary:
$( cat model.log )

First checking would be on the trusting environment files:
${TRUS_USER}.cfg & ${TRUS_HPCC}.cfg

For more details, look into the testing folder at:
${TRUS_SCRA}

An archive has been created to share the questionable configuration for further studies:
${TRUS_STOR}/${TRUS_ARCH}

END_MAIL

        ## Send with detailed subject
        ## (${TRUS_MAIL} unquoted on purpose: it may hold several recipients)
        # shellcheck disable=SC2086
        mail -s "[NEMO Trusting][$rev][${TRUS_BRAN}][${TRUS_REFE}] ${TRUS_RSLT} ${TRUS_RORR}" ${TRUS_MAIL} \
            < trusting.mail
    fi
}
	367
## End the test: set the final state, build the digest, publish it and exit.
## Arguments: $1 - error code (1-2 compilation, 3-4 submission, 5-6 computation,
##                 7-8 results); only read when ${TRUS_RSLT} is 'FAILED', any other
##                 value giving 'Unknown error'
## Globals:   TRUS_RORR (written), TRUS_RSLT, TRUS_SCRA, TRUS_TOUT, TRUS_PROD,
##            TRUS_STOR, TRUS_ARCH, TRUS_NGCM, TRUS_CONF, FILE_RSLT, FILE_STAT
## Outputs:   line 2 of ${FILE_RSLT} and ${FILE_STAT} replaced by result and state;
##            archive of the test created if failed in production mode
## Returns:   never, the shell exits with status 0
get_out() {
    local time_step=0

    TRUS_RORR=$1

    printf '\n\nEnd of test\n'

    ## In case of compilation error
    ## (quoted: an empty ${TRUS_SCRA} must not send the test to ${HOME})
    cd "${TRUS_SCRA}" || printf 'Cannot change directory to %s\n' "${TRUS_SCRA}" >&2

    if [[ "${TRUS_RSLT}" == 'FAILED' ]]; then
        echo 'Failure'

        ## Error identification
        case "${TRUS_RORR}" in
            ## Compilation
            1) TRUS_RORR='XIOS compilation failed'  ;;
            2) TRUS_RORR='NEMO compilation failed'  ;;
            ## Submission
            3) TRUS_RORR='Missing input files'      ;;
            4) TRUS_RORR='Job submission error'     ;;
            ## Computation
            5) TRUS_RORR='Crashed at time step'     ;;
            6) TRUS_RORR='Exceeded time limit'      ;;
            ## Results
            7) TRUS_RORR='Missing previous outputs' ;;
            8) TRUS_RORR='New outputs differ'       ;;
            ## Other (unquoted star: a quoted one only matches a literal '*')
            *) TRUS_RORR='Unknown error'            ;;
        esac

    else
        echo 'Success'
        TRUS_RORR='Code is reliable'
    fi

    ## Eventual comments from ocean.output
    if [[ "${TRUS_RORR}" == 'Crashed at time step' ]]; then
        comments 'E R R O R'
        ## Pattern quoted: it must not be expanded to file names of current directory
        [[ -e time.step ]] && time_step=$( grep -o '[0-9]*' time.step )
        TRUS_RORR+=" ${time_step}"
    else
        comments 'W A R N I N G'
        [[ "${TRUS_RORR}" == 'Exceeded time limit' ]] && TRUS_RORR+=" $(( TRUS_TOUT / 3600 ))h"
    fi

    ## Last messenger files: whole second line replaced by final result and state
    ## (both values are put as is in the sed command: no '/' nor '&' expected)
    #export TRUS_RORR
    sed -i "2 s/.*/${TRUS_RSLT}/" "${FILE_RSLT}"
    sed -i "2 s/.*/${TRUS_RORR}/" "${FILE_STAT}"

    ## Save tested configuration if trusting failed in production mode (-p|--prod)
    if [[ "${TRUS_RSLT}" == 'FAILED' && "${TRUS_PROD}" -eq 1 ]]; then
        echo "Creating archive ${TRUS_ARCH} under ${TRUS_STOR}"
        tar -czf "${TRUS_STOR}/${TRUS_ARCH}" *                    \
            -C "${TRUS_NGCM}/CONFIG/${TRUS_CONF}/MY_SRC" .        \
            -C "${TRUS_NGCM}/CONFIG/${TRUS_CONF}" "cpp_${TRUS_CONF}.fcm"
    fi

    ## Logfile construct & eventual sending of notification email
    printf '\nTrusting digest:\n----------------\n'
    log_make
    prod_publish

    exit 0
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: