#!/bin/bash
## Messenger filenames: one text file per column of the trusting digest.
## The two-digit index fixes the order in which a `mesg_*.txt` glob lists them.
FILE_DATE="mesg_01_date_${PATTERNAME}.txt"
FILE_RSLT="mesg_02_result_${PATTERNAME}.txt"
FILE_STAT="mesg_03_state_${PATTERNAME}.txt"
FILE_NEMO="mesg_04_nemo_${PATTERNAME}.txt"
FILE_XIOS="mesg_05_xios_${PATTERNAME}.txt"
FILE_CMPF="mesg_06_compiler_${PATTERNAME}.txt"
FILE_LMPI="mesg_07_mpi_${PATTERNAME}.txt"
FILE_NCDF="mesg_08_netcdf_${PATTERNAME}.txt"
FILE_INPT="mesg_09_inputs_${PATTERNAME}.txt"
FILE_TIME="mesg_10_time_${PATTERNAME}.txt"
FILE_MEMY="mesg_11_memory_${PATTERNAME}.txt"
FILE_NOTE="mesg_12_comments_${PATTERNAME}.txt"
## Timestamped trusting logfile & archive
TRUS_FILE="trusting_${DATE}_${PATTERNAME}.txt"
TRUS_ARCH="trusting_${DATE}_${PATTERNAME}.tgz"
## Functions in order of use
## Print a step title underlined with dashes, below a "Step....." banner.
## Arguments: $1 - title of the step
## Outputs:   banner, title and underline on stdout
print_step() {
  local title=$1 width underline
  ## Byte count of the title, including the newline appended by `echo`
  width=$( echo "${title}" | wc -c )
  ## Blank string of that width, turned into dashes below
  underline=$( printf "%${width}s" )
  printf "\nStep.....\n%s\n%s\n" "${title}" "${underline// /-}"
}
## (Re)create every messenger file with its column header as first line.
## Globals:   FILE_* (files truncated then written), TRUS_RSLT (read)
## Arguments: none
init_files() {
  ## Column headers; `>` discards any previous content
  echo 'Date'               > "${FILE_DATE}"
  echo 'Result'             > "${FILE_RSLT}"
  echo 'State'              > "${FILE_STAT}"
  echo 'NEMOGCM rev.'       > "${FILE_NEMO}"
  echo 'XIOS rev.'          > "${FILE_XIOS}"
  echo 'Fortran compiler'   > "${FILE_CMPF}"
  echo 'MPI libs'           > "${FILE_LMPI}"
  echo 'NetCDF libs'        > "${FILE_NCDF}"
  echo 'Input files'        > "${FILE_INPT}"
  echo 'Elapsed time'       > "${FILE_TIME}"
  echo 'Memory usage (P/V)' > "${FILE_MEMY}"
  echo 'Comments'           > "${FILE_NOTE}"
  ## Default second line: current ${TRUS_RSLT} value with 'Unknown error' state
  ## (get_out rewrites line 2 of both files at the end of the test)
  echo "${TRUS_RSLT}"  >> "${FILE_RSLT}"
  echo 'Unknown error' >> "${FILE_STAT}"
}
## Append the test date, formatted in UTC, to the date messenger file.
## Globals:   DATE (read, passed to `date -d`), FILE_DATE (appended)
## Arguments: none
get_date() {
  local stamp
  ## UTC time zone for timestamping ("YYYY-MM-DD HH:MM ZONE")
  stamp=$( date -ud "${DATE}" +"%F %R %Z" )
  ## A line is always written, even empty, when `date` rejects ${DATE}
  echo "${stamp}" >> "${FILE_DATE}"
}
## Record the SVN revisions of the NEMO checkout directories and of XIOS.
## Globals:   TRUS_CKOT, TRUS_XIOS (space-separated directory lists, read),
##            TRUS_NGCM (root of the NEMO directories, read),
##            TRUS_SVNA (command line run on each NEMO directory, read),
##            FILE_XIOS, FILE_NEMO (appended)
## Outputs:   name of each processed NEMO directory on stdout;
##            'XIOS <rev>' then 'NEMOGCM <rev>' appended to model.log
get_nemo_rev() {
  local dir rev_loc
  local rev=0
  ## Loop on essential NEMO directories; the lists are left unquoted on purpose
  ## so that each space-separated directory becomes one item
  # shellcheck disable=SC2086
  for dir in ${TRUS_CKOT} ${TRUS_XIOS}; do
    ## For time being, just get revision from XIOS with no action on directory
    if [[ "${dir}" == "${TRUS_XIOS}" ]]; then
      rev_loc=$( svn info "${dir}" | awk '/Last Changed Rev/ {print $NF}' )
      echo "XIOS ${rev_loc}" >> model.log
      echo "${rev_loc}" >> "${FILE_XIOS}"
      continue
    fi
    ## ${TRUS_SVNA} may hold a command with options, hence no quotes around it
    # shellcheck disable=SC2086
    echo "${dir}" && ${TRUS_SVNA} "${TRUS_NGCM}/${dir}"
    rev_loc=$( svn info "${TRUS_NGCM}/${dir}" | awk '/Last Changed Rev/ {print $NF}' )
    ## Keep the highest revision number; skip directories for which `svn info`
    ## gave no usable number instead of failing the numeric comparison
    if [[ "${rev_loc}" =~ ^[0-9]+$ ]] && [ "${rev_loc}" -gt "${rev}" ]; then
      rev=${rev_loc}
    fi
  done
  echo "NEMOGCM ${rev}" >> model.log
  echo "${rev}" >> "${FILE_NEMO}"
}
## Load the computing environment, then log the release of the compiler and of
## the main libraries.
## Globals:   TRUS_ENVI (environment file prefix, read), TRUS_HPCC (read),
##            TRUS_CMPV, TRUS_MPIR, TRUS_CDFR, TRUS_CDOR (software names, read),
##            WRAPPER_LDFLAGS (prefixed when TRUS_HPCC is 'X64_ADA'),
##            FILE_CMPF, FILE_LMPI, FILE_NCDF (appended)
## Outputs:   one '<software> <release>' line per software appended to model.log;
##            lines 3, 4 and 5 of model.log are then copied to the compiler, MPI
##            and NetCDF messenger files
get_soft_rel() {
  local soft_rel str
  ## Sourcing environment
  if [[ -n "${TRUS_ENVI}" ]]; then
    if [[ -e "${TRUS_ENVI}.env" ]] && declare -F | grep -q ' module'; then
      ## .env file if module function is available
      # shellcheck disable=SC1090
      . "${TRUS_ENVI}.env"
    elif [[ -e "${TRUS_ENVI}.path" ]]; then
      ## .path file if existing; its exit status no longer triggers the fallback
      # shellcheck disable=SC1090
      . "${TRUS_ENVI}.path"
    else
      ## If not, the given file
      # shellcheck disable=SC1090
      . "${TRUS_ENVI}"
    fi
  fi
  ## Problem with `prepend-path` of modulefile that use ':' instead of ' ' as delimiter
  if [[ "${TRUS_HPCC}" == 'X64_ADA' ]]; then
    WRAPPER_LDFLAGS='-L/smplocal/pub/IdrMemMPI/1.4/lib -lidrmem '${WRAPPER_LDFLAGS}
  fi
  ## Unquoted on purpose: empty names vanish from the list
  # shellcheck disable=SC2086
  for str in ${TRUS_CMPV} ${TRUS_MPIR} ${TRUS_CDFR} ${TRUS_CDOR}; do
    [[ -z "${str}" ]] && continue
    ## Software release: next word after "$str" in $PATH (case-insensitive);
    ## the whole $PATH is kept when the name is not found in it
    soft_rel=$( echo "${PATH}" | sed "s#.*${str}\([0-9.a-z_]*\).*#\1#i" )
    ## option --version would work for main compilers (gfortran, intel, pgfortran, ...)
    if [[ "${str}" == "${TRUS_CMPV}" ]]; then
      soft_rel=$( "${str}" --version | grep -m1 -oe '\<[0-9. ]*\>' )
    fi
    ## Cleaning characters string to display proper soft name
    str=$( echo "${str}" | sed 's#\\##g; s#[/-]$##' )
    ## ${soft_rel} stays unquoted: `grep -o` may return several matches, one per
    ## line, and model.log must get exactly one line per software
    # shellcheck disable=SC2086
    echo "${str}" ${soft_rel} >> model.log
  done
  sed -n 3p model.log >> "${FILE_CMPF}"
  sed -n 4p model.log >> "${FILE_LMPI}"
  sed -n 5p model.log >> "${FILE_NCDF}"
}
## Bring the input files into the current directory and list them.
## Globals:   TRUS_TARF (archive name, may be empty), TRUS_FORC (inputs directory)
## Outputs:   'Success' on stdout; directory listing in inputs_list.txt
## Exits:     through `get_out 3` when the copy or the extraction fails
get_inputs() {
  local status=0
  ## Extract archive or copy files in case of personal inputs
  if [[ -z "${TRUS_TARF}" ]]; then
    cp "${TRUS_FORC}"/* . > /dev/null || status=$?
  else
    tar -vxf "${TRUS_FORC}/${TRUS_TARF}" > /dev/null || status=$?
  fi
  if (( status != 0 )); then
    get_out 3
  else
    echo 'Success'
  fi
  ## Decompress any gzipped input, at any depth; a no-op when there is none
  find . -name '*.gz' -exec gzip -d {} \;
  ls -lh > inputs_list.txt
}
## Compare the input files of the run with the copies kept in ${TRUS_STOR}.
## Globals:   TRUS_STOR (benchmark directory, read), FILE_INPT (appended),
##            FILE_NOTE (only its name is used, for the temp_ comment file)
## Outputs:   `diff -q` message of each differing file, or 'Same', on stdout;
##            'Same' or 'Different' appended to ${FILE_INPT};
##            list of differing files appended to temp_${FILE_NOTE}
diff_inputs() {
  local dif file
  local files_list='' mesg='Same'
  ## Simple diff; a pattern without match stays literal and is skipped below
  ## because no such file exists in the benchmark directory
  for file in 'inputs_list.txt' *namelist_* *.xml cpp_*; do
    dif=''
    ## Continue even if input file is not in here (see after)
    if [[ -e "${TRUS_STOR}/${file}" ]]; then
      dif=$( diff -q "${file}" "${TRUS_STOR}/${file}" )
    else
      dif=0
    fi
    ## Pass over useless file omission in benchmark directory
    if [[ -n "${dif}" && "${dif}" != '0' ]]; then
      mesg='Different'
      echo "${dif}"
      files_list+="${file} "
    fi
  done
  [[ "${mesg}" == 'Same' ]] && echo "${mesg}"
  echo "${mesg}" >> "${FILE_INPT}"
  ## List different files for web comment
  ## NOTE(review): the note ends with an extra line break; it may stand for a
  ## lost HTML line-break tag - confirm against the web page generator
  if [[ -n "${files_list}" ]]; then
    printf 'Inputs : %sdiffer\n\n' "${files_list}" >> "temp_${FILE_NOTE}"
  fi
}
## Wait for the end of the submitted job, polling its status every 30 seconds.
## Globals:   TRUS_JSTA (command printing something while the job exists),
##            TRUS_TOUT (time limit in seconds), TRUS_JINF, JOB_INFO, JOB_DELE
## Outputs:   a 100-character '#' separator, and the job information if any,
##            appended to computation.log at each poll
## Exits:     through `get_out 6` when the time limit is reached
job_pending() {
  local outline time_elapsed=0 time_increment=30
  outline=$( printf "%100s" )
  sleep "${time_increment}"
  ## Append a log file while pending
  while [[ -n $( eval "${TRUS_JSTA}" ) && "${time_elapsed}" -lt "${TRUS_TOUT}" ]]; do
    printf "\n%s\n" "${outline// /#}" >> computation.log
    ## NOTE(review): the test reads TRUS_JINF but the command run is JOB_INFO;
    ## neither is defined in this part of the file - confirm this is intended
    [[ -n "${TRUS_JINF}" ]] && eval "${JOB_INFO}" >> computation.log
    sleep "${time_increment}"
    time_elapsed=$(( time_elapsed + time_increment ))
  done
  sleep "${time_increment}"
  ## Kill remaining job & stop the test if it's too long.
  ## `-ge` and not `-eq`: the counter grows by steps of 30 and jumps over any
  ## limit that is not a multiple of 30
  if [[ "${time_elapsed}" -ge "${TRUS_TOUT}" ]]; then
    ## NOTE(review): JOB_DELE is not defined in this part of the file - confirm
    eval "${JOB_DELE}" &> /dev/null
    get_out 6
  fi
}
## Compare ocean.output and the *.stat files of the run with the benchmark ones.
## Globals:   TRUS_STOR (benchmark directory, read), TRUS_RSLT (set to 'FAILED'
##            on any difference), FILE_NOTE (name used for the temp_ comment file)
## Outputs:   `diff -q` messages, or 'Same', on stdout;
##            list of differing files appended to temp_${FILE_NOTE}
## Exits:     through `get_out 7` when a benchmark file is missing
diff_results() {
  local file
  local files_list='' mesg='Same'
  ## Simple diff
  ## NOTE(review): without any local .stat file the pattern stays literal and
  ## ends in `get_out 7`; confirm whether a run without .stat files is valid
  for file in 'ocean.output' *.stat; do
    ## Stop if no benchmark files (ocean.output, eventual stat files)
    if [[ ! -e "${TRUS_STOR}/${file}" ]]; then
      TRUS_RSLT='FAILED'
      get_out 7
    fi
    ## Continue even if it differs
    if ! diff -q "${file}" "${TRUS_STOR}/${file}"; then
      TRUS_RSLT='FAILED'
      mesg='Different'
      files_list+="${file} "
    fi
  done
  [[ "${mesg}" == 'Same' ]] && echo "${mesg}"
  ## List different files for web comment
  ## NOTE(review): the note ends with an extra line break; it may stand for a
  ## lost HTML line-break tag - confirm against the web page generator
  if [[ -n "${files_list}" ]]; then
    printf 'Results : %sdiffer\n\n' "${files_list}" >> "temp_${FILE_NOTE}"
  fi
}
## Rebuild the decomposed restart files of the run, then compare the ones written
## at the benchmark time step with the restart files stored in ${TRUS_STOR}.
## Globals:   TRUS_STOR, TRUS_NGCM, TRUS_NPRO, TRUS_CDOD, FILE_NOTE (read);
##            TRUS_RSLT (set to 'FAILED' on any difference or missing file)
## Outputs:   '<file>.nc: <records>' for each differing restart, or 'Same', on stdout;
##            summary of the differing records appended to temp_${FILE_NOTE}
## Exits:     through `get_out 7` (benchmark time.step or restart file missing)
##            or `get_out 8` (no decomposed restart found for a component)
diff_restart() {
local base_name comp dif file list_comp list_tmsp nb_dom time_step tmsp
local files_list='' dif_sum=0
## Stop if no benchmark files (ie time.step)
[ ! -e ${TRUS_STOR}/time.step ] && { TRUS_RSLT='FAILED'; get_out 7; }
## Benchmark time step, stripped of any whitespace
time_step=$( cat ${TRUS_STOR}/time.step | tr -d [:space:] )
## Find all restart files to rebuild
if [ $( find -regex ".*_restart.*[0-9]\.nc" -print -quit ) ]; then
## Prefix of the file names, i.e. what precedes '_<digits>_restart'
base_name=$( find -regex ".*_restart.*[0-9]\.nc" \
| sed "s#^\./\(.*\)_[0-9]*_restart.*#\1#" | sort -u )
## Restart component names ('restart' followed by optional [a-z_] characters)
list_comp=$( find -regex ".*_restart.*[0-9]\.nc" \
| sed "s#^.*\(restart[a-z_]*\)_[0-9].*\.nc#\1#" | sort -u )
## 8-digit time steps found right before '_restart' in the file names
list_tmsp=$( find -regex ".*_restart.*[0-9]\.nc" \
| sed "s#^.*\([0-9]\{8\}\)_restart.*#\1#" | sort -u )
## Loop on each time step
for tmsp in ${list_tmsp}; do
for comp in ${list_comp}; do
file=${base_name}_${tmsp}_${comp}
## Number of decomposed files '<file>_<number>.nc'
nb_dom=$( find -name "${file}_[0-9]*.nc" | wc -l | awk '{ print $1 }' )
if [ ${nb_dom} -gt 1 ]; then
## Several pieces: rebuild them with the NEMO rebuild tool
${TRUS_NGCM}/TOOLS/REBUILD_NEMO/rebuild_nemo -t ${TRUS_NPRO} $file ${nb_dom} \
> /dev/null
## Possibility of remaining decomposed restarts (even after rebuild)
[ $? -eq 0 ] && rm -f ${file}_[0-9]*.nc \
> /dev/null
elif [ ${nb_dom} -eq 0 ]; then
## No piece at all for this time step and component: stop the test
TRUS_RSLT='FAILED' && get_out 8
fi
## A single piece (nb_dom equal to 1) is left untouched
## Compare restart files at same time step
if [ $tmsp -eq ${time_step} ]; then
## Stop if no benchmark files (restart file)
if [ -e ${TRUS_STOR}/$file.nc ]; then
## UNIX `cmp` not suitable (timestamp in .nc file)
## Keep the output lines mentioning 'records', prefix the second one with a
## comma and join them on a single line
## NOTE(review): ${TRUS_CDOD} is presumably a CDO diff command - confirm
dif=$( $TRUS_CDOD $file.nc ${TRUS_STOR}/$file.nc 2> /dev/null \
| awk '/records/ {print $0}' | sed '2 s/^/,/' | tr -d '\n' )
## CDO can return void stdout with no difference
## The first field of the summary is read as the number of differing records
if [[ -n "$dif" && $( echo $dif | awk '{print $1}' ) -ne 0 ]]; then
TRUS_RSLT='FAILED'
files_list+=$comp' ' && let dif_sum+=$( echo $dif | awk '{print $1}' )
echo $file.nc': '$dif
fi
else
TRUS_RSLT='FAILED' && get_out 7
fi
else
## Other time steps are rebuilt only, not compared
continue
fi
done
done
## List different files for web comment with sum of different records
## NOTE(review): the note ends with a raw line break inside the quotes; it may
## stand for a lost HTML line-break tag - confirm
if [ ${dif_sum} -ne 0 ]; then
echo 'Restarts: '${files_list}${dif_sum}' record(s) differ
' \
>> temp_${FILE_NOTE}
else
echo 'Same'
fi
else
## No restart file at all in the run directory
TRUS_RSLT='FAILED'
fi
}
## Report the elapsed time of the job.
## Globals:   TRUS_JTIM (command line printing the elapsed time; nothing is
##            done when empty), FILE_TIME (appended)
## Outputs:   'Elapsed time: <value>' on stdout, <value> appended to ${FILE_TIME}
get_time() {
  [[ -z "${TRUS_JTIM}" ]] && return
  ## Interest for checking unusual time computation
  local time_cpu
  ## Quoted so that the command line is evaluated exactly as written
  time_cpu=$( eval "${TRUS_JTIM}" )
  printf "Elapsed time: "
  ## Unquoted on purpose: squeezes a multi-line answer into one line so that
  ## the messenger file gets exactly one more line
  # shellcheck disable=SC2086
  echo ${time_cpu} | tee -a "${FILE_TIME}"
}
## Report the maximum physical and virtual memory used by the job.
## Globals:   TRUS_JPME, TRUS_JVME (command lines printing the physical and the
##            virtual memory; nothing is done when both are empty),
##            FILE_MEMY (appended)
## Outputs:   'Memory max usage (physical/virtual): <P> / <V>' on stdout,
##            '<P> / <V>' appended to ${FILE_MEMY}
get_memy() {
  [[ -z "${TRUS_JPME}" && -z "${TRUS_JVME}" ]] && return
  ## Interest for checking unusual memory usage
  local memory_pmax memory_vmax
  ## Quoted so that each command line is evaluated exactly as written
  memory_pmax=$( eval "${TRUS_JPME}" )
  memory_vmax=$( eval "${TRUS_JVME}" )
  printf "Memory max usage (physical/virtual): "
  ## Values unquoted on purpose: squeezes multi-line answers into one line
  # shellcheck disable=SC2086
  echo ${memory_pmax}' / '${memory_vmax} | tee -a "${FILE_MEMY}"
}
## Extract the first warning or error message of ocean.output as a comment.
## Globals:   FILE_NOTE (name used for the temp_ comment file)
## Arguments: $1 - pattern looked for, 'W A R N I N G' or 'E R R O R'
## Outputs:   the message with squeezed blanks on stdout;
##            the raw message, on one line, appended to temp_${FILE_NOTE}
comments() {
  local state=$1 line=''
  local -a grep_opts words
  if [[ -e ocean.output ]]; then
    ## 'W A R N I N G' pattern by default: anchored after one leading blank,
    ## with the 2 following lines
    grep_opts=(-A2 "^ ${state}")
    ## Errors: pattern anywhere in the line, with the 4 following lines
    [[ "${state}" == 'E R R O R' ]] && grep_opts=(-A4 "${state}")
    ## Select first occurrence for web comment, joined on a single line
    line=$( grep -m1 "${grep_opts[@]}" ocean.output | tr -d '\n' )
  fi
  if [[ -n "${line}" ]]; then
    ## Squeeze the blanks for the terminal without expanding `*` or `?`
    read -r -a words <<< "${line}"
    printf '%s\n' "${words[*]}"
    ## The message is data, never a printf format: '%' and '\' are kept as is
    ## NOTE(review): the trailing line break may stand for a lost HTML
    ## line-break tag - confirm against the web page generator
    printf '%s\n' "${line}" >> "temp_${FILE_NOTE}"
  fi
}
## Build the trusting digest from the messenger files.
## Globals:   FILE_NOTE (appended), TRUS_FILE (overwritten)
## Outputs:   the messenger files pasted side by side with ';' as separator,
##            on stdout and in ${TRUS_FILE}
log_make() {
  ## Format comments for web: all temporary comments joined on a single line.
  ## The former sed script held a raw line break and was rejected by sed, so
  ## nothing was appended; a trailing line-break tag, if any, is now removed.
  ## NOTE(review): assumes the web separator is the HTML '<br>' tag - confirm
  if [[ -e "temp_${FILE_NOTE}" ]]; then
    tr -d '\n' < "temp_${FILE_NOTE}" | sed 's/<br>$//' >> "${FILE_NOTE}"
  fi
  ## Construct txt file with all messenger files
  paste -d ';' mesg_*.txt | tee "${TRUS_FILE}"
}
## In production mode, store the digest in the benchmark directory and send a
## notification mail when the test failed.
## Globals:   TRUS_PROD (1 for production mode), TRUS_STOR, PATTERNAME, TRUS_FILE,
##            TRUS_MAIL (recipients, no mail when empty), TRUS_RSLT, TRUS_RORR,
##            TRUS_CONF, TRUS_REFE, TRUS_BRAN, TRUS_USER, TRUS_HPCC, TRUS_SCRA,
##            TRUS_ARCH (all read)
## Outputs:   ${TRUS_STOR}/trusting_${PATTERNAME}.txt created or appended;
##            mail body written to trusting.mail before sending
prod_publish() {
  local rev
  ## Revision logged in model.log on the 'NEMOGCM' line
  rev=$( awk '/NEMOGCM/ {print $NF}' model.log )
  ## Production mode (-p|--prod)
  if [[ "${TRUS_PROD}" -eq 1 ]]; then
    ## Create or append trusting logfile: whole digest the first time (header
    ## included), then only its last line
    if [[ -f "${TRUS_STOR}/trusting_${PATTERNAME}.txt" ]]; then
      tail -n 1 "${TRUS_FILE}" >> "${TRUS_STOR}/trusting_${PATTERNAME}.txt"
    else
      cat "${TRUS_FILE}" >> "${TRUS_STOR}/trusting_${PATTERNAME}.txt"
    fi
    ## Send mail only when FAILED
    if [[ -n "${TRUS_MAIL}" && "${TRUS_RSLT}" == 'FAILED' ]]; then
      ## Content: here-document ended by END_MAIL, kept at the left margin
      cat <<END_MAIL > trusting.mail
Dear all,
The trusting sequence has not completed successfully on new configuration ${TRUS_CONF} based on ${TRUS_REFE}.
Here is the model summary:
$(cat model.log)
First checking would be on the trusting environment files:
${TRUS_USER}.cfg & ${TRUS_HPCC}.cfg
For more details, look into the testing folder at:
${TRUS_SCRA}
An archive has been created to share the questionable configuration for further studies:
${TRUS_STOR}/${TRUS_ARCH}
END_MAIL
      ## Send with detailed subject; ${TRUS_MAIL} is unquoted on purpose so
      ## that several space-separated recipients are passed separately
      # shellcheck disable=SC2086
      mail -s "[NEMO Trusting][${rev}][${TRUS_BRAN}][${TRUS_REFE}] ${TRUS_RSLT} ${TRUS_RORR}" ${TRUS_MAIL} \
        < trusting.mail
    fi
  fi
}
## End the test: translate the error code into a state message, update the
## messenger files, archive the configuration if needed, publish and exit.
## Globals:   TRUS_RORR (set from $1, then replaced by the state message),
##            TRUS_RSLT, TRUS_SCRA, TRUS_TOUT, TRUS_PROD, TRUS_STOR, TRUS_ARCH,
##            TRUS_NGCM, TRUS_CONF (read), FILE_RSLT, FILE_STAT (line 2 rewritten)
## Arguments: $1 - error code (1-8, see below); any other value means unknown
## Outputs:   progress messages on stdout
## Returns:   never; always ends the script with `exit 0`
get_out() {
  local time_step=0
  TRUS_RORR=$1
  printf "\n\nEnd of test\n"
  ## In case of compilation error
  cd "${TRUS_SCRA}" || echo "Warning: cannot change directory to ${TRUS_SCRA}" >&2
  if [[ "${TRUS_RSLT}" == 'FAILED' ]]; then
    echo 'Failure'
    ## Error identification
    case "${TRUS_RORR}" in
      ## Compilation
      1) TRUS_RORR='XIOS compilation failed' ;;
      2) TRUS_RORR='NEMO compilation failed' ;;
      ## Submission
      3) TRUS_RORR='Missing input files' ;;
      4) TRUS_RORR='Job submission error' ;;
      ## Computation
      5) TRUS_RORR='Crashed at time step' ;;
      6) TRUS_RORR='Exceeded time limit' ;;
      ## Results
      7) TRUS_RORR='Missing previous outputs' ;;
      8) TRUS_RORR='New outputs differ' ;;
      ## Other: the pattern must stay unquoted to match any other value
      *) TRUS_RORR='Unknown error' ;;
    esac
  else
    echo 'Success'
    TRUS_RORR='Code is reliable'
  fi
  ## Eventual comments from ocean.output
  if [[ "${TRUS_RORR}" == 'Crashed at time step' ]]; then
    comments 'E R R O R'
    ## Pattern quoted so that it cannot be expanded into file names
    [[ -e time.step ]] && time_step=$( grep -o '[0-9]*' time.step )
    TRUS_RORR+=" ${time_step}"
  else
    comments 'W A R N I N G'
    ## Time limit given in hours
    [[ "${TRUS_RORR}" == 'Exceeded time limit' ]] && TRUS_RORR+=" $(( TRUS_TOUT / 3600 ))h"
  fi
  ## Last messenger files: line 2 holds the result and the state
  #export TRUS_RORR
  sed -i "2 s/.*/${TRUS_RSLT}/" "${FILE_RSLT}"
  sed -i "2 s/.*/${TRUS_RORR}/" "${FILE_STAT}"
  ## Save tested configuration if trusting failed in production mode (-p|--prod)
  if [[ "${TRUS_RSLT}" == 'FAILED' && "${TRUS_PROD}" -eq 1 ]]; then
    echo "Creating archive ${TRUS_ARCH} under ${TRUS_STOR}"
    ## Current directory content, plus MY_SRC and the cpp keys of the configuration
    tar -czf "${TRUS_STOR}/${TRUS_ARCH}" * \
      -C "${TRUS_NGCM}/CONFIG/${TRUS_CONF}/MY_SRC" . \
      -C "${TRUS_NGCM}/CONFIG/${TRUS_CONF}" "cpp_${TRUS_CONF}.fcm"
  fi
  ## Logfile construct & eventual sending of notification email
  printf "\nTrusting digest:\n----------------\n"
  log_make
  prod_publish
  exit 0
}