New URL for NEMO forge!   http://forge.nemo-ocean.eu

Since March 2022 along with NEMO 4.2 release, the code development moved to a self-hosted GitLab.
This forge is now archived and remains online for historical reference.
trusting_func.sh in branches/2015/dev_r5092_CNRS_SETTE/NEMOGCM/TRUST – NEMO

source: branches/2015/dev_r5092_CNRS_SETTE/NEMOGCM/TRUST/trusting_func.sh @ 5696

Last change on this file since 5696 was 5696, checked in by nicolasmartin, 9 years ago

dev_r5092_CNRS_SETTE Improve trusting output readability; use command blocks instead of multiple '&&'

  • Property eol-style set to native
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Rev URL
File size: 11.7 KB
Line 
1#!/bin/bash
2
3
## Messenger files: each one holds a single column of the trusting digest.
## The numeric infix fixes the order in which the 'mesg_*.txt' glob lists them.
FILE_DATE="mesg_01_date_${PATTERNAME}.txt"
FILE_STAT="mesg_02_status_${PATTERNAME}.txt"
FILE_RESU="mesg_03_result_${PATTERNAME}.txt"
FILE_NEMO="mesg_04_nemogcm_${PATTERNAME}.txt"
FILE_XIOS="mesg_05_xios_${PATTERNAME}.txt"
FILE_COMP="mesg_06_compiler_${PATTERNAME}.txt"
FILE_MPIN="mesg_07_mpi_${PATTERNAME}.txt"
FILE_NCDF="mesg_08_netcdf_${PATTERNAME}.txt"
FILE_INPT="mesg_09_inputs_${PATTERNAME}.txt"
FILE_CPUT="mesg_10_cputime_${PATTERNAME}.txt"
FILE_COMM="mesg_11_comments_${PATTERNAME}.txt"

## Outcome file and archive, both stamped with ${DATE}
FILE_TRUS="trusting_${DATE}_${PATTERNAME}.txt"
FILE_ARCH="trusting_${DATE}_${PATTERNAME}.tar.gz"
14
15
## Print a step banner on stdout: an empty line, 'Step.....', then the message.
##   $1 - message to display. It is passed as a printf argument, not as the
##        format, so a '%' in it is printed literally; backslash escapes such
##        as '\n' are still expanded (%b), as they were before.
print_step() { printf '\nStep.....\n%b\n' "$1"; }
17
18
## (Re)create the eleven messenger files, each starting with its column title.
## Globals: FILE_DATE FILE_STAT FILE_RESU FILE_NEMO FILE_XIOS FILE_COMP
##          FILE_MPIN FILE_NCDF FILE_INPT FILE_CPUT FILE_COMM (read), ST (read)
## Every target is quoted so that a name containing whitespace no longer
## triggers an "ambiguous redirect".
init_files() {
    echo 'Date'             > "${FILE_DATE}"
    echo 'Status'           > "${FILE_STAT}"
    echo 'Result'           > "${FILE_RESU}"
    echo 'NEMOGCM rev.'     > "${FILE_NEMO}"
    echo 'XIOS rev.'        > "${FILE_XIOS}"
    echo 'Fortran compiler' > "${FILE_COMP}"
    echo 'MPI libs'         > "${FILE_MPIN}"
    echo 'NetCDF libs'      > "${FILE_NCDF}"
    echo 'Input files'      > "${FILE_INPT}"
    echo 'Real CPU time'    > "${FILE_CPUT}"
    echo 'Comments'         > "${FILE_COMM}"

    ## Provisional second line: current $ST with 'Unknown error'
    printf '%s\n' "${ST}"   >> "${FILE_STAT}"
    echo 'Unknown error'    >> "${FILE_RESU}"
}
33
34
## Append the test date, converted to UTC ("YYYY-MM-DD HH:MM UTC"), to the
## date messenger file.
## Globals: DATE (read, any string accepted by `date -d`), FILE_DATE (read)
## A line is always appended (empty if `date` cannot parse ${DATE}).
get_date() {
    ## Declared apart from the assignment so `local` does not hide date's status
    local stamp
    stamp=$( date -ud "${DATE}" +"%F %R %Z" )

    printf '%s\n' "${stamp}" >> "${FILE_DATE}"
}
42
43
## Update the essential NEMO directories and record the code revisions.
## Globals: NEMO_ARCH NEMO_CONF NEMO_ENGI NEMO_EAGR NEMO_EIOI NEMO_EFCM
##          NEMO_TCMP NEMO_TRBD DIR_XIOS (read, working-copy paths),
##          SVN_CMD (read, command run on each NEMO directory),
##          FILE_XIOS FILE_NEMO (appended)
## Outputs: each NEMO directory name on stdout, followed by SVN_CMD's output;
##          'XIOS <rev>' then 'NEMOGCM <rev>' appended to model.log;
##          an HTML changeset link appended to each messenger file.
## The revision is the "Last Changed Rev" field of `svn info`. It is selected
## by label (under the C locale) instead of by line number, because the
## position of that line depends on the Subversion client version.
get_nemo_rev() {
    local dir rev_loc
    local rev=0

    ## Loop on essential directories (XIOS last)
    for dir in "${NEMO_ARCH}" "${NEMO_CONF}" "${NEMO_ENGI}" \
               "${NEMO_EAGR}" "${NEMO_EIOI}" "${NEMO_EFCM}" \
               "${NEMO_TCMP}" "${NEMO_TRBD}"                \
               "${DIR_XIOS}"; do

        ## Unset directories are simply left out
        [[ -n ${dir} ]] || continue

        ## For time being, no action on the XIOS directory: revision only
        if [[ ${dir} != "${DIR_XIOS}" ]]; then
            ## SVN_CMD may carry options: it has to be word-split
            # shellcheck disable=SC2086
            echo "${dir}" && ${SVN_CMD} "${dir}"
        fi

        rev_loc=$( LC_ALL=C svn info "${dir}" \
                   | awk '/^Last Changed Rev:/ {print $NF; exit}' )

        if [[ ${dir} == "${DIR_XIOS}" ]]; then
            printf 'XIOS %s\n' "${rev_loc}" >> model.log
            echo "<a href=\"https://forge.ipsl.jussieu.fr/ioserver/changeset/${rev_loc}\" target=\"_blank\">${rev_loc}</a>" \
                >> "${FILE_XIOS}"
            continue
        fi

        ## Keep the highest revision number; ignore an empty or non-numeric answer
        if [[ ${rev_loc} =~ ^[0-9]+$ ]] && [ "${rev_loc}" -gt "${rev}" ]; then
            rev=${rev_loc}
        fi
    done

    printf 'NEMOGCM %s\n' "${rev}" >> model.log
    echo "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/${rev}\" target=\"_blank\">${rev}</a>" \
        >> "${FILE_NEMO}"
}
76
77
## Record the release of each third-party tool (CDO, compiler, MPI, NetCDF).
## Globals: CDO COMPILER MPI NETCDF (read, search strings), IMOD (read, 1 when
##          a modules system is used), LOADEDMODULES / PATH (read),
##          FILE_COMP FILE_MPIN FILE_NCDF (appended)
## Outputs: one '<tool> <release>' line per tool appended to model.log, then
##          lines 4, 5 and 6 of model.log copied to the compiler, MPI and
##          NetCDF messenger files (this presumes model.log already holds the
##          two revision lines and that all four tools are set).
## When a tool cannot be located the release is left empty; previously the
## unmatched sed printed the complete module list or PATH as the "release".
get_soft_rel() {
    local soft soft_rel

    ## Deliberate word splitting: an unset tool drops out of the list
    # shellcheck disable=SC2086
    for soft in $CDO $COMPILER $MPI $NETCDF; do
        soft_rel=''

        ## Software strings to identify depends on use (sed is case-insensitive)
        if [[ ${IMOD} -eq 1 ]]; then
            ## Modules system: next word after "$soft/"
            ## LOADEDMODULES=netcdf/mpi/4.1.3:cdo/1.5.9:intel/2013.0
            ## LOADEDMODULES=bullxmpi/1.2.8.2:netcdf/4.3.3.1_hdf5_parallel:cdo/1.6.7
            soft_rel=$( printf '%s\n' "${LOADEDMODULES}" \
                        | sed -n "s#.*${soft}/\([0-9.a-z_]*\).*#\1#ip" )
        else
            ## Hard-coded paths: next word after "$soft"
            ## PATH=/smplocal/intel/impi/4.1.0.024/intel64/bin:/smplocal/pub/cdo/1.5.9/bin:/smplocal/pub/NetCDF/4.1.3/mpi/bin
            ## PATH=/usr/local/netcdf-4.3.3.1_hdf5_parallel/bin:/opt/mpi/bullxmpi/1.2.8.2/bin:/usr/local/netcdf-utils-4.3.3.1_hdf5/bin
            soft_rel=$( printf '%s\n' "${PATH}" \
                        | sed -n "s#.*${soft}\([0-9.a-z_]*\).*#\1#ip" )
        fi

        ## option --version would work for main compilers (gfortran, intel, pgfortran, ...)
        if [[ ${soft} == "${COMPILER}" ]]; then
            soft_rel=$( "${soft}" --version | grep -m1 -oe '\<[0-9. ]*\>' )
        fi

        ## Cleaning characters string to display proper soft name
        soft=$( printf '%s\n' "${soft}" | sed 's#\\##g; s#[/-]$##' )

        ## Unquoted on purpose: folds a multi-line or padded release onto one
        ## line, so that each tool takes exactly one line of model.log
        # shellcheck disable=SC2086
        echo ${soft} ${soft_rel} >> model.log
    done

    sed -n 4p model.log >> "${FILE_COMP}"
    sed -n 5p model.log >> "${FILE_MPIN}"
    sed -n 6p model.log >> "${FILE_NCDF}"
}
114
115
## Bring the input files into the current directory and list them.
## Globals: NEMO_FORC (read, inputs directory),
##          NEMO_TARF (read, archive name inside NEMO_FORC; empty for personal
##          inputs, i.e. loose files)
## Outputs: inputs_list.txt (archive listing, or `ls` of the loose files)
## Returns: status of the extraction / copy
## Commands are run directly instead of being stored in strings: the former
## "\cp ..." string was executed as a command literally named '\cp', so loose
## files were never copied, and any path holding a space was split.
get_inputs() {
    if [[ -z ${NEMO_TARF} ]]; then
        ## List & copy files in case of personal inputs
        ls "${NEMO_FORC}"/*           > inputs_list.txt
        command cp "${NEMO_FORC}"/* . > /dev/null
    else
        ## List archive content & extract it by default
        tar -tvf "${NEMO_FORC}/${NEMO_TARF}" > inputs_list.txt
        tar -vxf "${NEMO_FORC}/${NEMO_TARF}" > /dev/null
    fi
}
126
127
## Compare the input files of this run with those of the benchmark directory.
## Globals: REFE_DIR (read), FILE_INPT FILE_COMM (read, names of the files
##          appended)
## Outputs: `diff -q` message of each differing file, or 'Same', on stdout;
##          'Same' / 'Different' appended to ${FILE_INPT};
##          when files differ, their list appended to temp_${FILE_COMM}.
## Returns: status of the last test, i.e. non-zero when no file differs.
## Paths are quoted: with a space in REFE_DIR the former unquoted tests failed
## and every file was silently reported as identical.
diff_inputs() {
    local dif file
    local files_list='' mesg='Same'

    ## Simple diff (an unmatched pattern stays literal and is skipped below)
    for file in 'inputs_list.txt' *namelist_* *.xml cpp_*; do

        ## Pass over files that the benchmark directory does not hold
        [[ -e ${REFE_DIR}/${file} ]] || continue

        dif=$( diff -q "${file}" "${REFE_DIR}/${file}" )

        if [[ -n ${dif} ]]; then
            mesg='Different'
            printf '%s\n' "${dif}"
            files_list+="${file} "
        fi
    done

    [[ ${mesg} == 'Same' ]] && echo "${mesg}"
    echo "${mesg}" >> "${FILE_INPT}"

    ## List different files for web comment
    [[ -n ${files_list} ]] && echo "Inputs  : ${files_list}differ<br>" \
        >> "temp_${FILE_COMM}"
}
154
155
## Wait for the submitted job, logging its state every 30 seconds.
## Globals: JOB_LIST (read, command listing jobs), JOB_INFO (read, command
##          describing one job), JOB_DELE (read, command deleting one job),
##          JOB_ID (read), TIME_LIMI (read, limit compared with the elapsed
##          seconds counted here)
## Outputs: a '#' ruler and the job description appended to computation.log at
##          each poll.
## Returns: status of the final test (non-zero when the limit was not
##          reached); calls `get_out 6` after deleting the job otherwise.
## The limit is tested with '-ge': the counter moves in steps of 30, so with
## '-eq' a limit that is not a multiple of 30 was stepped over and the job was
## never deleted.
job_pending() {
    local outline time_elapsed=0 time_increment=30
    outline=$( printf '%100s' )

    sleep "${time_increment}"

    ## Append a log file while pending
    ## JOB_LIST / JOB_INFO / JOB_DELE may carry options: they must be word-split
    # shellcheck disable=SC2086
    while [[ -n $( ${JOB_LIST} | grep "${JOB_ID}" ) && ${time_elapsed} -lt ${TIME_LIMI} ]]; do
        {
            printf '\n%s\n' "${outline// /#}"
            ${JOB_INFO} "${JOB_ID}"
        } >> computation.log
        sleep "${time_increment}"
        time_elapsed=$(( time_elapsed + time_increment ))
    done

    sleep "${time_increment}"

    ## Kill remaining job & stop the test if it's too long
    # shellcheck disable=SC2086
    [[ ${time_elapsed} -ge ${TIME_LIMI} ]] && { ${JOB_DELE} "${JOB_ID}" &> /dev/null; get_out 6; }
}
176
177
## Compare ocean.output and the *.stat files with the benchmark ones.
## Globals: REFE_DIR (read), FILE_COMM (read), ST (exported as 'FAILED' on any
##          difference or missing benchmark file)
## Outputs: `diff -q` messages, or 'Same', on stdout; list of differing files
##          appended to temp_${FILE_COMM}.
## Returns: status of the last test (non-zero when nothing differs); calls
##          `get_out 7` when a benchmark file is missing.
## Stat files are optional: when neither this run nor the benchmark holds any,
## the unmatched '*.stat' pattern is skipped instead of being looked up
## literally (which stopped the test with error 7). Stat files present only in
## the benchmark are still reported as a difference.
diff_results() {
    local file
    local files_list='' mesg='Same'
    local -a ref_stats

    ## Simple diff
    for file in 'ocean.output' *.stat; do

        ## Pattern left as is by the shell: this run produced no stat file
        if [[ ${file} == '*.stat' && ! -e ${file} ]]; then
            ref_stats=( "${REFE_DIR}"/*.stat )
            [[ -e ${ref_stats[0]} ]] || continue
            export ST='FAILED'; mesg='Different'; files_list+="${file} "
            continue
        fi

        ## Stop if no benchmark files (ocean.output, eventual stat files)
        [[ -e ${REFE_DIR}/${file} ]] || { export ST='FAILED'; get_out 7; }

        ## Continue even if it differs
        if ! diff -q "${file}" "${REFE_DIR}/${file}"; then
            export ST='FAILED'; mesg='Different'; files_list+="${file} "
        fi
    done

    [[ ${mesg} == 'Same' ]] && echo "${mesg}"

    ## List different files for web comment
    [[ -n ${files_list} ]] && echo "Results : ${files_list}differ<br>" \
        >> "temp_${FILE_COMM}"
}
200
201
## Rebuild the restart files of this run and compare those written at the
## benchmark time step with the benchmark ones.
## Globals: REFE_DIR (read; must hold time.step and the reference restarts),
##          NEMO_TRBD (read, directory of rebuild_nemo), NPROC (read),
##          CDOD (read, comparison command, run as: $CDOD <file> <reference>),
##          FILE_COMM (read), ST (exported as 'FAILED' on any problem)
## Outputs: '<file>.nc: <differing records>' per differing restart, or 'Same',
##          on stdout; summary appended to temp_${FILE_COMM}.
## Calls `get_out 7` when a benchmark file is missing and `get_out 8` when no
## domain file is found for an expected restart.
## Changes from the former version: the tr character class and every path are
## quoted (the class was an unquoted glob; a reference directory holding a
## space broke each test) and `let` is replaced by $(( )).
diff_restart() {
    local base_name comp dif file list_comp list_tmsp nb_dom nb_rec time_step tmsp
    local files_list='' dift=0

    ## Stop if no benchmark files (ie time.step)
    [[ -e ${REFE_DIR}/time.step ]] || { export ST='FAILED'; get_out 7; }
    time_step=$( tr -d '[:space:]' < "${REFE_DIR}/time.step" )

    ## Find all restart files to rebuild
    if [[ -n $( find . -regex ".*_restart.*[0-9]\.nc" -print -quit ) ]]; then
        base_name=$( find . -regex ".*_restart.*[0-9]\.nc" \
                     | sed "s#^\./\(.*\)_[0-9]*_restart.*#\1#"       | sort -u )
        list_comp=$( find . -regex ".*_restart.*[0-9]\.nc" \
                     | sed "s#^.*\(restart[a-z_]*\)_[0-9].*\.nc#\1#" | sort -u )
        list_tmsp=$( find . -regex ".*_restart.*[0-9]\.nc" \
                     | sed "s#^.*\([0-9]\{8\}\)_restart.*#\1#"       | sort -u )

        ## Loop on each time step, then on each component (both lists are split on purpose)
        for tmsp in ${list_tmsp}; do
            for comp in ${list_comp}; do
                file=${base_name}_${tmsp}_${comp}
                nb_dom=$( find . -name "${file}_[0-9]*.nc" | wc -l | awk '{ print $1 }' )

                if   [ "${nb_dom}" -gt 1 ]; then
                    ## Domain files are removed only after a successful rebuild
                    if "${NEMO_TRBD}/rebuild_nemo" -t "${NPROC}" "${file}" "${nb_dom}" > /dev/null; then
                        rm -f "${file}"_[0-9]*.nc > /dev/null
                    fi
                elif [ "${nb_dom}" -eq 0 ]; then
                    export ST='FAILED'; get_out 8
                fi

                ## Compare restart files at same time step only.
                ## `[ -eq ]` is kept deliberately: it reads the zero-padded
                ## step as decimal, whereas [[ ]] / (( )) would take it as octal.
                [ "${tmsp}" -eq "${time_step}" ] || continue

                ## Stop if no benchmark files (restart file)
                [[ -e ${REFE_DIR}/${file}.nc ]] || { export ST='FAILED'; get_out 7; }

                ## UNIX `cmp` not suitable (timestamp in .nc file)
                ## CDOD may carry an operator: it has to be word-split
                # shellcheck disable=SC2086
                dif=$( ${CDOD} "${file}.nc" "${REFE_DIR}/${file}.nc" 2> /dev/null \
                       | awk '/records/ {print $0}' | sed '2 s/^/,/' | tr -d '\n' )

                ## CDO can return void stdout with no difference
                nb_rec=$( printf '%s\n' "${dif}" | awk '{print $1}' )
                if [[ -n ${dif} && ${nb_rec} -ne 0 ]]; then
                    export ST='FAILED'
                    files_list+="${file} "
                    dift=$(( dift + nb_rec ))
                    ## ${dif} unquoted on purpose: drops the column padding
                    # shellcheck disable=SC2086
                    echo "${file}.nc: "${dif}
                fi
            done
        done

        ## List different files for web comment with sum of different parameters
        if [ "${dift}" -ne 0 ]; then
            echo "Restarts: ${files_list}${dift} record(s) differ<br>" \
                >> "temp_${FILE_COMM}"
        else
            echo 'Same'
        fi

    else
        ## No restart file at all
        export ST='FAILED'
    fi
}
273
274
## Record the real CPU time of the job (interest for checking unusual time
## computing).
## Globals: JOB_TIME (read, shell command line evaluated with eval; it comes
##          from the site configuration, never feed it untrusted text),
##          FILE_CPUT (appended)
## Outputs: the value on stdout and appended to ${FILE_CPUT}
## JOB_TIME is now evaluated as one quoted string, so spacing and wildcard
## characters inside it reach eval untouched.
get_cpu_time() {
    local real_cpu_time
    real_cpu_time=$( eval "${JOB_TIME}" )

    ## Unquoted on purpose: a multi-line answer is folded onto a single line,
    ## so the messenger file gains exactly one line
    # shellcheck disable=SC2086
    echo ${real_cpu_time} | tee -a "${FILE_CPUT}"
}
281
282
## Extract the first message of a given kind from ocean.output for the web
## comment.
##   $1 - message tag: 'E R R O R' (matched anywhere, 4 following lines kept)
##        or any other tag such as 'W A R N I N G' (matched after a leading
##        space at line start, 2 following lines kept)
## Globals: FILE_COMM (read)
## Outputs: the message on stdout with blanks squeezed; the raw message
##          followed by '<br>' appended to temp_${FILE_COMM} (a lone '<br>'
##          when ocean.output is missing or holds no such message).
## The message is never used as a printf format nor passed through eval, and
## is not subject to pathname expansion: '%', '\' and '*' found in
## ocean.output are reproduced as they are.
comments() {
    local line='' state=$1
    local -a grep_opts words

    if [[ -e ocean.output ]]; then
        ## 'W A R N I N G' by default
        grep_opts=( -A2 "^ ${state}" )
        [[ ${state} == 'E R R O R' ]] && grep_opts=( -A4 "${state}" )

        ## Select first occurrence for web comment
        line=$( grep -m1 "${grep_opts[@]}" ocean.output | tr -d '\n' )

        ## Squeeze blanks for the terminal without globbing the words
        read -r -a words <<< "${line}"
        printf '%s\n' "${words[*]}"
    fi

    printf '%s<br>' "${line}" >> "temp_${FILE_COMM}"
}
300
301
## Assemble the trusting digest from the messenger files.
## Globals: FILE_COMM (appended), FILE_TRUS (written)
## Outputs: the digest (messenger files pasted side by side, ';' separated)
##          on stdout and in ${FILE_TRUS}
## Returns: status of the paste | tee pipeline
## File names are quoted so that a name holding whitespace is handled.
mesg_make() {
    ## Format comments for web: one line, without the trailing '<br>'
    if [[ -e temp_${FILE_COMM} ]]; then
        tr -d '\n' < "temp_${FILE_COMM}" | sed 's/<br>$//' >> "${FILE_COMM}"
    fi

    ## Construct txt file for web with all messenger files
    paste -d ';' mesg_*.txt | tee "${FILE_TRUS}"
}
310
311
## Publish the digest in production mode (-p|--publish): append it to the
## historical file of the benchmark directory and mail it when the test failed.
## Globals: PUBLISH (read, 1 to publish), REFE_DIR PATTERNAME FILE_TRUS
##          TEST_DIR FILE_ARCH REFE_CONF NEMO_HPCC NEMO_BRAN USER (read),
##          EMAIL (read, recipient(s); no mail when empty), ST ERR (read)
## Outputs: ${REFE_DIR}/trusting_${PATTERNAME}.txt (created with the whole
##          digest, or extended with its last line); trusting.mail
## Returns: 0 when not publishing, otherwise the status of the last command
## Paths are quoted so that directories holding a space work.
mesg_publish() {
    local history

    ## Production mode (-p|--publish)
    [[ ${PUBLISH} -eq 1 ]] || return 0

    ## Create or append historical trusting file
    history="${REFE_DIR}/trusting_${PATTERNAME}.txt"
    if [[ -f ${history} ]]; then
        tail -n 1 "${FILE_TRUS}" >> "${history}"
    else
        cat "${FILE_TRUS}"       >> "${history}"
    fi

    ## Send mail only when FAILED
    if [[ -n ${EMAIL} && ${ST} == 'FAILED' ]]; then

        ## Content
        cat > trusting.mail <<END_MAIL
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

Dear $USER,

The trusting sequence for ${REFE_CONF} has failed.
Directory: ${TEST_DIR}
Archive created: ${FILE_ARCH} in ${REFE_DIR}

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

$(cat "${TEST_DIR}/${FILE_TRUS}")
END_MAIL

        ## Send with detailed subject (EMAIL unquoted: may list several recipients)
        # shellcheck disable=SC2086
        mail -s "[NEMO Trusting ${NEMO_HPCC} ${REFE_CONF} ${NEMO_BRAN}] ${ST} ${ERR}" ${EMAIL} \
            < trusting.mail
    fi
}
350
351
## Close the trusting test: translate the error code, finalise the messenger
## files, archive the configuration if needed, publish the digest and exit.
##   $1 - error code (1-8, see the case below); ignored unless ST is 'FAILED'
## Globals: ST (read), ERR (set and exported), TEST_DIR (read, directory
##          entered first), FILE_STAT FILE_RESU (second line rewritten),
##          PUBLISH REFE_DIR FILE_ARCH NEMO_HOME TEST_CONF (read),
##          TIME_LIMIT or, failing that, TIME_LIMI (read, seconds)
## Never returns: always ends the script with `exit 0`.
## Fixes: the default case arm was the quoted pattern "*", which only matches
## a literal asterisk, so unknown codes were published as raw numbers; the
## time limit fell back on nothing when TIME_LIMIT was unset (arithmetic
## error), it now falls back on TIME_LIMI, the name job_pending reads.
get_out() {
    local time_step=0 time_limit

    ERR=$1

    print_step 'End of trusting test'

    ## In case of compilation error (the test goes on even if cd fails)
    cd "${TEST_DIR}" || echo "get_out: cannot enter '${TEST_DIR}'" >&2

    if [[ ${ST} == 'FAILED' ]]; then
        echo 'Failure'

        ## Error identification
        case "${ERR}" in
            ## Compilation
            1) ERR='XIOS compilation failed'  ;;
            2) ERR='NEMO compilation failed'  ;;
            ## Submission
            3) ERR='Missing input files'      ;;
            4) ERR='Job submission error'     ;;
            ## Running
            5) ERR='Crashed at time step'     ;;
            6) ERR='Exceeded time limit'      ;;
            ## Results
            7) ERR='Missing previous outputs' ;;
            8) ERR='New outputs differ'       ;;
            ## Undefined
            *) ERR='Unknown error'            ;;
        esac
    else
        echo 'Success'
        ERR='Code is reliable'
    fi

    ## Eventual comments from ocean.output
    if [[ ${ERR} == 'Crashed at time step' ]]; then
        comments 'E R R O R'
        [[ -e time.step ]] && time_step=$( tr -d ' ' < time.step )
        ERR+=" ${time_step}"
    else
        comments 'W A R N I N G'
        if [[ ${ERR} == 'Exceeded time limit' ]]; then
            time_limit=${TIME_LIMIT:-${TIME_LIMI:-0}}
            ERR+=" $(( time_limit / 3600 ))h"
        fi
    fi

    ## Last messenger files
    export ERR
    sed -i "2 s/.*/${ST}/"  "${FILE_STAT}"
    sed -i "2 s/.*/${ERR}/" "${FILE_RESU}"

    ## Save tested configuration if trusting failed in production mode (-p|--publish)
    if [[ ${ST} == 'FAILED' && ${PUBLISH} -eq 1 ]]; then
        echo "Creating archive ${FILE_ARCH} under ${REFE_DIR}"
        tar -czf "${REFE_DIR}/${FILE_ARCH}" * "${NEMO_HOME}/CONFIG/${TEST_CONF}/MY_SRC" \
                                              "${NEMO_HOME}/CONFIG/${TEST_CONF}/WORK"
    fi

    ## Share trusting result (.txt file/mail)
    print_step 'Trusting digest'
    mesg_make
    mesg_publish

    exit 0
}
Note: See TracBrowser for help on using the repository browser.