New URL for NEMO forge!   http://forge.nemo-ocean.eu

In March 2022, along with the NEMO 4.2 release, code development moved to a self-hosted GitLab.
This forge is now archived and remains online for historical reference.
trusting_func.sh in branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST – NEMO

source: branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/trusting_func.sh @ 5788

Last change on this file since 5788 was 5788, checked in by nicolasmartin, 9 years ago

dev_r5092_CNRS18_TRUST Several enhancements (global variables rename, notification mail object, add job performances, templates description, script to install new minimal branch to test) to improve accessibility for NEMO users & bugfixes (tar creation, modulefiles)

  • Property eol-style set to native
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Rev URL
File size: 12.0 KB
Line 
1#!/bin/bash
2
3
4## Messenger filenames
5FILE_DATE=mesg_01_date_$PATTERNAME.txt  ; FILE_TRUS_RSLT=mesg_02_result_$PATTERNAME.txt
6FILE_STAT=mesg_03_state_$PATTERNAME.txt ; FILE_NEMO=mesg_04_nemo_$PATTERNAME.txt
7FILE_XIOS=mesg_05_xios_$PATTERNAME.txt  ; FILE_CMPF=mesg_06_compiler_$PATTERNAME.txt
8FILE_LMPI=mesg_07_mpi_$PATTERNAME.txt   ; FILE_NCDF=mesg_08_netcdf_$PATTERNAME.txt
9FILE_INPT=mesg_09_inputs_$PATTERNAME.txt; FILE_TIME=mesg_10_time_$PATTERNAME.txt
10FILE_MEMY=mesg_11_memory_$PATTERNAME.txt; FILE_NOTE=mesg_12_comments_$PATTERNAME.txt
11
12## Timestamped logfile & archive filenames
13FILE_TRUS=trusting_${DATE}_$PATTERNAME.txt; FILE_ARCH=trusting_${DATE}_$PATTERNAME.tgz
14
15
16## Functions in order of use
17print_step() {
18    local char_nb=$( echo "$1" | wc -c )
19    local outline=$( printf "%${char_nb}s" )
20
21    printf "\nStep.....\n%s\n%s\n" "$1" ${outline// /-}
22}
23
24init_files() {
25    echo 'Date'               > ${FILE_DATE}; echo 'Result'           > ${FILE_TRUS_RSLT}
26    echo 'State'              > ${FILE_STAT}; echo 'NEMOGCM rev.'     > ${FILE_NEMO}
27    echo 'XIOS rev.'          > ${FILE_XIOS}; echo 'Fortran compiler' > ${FILE_CMPF}
28    echo 'MPI libs'           > ${FILE_LMPI}; echo 'NetCDF libs'      > ${FILE_NCDF}
29    echo 'Input files'        > ${FILE_INPT}; echo 'Elapsed time'     > ${FILE_TIME}
30    echo 'Memory usage (P/V)' > ${FILE_MEMY}; echo 'Comments'         > ${FILE_NOTE}
31
32    ## 'Failed' status with 'Unknown error' by default
33    echo $TRUS_RSLT           \
34   >> ${FILE_TRUS_RSLT}
35    echo 'Unknown error' \
36   >> ${FILE_STAT}
37}
38
39get_date() {
40    ## UTC time zone for timestamping
41    local dat=$( date -ud "${DATE}" +"%F %R %Z" )
42
43    echo $dat           \
44   >> ${FILE_DATE}
45}
46
47get_nemo_rev() {
48    local dir rev_loc
49    local rev=0 list=( 'ARCH CONFIG NEMO EXTERNAL/AGRIF EXTERNAL/IOIPSL EXTERNAL/fcm TOOLS/COMPILE TOOLS/REBUILD_NEMO' )
50
51    ## Loop on essential NEMO directories
52    for dir in $list ${DIR_XIOS}; do
53
54   ## For time being, just get revision from XIOS with no action on directory
55   if [ $dir == ${DIR_XIOS} ]; then
56       rev_loc=$( svn info $dir | awk '(NR == 9) {print $NF}' )
57       echo 'XIOS '${rev_loc} \
58      >> model.log
59       echo "<a href=\"https://forge.ipsl.jussieu.fr/ioserver/changeset/${rev_loc}\" target=\"_blank\">${rev_loc}</a>" \
60      >> ${FILE_XIOS}
61       continue
62   fi
63
64   echo $dir && ${SVN_CMD} ${TRUS_WKCY}/$dir
65   rev_loc=$( svn info ${TRUS_WKCY}/$dir | awk '(NR == 9) {print $NF}' )
66
67   ## Keep last rev. nb
68   [ ${rev_loc} -gt $rev ] && rev=${rev_loc}
69    done
70
71    echo 'NEMOGCM '$rev \
72   >> model.log
73    echo "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/$rev\" target=\"_blank\">$rev</a>" \
74   >> ${FILE_NEMO}
75}
76
77get_soft_rel() {
78    local soft soft_rel
79
80    ## Sourcing environment modulefile only if module function is set
81    [[ -e ${ARCH_ENV} && $( declare -F | grep ' module' ) ]] && . ${ARCH_ENV}
82
83    for soft in $CMPF ${STR_CDOD} ${STR_LMPI} ${STR_NCDF}; do
84   soft_rel=''
85
86   ## Software release: next word after "$soft" in $PATH (case-insensitive)
87   soft_rel=$( echo $PATH | sed "s#.*$soft\([0-9.a-z_]*\).*#\1#i" )
88
89   ## option --version would work for main compilers (gfortran, intel, pgfortran, ...)
90   [ $soft == $COMPILER ] && soft_rel=$( $soft --version | grep -m1 -oe '\<[0-9. ]*\>' )
91
92   ## Cleaning characters string to display proper soft name
93   soft=$( echo $soft | sed 's#\\##g; s#[/-]$##' )
94
95   echo $soft ${soft_rel} \
96       >> model.log
97    done
98
99    sed -n 4p model.log \
100   >> ${FILE_CMPF}
101    sed -n 5p model.log \
102   >> ${FILE_LMPI}
103    sed -n 6p model.log \
104   >> ${FILE_NCDF}
105}
106
107get_inputs() {
108    # List archive content & extract it by default
109    local cmd_iol="tar -tvf ${TRUS_FORC}/${TRUS_TARF}" cmd_iof="tar -vxf ${TRUS_FORC}/${TRUS_TARF}"
110
111    ## List & copy files in case of personal inputs
112    [ -z "${TRUS_TARF}" ] && { cmd_iol="ls ${TRUS_FORC}/*"; cmd_iof="\cp ${TRUS_FORC}/* ."; }
113
114    ${cmd_iol} > inputs_list.txt
115    ${cmd_iof} > /dev/null
116}
117
118diff_inputs() {
119    local dif file
120    local files_list='' mesg='Same' 
121
122    ## Simple diff
123    for file in 'inputs_list.txt' *namelist_* *.xml cpp_*; do
124   dif=''
125
126   ## Continue even if input file is not in here (see after)
127   if [ -e ${TRUS_BHMK}/$file ]; then dif=$( diff -q $file ${TRUS_BHMK}/$file ); else dif=0; fi
128
129   ## Pass over useless file omission in benckmark directory
130   [[ -n "$dif" && "$dif" != '0' ]] && ( mesg='Different'; echo $dif; files_list+=$file' ' )
131    done
132
133    [ $mesg == 'Same' ] && echo $mesg
134    echo $mesg          \
135   >> ${FILE_INPT}
136
137    ## List different files for web comment
138    [ -n "${files_list}" ] && echo 'Inputs  : '${files_list}'differ<br>' \
139   >> temp_${FILE_NOTE}
140}
141
142job_pending() {
143    local outline=$( printf "%100s" ) time_elapsed=0 time_increment=30
144
145    sleep ${time_increment}
146
147    ## Append a log file while pending
148    while [[ $( eval ${JOB_STAT} ) && ${time_elapsed} -lt $TIMEOUT ]]; do
149   printf "\n%s\n" ${outline// /#} \
150       >> computation.log
151   eval ${JOB_INFO}                \
152       >> computation.log
153   sleep ${time_increment}
154   time_elapsed=$(( ${time_elapsed} + ${time_increment} ))
155    done
156
157    sleep ${time_increment}
158
159    ## Kill remaining job & stop the test if it's too long
160    [ ${time_elapsed} -eq $TIMEOUT ] && { eval ${JOB_DELE} &> /dev/null; get_out 6; }
161}
162
163diff_results() {
164    local file
165    local files_list='' mesg='Same'
166
167    ## Simple diff
168    for file in 'ocean.output' *.stat; do
169   ## Stop if no benchmark files (ocean.output, eventual stat files)
170   [ ! -e ${TRUS_BHMK}/$file ] && { export TRUS_RSLT='FAILED'; get_out 7; }
171
172   diff -q $file ${TRUS_BHMK}/$file
173
174   ## Continue even if it differs
175   [ $? -ne 0 ] && { export TRUS_RSLT='FAILED'; mesg='Different'; files_list+=$file' '; }
176    done
177
178    [ $mesg == 'Same' ] && echo $mesg
179
180    ## List different files for web comment
181    [ -n "${files_list}" ] && echo 'Results : '${files_list}'differ<br>' \
182   >> temp_${FILE_NOTE}
183}
184
185diff_restart() {
186    local base_name comp dif file list_comp list_tmsp nb_dom time_step tmsp
187    local files_list='' dift=0
188
189    ## Stop if no benchmark files (ie time.step)
190    [ ! -e ${TRUS_BHMK}/time.step ] && { export TRUS_RSLT='FAILED'; get_out 7; }
191    time_step=$( cat ${TRUS_BHMK}/time.step | tr -d [:space:] )
192
193    ## Find all restart files to rebuild
194    if [ $( find -regex ".*_restart.*[0-9]\.nc" -print -quit ) ]; then
195   base_name=$( find -regex ".*_restart.*[0-9]\.nc"                       \
196                | sed "s#^\./\(.*\)_[0-9]*_restart.*#\1#"       | sort -u  )
197   list_comp=$( find -regex ".*_restart.*[0-9]\.nc"                       \
198                | sed "s#^.*\(restart[a-z_]*\)_[0-9].*\.nc#\1#" | sort -u  )
199   list_tmsp=$( find -regex ".*_restart.*[0-9]\.nc"                       \
200                | sed "s#^.*\([0-9]\{8\}\)_restart.*#\1#"       | sort -u  )
201
202   ## Loop on each time step
203   for tmsp in ${list_tmsp}; do
204
205       for comp in ${list_comp}; do
206      file=${base_name}_${tmsp}_${comp}
207      nb_dom=$( find -name "${file}_[0-9]*.nc" | wc -l | awk '{ print $1 }' )
208
209      if   [ ${nb_dom} -gt 1 ]; then
210          ${TRUS_WKCY}/TOOLS/REBUILD_NEMO/rebuild_nemo -t ${TRUS_NPRO} $file ${nb_dom} > /dev/null
211          [ $? -eq 0 ] && rm -f ${file}_[0-9]*.nc                                > /dev/null
212      elif [ ${nb_dom} -eq 0 ]; then
213          export TRUS_RSLT='FAILED' && get_out 8
214      fi
215
216      ## Compare restart files at same time step
217      if [ $tmsp -eq ${time_step} ]; then
218
219                    ## Stop if no benchmark files (restart file)
220          if [ -e ${TRUS_BHMK}/$file.nc ]; then
221
222                   ## UNIX `cmp` not suitable (timestamp in .nc file)
223         dif=$( $CDOD $file.nc ${TRUS_BHMK}/$file.nc 2> /dev/null          \
224                | awk '/records/ {print $0}' | sed '2 s/^/,/' | tr -d '\n' )
225
226         ## CDO can return void stdout with no difference
227         if [[ -n "$dif" && $( echo $dif | awk '{print $1}' ) -ne 0 ]]; then
228             export TRUS_RSLT='FAILED'
229             files_list+=$comp' ' && let dif_sum+=$( echo $dif | awk '{print $1}' )
230             echo $file.nc': '$dif
231         fi
232
233          else
234         export TRUS_RSLT='FAILED' && get_out 7
235          fi
236
237      else
238          continue
239      fi
240
241       done
242
243   done
244
245        ## List different files for web comment with sum of different parameters
246   if [ ${dif_sum} -ne 0 ]; then
247       echo 'Restarts: '${files_list}${dif_sum}' record(s) differ<br>' \
248      >> temp_${FILE_NOTE}
249   else
250       echo 'Same'
251   fi
252
253    else
254   export TRUS_RSLT='FAILED'
255    fi
256
257}
258
259get_time() {
260    ## Interest for checking unusual time computation
261    local time_cpu=$( eval ${JOB_TIME} )
262
263    printf "Elapsed time: "
264    echo ${time_cpu} | tee -a ${FILE_TIME}
265}
266
267get_memy() {
268    ## Interest for checking unusual memory usage
269    local memory_pmax=$( eval ${JOB_PMEM} ) memory_vmax=$( eval ${JOB_VMEM} )
270
271    printf "Memory max usage (physical/virtual): "
272    echo ${memory_pmax}' / '${memory_vmax} | tee -a ${FILE_MEMY}
273}
274
275comments() {
276    local opat
277    local line='' state=$1
278
279    if [ -e ocean.output ]; then
280        ## 'W A R N I N G' pattern by default
281   opat="-A2 \"^ $state\""
282   [ "$state" == 'E R R O R' ] && opat="-A4 \"$state\""
283
284        ## Select first occurence for web comment
285   line=$( eval grep -m1 $opat ocean.output | tr -d '\n' )
286    fi
287
288    [ -n "$line" ] && ( echo $line; printf "$line<br>" \
289   >> temp_${FILE_NOTE} )
290}
291
292log_make() {
293    ## Format comments for web
294    [ -e temp_${FILE_NOTE} ] && cat temp_${FILE_NOTE} | tr -d '\n' | sed 's/<br>$//' \
295   >> ${FILE_NOTE}
296
297    ## Construct txt file with all messenger files
298    paste -d ';' mesg_*.txt | tee ${FILE_TRUS}
299}
300
301prod_publish() {
302    local cmd
303    local rev=$( awk '/NEMOGCM/ {print $NF}' model.log )
304
305    ## Production mode (-p|--prod)
306    if [ $PROD -eq 1 ]; then
307
308   ## Create or append trusting logfile
309   if [ -f ${TRUS_BHMK}/trusting_$PATTERNAME.txt ]; then cmd='tail -1'; else cmd='cat'; fi
310
311   $cmd ${FILE_TRUS}                           \
312       >> ${TRUS_BHMK}/trusting_$PATTERNAME.txt
313
314        ## Send mail only when FAILED
315   if [[ ! -z "$TRUS_MAIL" && $TRUS_RSLT == 'FAILED' ]]; then
316
317       ## Content
318       cat <<END_MAIL      \
319      > trusting.mail
320Dear all,
321
322
323The trusting sequence has not completed successfully on new configuration ${TRUS_TEST} based on ${TRUS_REFE}.
324
325Here is the model summary:
326`cat model.log`
327
328First checking would be on the trusting environment files:
329${TRUS_USER}.cfg & ${TRUS_HPCC}.cfg
330
331For more details, look into the testing directory at:
332${TEST_DIR}
333
334An archive has been created to share the questionable configuration for further studies:
335${TRUS_BHMK}/${FILE_ARCH}
336
337END_MAIL
338
339       ## Send with detailed subject
340       mail -s "[NEMO Trusting][$rev][${TRUS_WKCY}][${TRUS_REFE}] $TRUS_RSLT $ERR" $TRUS_MAIL \
341      <  trusting.mail
342   fi
343
344    fi
345}
346
347get_out() {
348    local time_step=0
349
350    ERR=$1
351
352    printf "\n\nEnd of test\n"
353
354    ## In case of compilation error
355    cd ${TEST_DIR}
356
357    if [ $TRUS_RSLT == 'FAILED' ]; then
358   echo 'Failure'
359
360        ## Error identification
361   case $ERR in
362            ## Compilation
363       '1') ERR='XIOS compilation failed' ;; '2') ERR='NEMO compilation failed';;
364       ## Submission
365       '3') ERR='Missing input files'     ;; '4') ERR='Job submission error'   ;;
366       ## Computation
367       '5') ERR='Crashed at time step'    ;; '6') ERR='Exceeded time limit'    ;;
368       ## Results
369       '7') ERR='Missing previous outputs';; '8') ERR='New outputs differ'     ;;
370   esac
371
372    else
373   echo 'Success' && ERR='Code is reliable'
374    fi
375
376    ## Eventual comments from ocean.output
377    if [ "$ERR" == 'Crashed at time step' ]; then
378   comments 'E R R O R'
379   [ -e time.step ] && time_step=$( grep -o [0-9]* time.step )
380   ERR+=' '$time_step
381    else
382   comments 'W A R N I N G'
383   [ "$ERR" == 'Exceeded time limit' ] && ERR+=' '$(( ${TIMEOUTT}/3600 ))'h'
384    fi
385
386    ## Last messenger files
387    export ERR
388    sed -i "2 s/.*/$TRUS_RSLT/" ${FILE_TRUS_RSLT}; sed -i "2 s/.*/$ERR/" ${FILE_STAT}
389
390    ## Save tested configuration if trusting failed in production mode (-p|--prod)
391    if [[ $TRUS_RSLT == 'FAILED' && $PROD -eq 1 ]]; then
392   echo 'Creating archive '${FILE_ARCH}' under '${TRUS_BHMK}
393   tar -czf ${TRUS_BHMK}/${FILE_ARCH}              *                    \
394       -C ${TRUS_WKCY}/CONFIG/${TRUS_TEST}/MY_SRC .                    \
395       -C ${TRUS_WKCY}/CONFIG/${TRUS_TEST}        cpp_${TRUS_TEST}.fcm
396    fi
397
398    ## Logfile construct & eventual sending of notification email
399    printf "\nTrusting digest:\n----------------\n"
400    log_make
401    prod_publish
402
403    exit 0
404}
Note: See TracBrowser for help on using the repository browser.