source: branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/inc/trusting_func.sh @ 8797

Last change on this file since 8797 was 8797, checked in by nicolasmartin, 3 years ago

Modifications to get it working on Curie for all trusting tests

  • Property eol-style set to native
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Rev URL
File size: 11.9 KB
Line 
1#!/bin/bash
2
3
## Messenger filenames: one small text file per piece of information,
## numbered so that a name-sorted listing keeps them in this order
file_date=mesg_01_date.txt
file_rslt=mesg_02_result.txt
file_stat=mesg_03_status.txt
file_nemo=mesg_04_nemo.txt
file_xios=mesg_05_xios.txt
file_cmpf=mesg_06_compiler.txt
file_lmpi=mesg_07_mpi.txt
file_ncdf=mesg_08_netcdf.txt
file_inpt=mesg_09_inputs.txt
file_time=mesg_10_time.txt
file_memy=mesg_11_memory.txt
file_note=mesg_12_comments.txt
11
12
13## Functions in order of use
## Print a step title underlined with dashes, preceded by a "Step....." banner
## Arguments: $1 - title of the step
## Outputs:   blank line, banner, title and underline on stdout
print_step() {
    local title=$1 outline

    ## Blank string exactly as long as the title: ${#title} does not count
    ## the trailing newline that `echo | wc -c` would add to the length
    printf -v outline '%*s' "${#title}" ''

    ## Underline is quoted so that an empty title still fills its %s slot
    printf '\nStep.....\n%s\n%s\n' "${title}" "${outline// /-}"
}
20
## Create every messenger file with its column title on the first line
## Globals: file_* (read), TRUS_RSLT (read)
init_files() {
    echo 'Date'               > ${file_date}
    echo 'NEMOGCM rev.'       > ${file_nemo}
    echo 'XIOS rev.'          > ${file_xios}
    echo 'Fortran compiler'   > ${file_cmpf}
    echo 'MPI libs'           > ${file_lmpi}
    echo 'NetCDF libs'        > ${file_ncdf}
    echo 'Input files'        > ${file_inpt}
    echo 'Elapsed time'       > ${file_time}
    echo 'Memory usage (P/V)' > ${file_memy}
    echo 'Comments'           > ${file_note}

    ## Result and status get a default 2nd line right away: the current value
    ## of TRUS_RSLT and 'Unknown error'
    { echo 'Result'; echo ${TRUS_RSLT};    } > ${file_rslt}
    { echo 'Status'; echo 'Unknown error'; } > ${file_stat}
}
35
## Append the test date, converted to UTC, to the date messenger file
## Globals: TRUS_DATE (read), file_date (read)
get_date() {
    ## "YYYY-MM-DD HH:MM ZONE" in UTC for timestamping
    local stamp=$( date -u -d "${TRUS_DATE}" +'%F %R %Z' )

    echo ${stamp} >> ${file_date}
}
43
## Record the SVN revisions of XIOS and of the NEMOGCM working copy
## Globals: TRUS_CKOT (read, space-separated list of NEMOGCM directories),
##          TRUS_XIOS (read), TRUS_NGCM (read),
##          TRUS_SVNA (read, svn action run on each NEMOGCM directory),
##          file_xios, file_nemo (read)
## Outputs: each NEMOGCM directory name and the svn action output on stdout;
##          'XIOS <rev>' and 'NEMOGCM <rev>' appended to model.log;
##          HTML changeset links appended to the XIOS/NEMO messenger files
get_nemo_rev() {
    local dir rev_loc
    local rev=0

    ## Loop on essential NEMO directories (word splitting is intended)
    for dir in ${TRUS_CKOT} ${TRUS_XIOS}; do

        ## For time being, just get revision from XIOS with no action on directory
        if [[ "${dir}" == "${TRUS_XIOS}" ]]; then
            rev_loc=$( svn info "${dir}" | awk '/Last Changed Rev/ {print $NF}' )
            echo "XIOS ${rev_loc}" \
                >> model.log
            echo "<a href=\"https://forge.ipsl.jussieu.fr/ioserver/changeset/${rev_loc}\" target=\"_blank\">${rev_loc}</a>" \
                >> "${file_xios}"
            continue
        fi

        ## TRUS_SVNA holds a command with its options: must stay unquoted
        echo "${dir}" && ${TRUS_SVNA} "${TRUS_NGCM}/${dir}"
        rev_loc=$( svn info "${TRUS_NGCM}/${dir}" | awk '/Last Changed Rev/ {print $NF}' )

        ## Skip directories for which svn gave no usable revision number
        [[ "${rev_loc}" =~ ^[0-9]+$ ]] || continue

        ## Keep last rev. nb
        if [ "${rev_loc}" -gt "${rev}" ]; then rev=${rev_loc}; fi
    done

    echo "NEMOGCM ${rev}" \
        >> model.log
    echo "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/${rev}\" target=\"_blank\">${rev}</a>" \
        >> "${file_nemo}"
}
73
## Source the computing environment, then log name and release of the main
## software (compiler, MPI, NetCDF, CDO)
## Globals: TRUS_ENVI (read, base name of the environment file),
##          TRUS_HPCC (read), WRAPPER_LDFLAGS (written on X64_ADA only),
##          TRUS_CMPV, TRUS_MPIR, TRUS_CDFR, TRUS_CDOR (read, search strings),
##          file_cmpf, file_lmpi, file_ncdf (read)
## Outputs: one '<name> <release>' line per software appended to model.log;
##          lines 3, 4 and 5 of model.log appended to the compiler, MPI and
##          NetCDF messenger files
get_soft_rel() {
    local soft_rel str

    ## Sourcing environment: exactly one file is sourced
    if [ -n "${TRUS_ENVI}" ]; then
        if [[ -e "${TRUS_ENVI}.env" && -n $( declare -F | grep ' module' ) ]]; then
            ## .env file if module function is available
            . "${TRUS_ENVI}.env"
        elif [ -e "${TRUS_ENVI}.path" ]; then
            ## .path file if existing
            . "${TRUS_ENVI}.path"
        else
            ## if not, the given file
            . "${TRUS_ENVI}"
        fi
    fi

    ## Problem with `prepend-path` of modulefile that use ':' instead of ' ' as delimiter
    if [[ "${TRUS_HPCC}" == 'X64_ADA' ]]; then
        WRAPPER_LDFLAGS='-L/smplocal/pub/IdrMemMPI/1.4/lib -lidrmem '${WRAPPER_LDFLAGS}
    fi

    ## Unquoted on purpose: empty search strings simply vanish from the list
    for str in ${TRUS_CMPV} ${TRUS_MPIR} ${TRUS_CDFR} ${TRUS_CDOR}; do

        ## Software release: next word after "$str" in $PATH (case-insensitive)
        ## -n/p: print nothing instead of the whole $PATH when $str is not found
        soft_rel=$( echo "${PATH}" | sed -n "s#.*${str}\([0-9.a-z_]*\).*#\1#ip" )

        ## option --version would work for main compilers (gfortran, intel, pgfortran, ...)
        if [[ "${str}" == "${TRUS_CMPV}" ]]; then
            soft_rel=$( ${str} --version | grep -m1 -oe '\<[0-9. ]*\>' )
        fi

        ## Cleaning characters string to display proper soft name
        str=$( echo "${str}" | sed 's#\\##g; s#[/-]$##' )

        ## Unquoted on purpose: squeezes stray blanks around the release
        echo ${str} ${soft_rel} \
            >> model.log
    done

    ## NOTE(review): relies on model.log already holding exactly two lines
    ## (XIOS and NEMOGCM revisions) before this function runs - confirm
    sed -n 3p model.log \
        >> "${file_cmpf}"
    sed -n 4p model.log \
        >> "${file_lmpi}"
    sed -n 5p model.log \
        >> "${file_ncdf}"
}
115
## Bring the input files into the current directory
## Globals: TRUS_TARF (read, list of archives; empty for personal inputs),
##          TRUS_FORC (read, directory holding archives or personal inputs)
## Outputs: archive mode  - archive listings appended to inputs_list.txt
##          personal mode - listing of TRUS_FORC on stdout
## Any '*.gz' file found below the current directory is then decompressed.
get_inputs() {
    local archive

    if [ -n "${TRUS_TARF}" ]; then
        ## List archive content & extract it by default
        ## NOTE(review): eval is kept from the original so that TRUS_TARF may
        ## hold shell patterns; it must never come from untrusted input
        eval "
            for archive in ${TRUS_TARF}; do
                tar -tvf ${TRUS_FORC}/\$archive >> inputs_list.txt
            done
            for archive in ${TRUS_TARF}; do
                tar -vxf ${TRUS_FORC}/\$archive > /dev/null
            done
        "
    else
        ## List & copy files in case of personal inputs
        ## `command cp` bypasses any alias/function, which is what the former
        ## '\cp' kept in a string could not do (it ran a command named '\cp')
        ls "${TRUS_FORC}"/*
        command cp "${TRUS_FORC}"/* .
    fi

    ## Decompress inputs; harmless no-op when there is no gzipped file
    find . -name '*.gz' -exec gzip -d {} \;
}
141
## Compare the input files of the run with the benchmark ones
## Globals: TRUS_STOR (read, benchmark directory), file_inpt, file_note (read)
## Outputs: diff messages or 'Same' on stdout; 'Same'/'Different' appended to
##          the inputs messenger file; list of differing files appended to
##          the temporary comments file
diff_inputs() {
    local item report
    local changed='' verdict='Same'

    ## Simple diff
    for item in 'inputs_list.txt' *namelist_* *.xml cpp_*; do

        ## Pass over files that have no counterpart in benchmark directory
        [ -e ${TRUS_STOR}/$item ] || continue

        report=$( diff -q $item ${TRUS_STOR}/$item )

        if [ -n "$report" ]; then
            verdict='Different'
            echo $report
            changed+=$item' '
        fi
    done

    if [ $verdict == 'Same' ]; then echo $verdict; fi
    echo $verdict >> ${file_inpt}

    ## List different files for web comment
    [ -n "${changed}" ] && echo 'Inputs  : '${changed}'differ<br>' \
        >> temp_${file_note}
}
165
## Wait for the submitted job to finish, logging its state meanwhile
## Globals: TRUS_JSTA (read, command printing something while the job exists),
##          TRUS_JINF (read, optional command giving job information),
##          TRUS_TOUT (read, time limit in seconds),
##          TRUS_JKIL (read, command killing the job)
## Outputs: separator lines and job information appended to computation.log
## Calls `get_out 6` once the waiting time has reached the time limit.
job_pending() {
    local outline time_elapsed=0 time_increment=30

    printf -v outline '%100s' ''

    sleep ${time_increment}

    ## Append a log file while pending
    while [[ -n $( eval ${TRUS_JSTA} ) && ${time_elapsed} -lt ${TRUS_TOUT} ]]; do
        printf '\n%s\n' "${outline// /#}" \
            >> computation.log
        [ -n "${TRUS_JINF}" ] && eval ${TRUS_JINF} \
            >> computation.log
        sleep ${time_increment}
        time_elapsed=$(( time_elapsed + time_increment ))
    done

    sleep ${time_increment}

    ## Kill remaining job & stop the test if it's too long
    ## -ge and not -eq: the counter steps by ${time_increment} and jumps over
    ## any time limit that is not a multiple of it
    if [ "${time_elapsed}" -ge "${TRUS_TOUT}" ]; then
        eval ${TRUS_JKIL} &> /dev/null
        get_out 6
    fi
}
186
## Compare ocean.output and the stat files of the run with the benchmark ones
## Globals: TRUS_STOR (read, benchmark directory), TRUS_RSLT (written),
##          file_note (read)
## Outputs: diff messages or 'Same' on stdout; list of differing files
##          appended to the temporary comments file
## Calls `get_out 7` when a benchmark file is missing.
diff_results() {
    local file
    local files_list='' mesg='Same'

    ## Simple diff
    for file in 'ocean.output' *.stat; do
        ## Unmatched glob: the run produced no stat file, nothing to compare
        [[ "${file}" == '*.stat' && ! -e "${file}" ]] && continue

        ## Stop if no benchmark files (ocean.output, possible stat files)
        if [ ! -e "${TRUS_STOR}/${file}" ]; then
            TRUS_RSLT='FAILED'
            get_out 7
        fi

        ## Continue even if it differs
        if ! diff -q "${file}" "${TRUS_STOR}/${file}"; then
            TRUS_RSLT='FAILED'
            mesg='Different'
            files_list+="${file} "
        fi
    done

    if [ "${mesg}" == 'Same' ]; then echo "${mesg}"; fi

    ## List different files for web comment
    if [ -n "${files_list}" ]; then
        echo "Results : ${files_list}differ<br>" \
            >> "temp_${file_note}"
    fi
}
208
## Rebuild the decomposed restart files and compare them with the benchmark
## Globals: TRUS_NGCM (read, NEMOGCM root holding TOOLS/REBUILD_NEMO),
##          TRUS_NPRO (read, passed to rebuild_nemo -t),
##          TRUS_STOR (read, benchmark directory),
##          TRUS_CDOD (read, comparison command with its options),
##          TRUS_RSLT (written), file_note (read)
## Outputs: one '<file>.nc: <difference>' line per differing restart, or
##          'Same', on stdout; summary appended to the temporary comments file
## Calls `get_out 8` when a restart has no domain file, `get_out 7` when no
## benchmark restart exists at all.
diff_restart() {
    local dif filebase filebases ndomain nrec out
    local files_list='' dif_sum='undef'

    ## No restart file at all: nothing to rebuild, the test has failed
    if [ -z "$( find . -regex '.*_restart.*[0-9]\.nc' -print -quit )" ]; then
        TRUS_RSLT='FAILED'
        return 0
    fi

    ## Find all restart files to rebuild: strip the '_NNNN.nc' domain suffix
    filebases=$( find . -regextype sed -regex '.*_[0-9]\{4\}\.nc' \
                 | sed 's/\(.*\)_.*/\1/' | sort -u                )

    for filebase in ${filebases}; do
        filebase=${filebase#./}

        ## Number of domain files for this restart ($(( )) trims wc padding)
        ndomain=$( find . -regex "\./${filebase}_[0-9]*\.nc" | wc -l )
        ndomain=$(( ndomain ))

        if [ "${ndomain}" -eq 0 ]; then
            TRUS_RSLT='FAILED'
            get_out 8
        fi

        ## Possibility of remaining decomposed restarts (even after rebuild)
        if "${TRUS_NGCM}/TOOLS/REBUILD_NEMO/rebuild_nemo" \
               -t "${TRUS_NPRO}" "${filebase}" "${ndomain}" \
               > /dev/null; then
            rm -f "${filebase}"_[0-9]*.nc
        fi

        ## Nothing to compare with if no benchmark file (restart file)
        [ -e "${TRUS_STOR}/${filebase}.nc" ] || continue

        ## At least one benchmark restart exists: the sum becomes a number
        [ "${dif_sum}" == 'undef' ] && dif_sum=0

        ## TRUS_CDOD holds a command with its options: must stay unquoted
        out=$( ${TRUS_CDOD} "${filebase}.nc" "${TRUS_STOR}/${filebase}.nc" 2>&1 )
        dif=$( echo ${out} | grep -o '[0-9]* of [0-9]* records' | head -n 1 )

        ## Fix for cdo aborting on restarts with different inputs
        if [[ ${out} =~ 'Abort' ]]; then
            dif=$( echo ${out} | awk -F: '{print $NF}' )
        fi

        [ -n "${dif}" ] || continue

        ## Leading word is the number of differing records, unless the
        ## comparison aborted and ${dif} holds an error message
        read -r nrec _ <<< "${dif}"
        [[ "${nrec}" =~ ^[0-9]+$ ]] || nrec=''

        ## '0 of N records' means identical content
        if [ -n "${nrec}" ] && [ "${nrec}" -eq 0 ]; then continue; fi

        export TRUS_RSLT='FAILED'
        files_list+="${filebase} "
        echo "${filebase}.nc: ${dif}"
        dif_sum=$(( dif_sum + 10#${nrec:-0} ))
    done

    if [ "${dif_sum}" == 'undef' ]; then
        ## Stop if no benchmark files
        TRUS_RSLT='FAILED'
        get_out 7
    elif [ -n "${files_list}" ]; then
        ## List different files for web comment with sum of different records
        echo "Restarts: ${files_list}${dif_sum} record(s) differ<br>" \
            >> "temp_${file_note}"
    else
        echo 'Same'
    fi
}
270
## Report the elapsed time of the job
## Globals: TRUS_JTIM (read, command printing the elapsed time; nothing is
##          done when empty), file_time (read)
## Outputs: 'Elapsed time: <value>' on stdout, <value> appended to the time
##          messenger file
get_time() {
    if [ -n "${TRUS_JTIM}" ]; then
        ## Interest for checking unusual time computation
        local elapsed=$( eval ${TRUS_JTIM} )

        printf "Elapsed time: "
        echo ${elapsed} | tee -a ${file_time}
    fi
}
280
## Report the maximum memory usage of the job
## Globals: TRUS_JPME, TRUS_JVME (read, commands printing physical and virtual
##          memory; nothing is done when both are empty), file_memy (read)
## Outputs: 'Memory max usage (physical/virtual): <P> / <V>' on stdout,
##          '<P> / <V>' appended to the memory messenger file
get_memy() {
    if [[ -n "${TRUS_JPME}" || -n "${TRUS_JVME}" ]]; then
        ## Interest for checking unusual memory usage
        local phys=$( eval ${TRUS_JPME} ) virt=$( eval ${TRUS_JVME} )

        printf "Memory max usage (physical/virtual): "
        echo ${phys}' / '${virt} | tee -a ${file_memy}
    fi
}
290
## Extract the first warning or error block of ocean.output for web comment
## Arguments: $1 - pattern to look for ('E R R O R', otherwise treated as a
##                 start-of-line pattern such as 'W A R N I N G')
## Globals:   file_note (read)
## Outputs:   the block squeezed on one line on stdout; the same block followed
##            by '<br>' appended to the temporary comments file
comments() {
    local state=$1 line=''
    local -a grep_opts

    if [ -e ocean.output ]; then
        if [ "${state}" == 'E R R O R' ]; then
            ## Pattern anywhere in the line, with the 4 following lines
            grep_opts=( -A4 "${state}" )
        else
            ## 'W A R N I N G' pattern by default: after one leading blank,
            ## with the 2 following lines
            grep_opts=( -A2 "^ ${state}" )
        fi

        ## Select first occurrence for web comment
        line=$( grep -m1 "${grep_opts[@]}" ocean.output | tr -d '\n' )
    fi

    if [ -n "${line}" ]; then
        ## Unquoted on purpose: squeezes the blanks for terminal display
        echo ${line}
        ## Model output is data, never a printf format ('%' and '\' are kept)
        printf '%s<br>' "${line}" \
            >> "temp_${file_note}"
    fi
}
307
## Build the digest of the test from all messenger files
## Globals: file_note (read), TRUS_FILE (read, digest file to write)
## Outputs: messenger files pasted side by side with ';' as separator, on
##          stdout and in TRUS_FILE
log_make() {
    ## Format comments for web: everything on one line, no trailing '<br>'
    if [ -e temp_${file_note} ]; then
        tr -d '\n' < temp_${file_note} | sed 's/<br>$//' \
            >> ${file_note}
    fi

    ## Construct txt file with all messenger files
    paste -d ';' mesg_*.txt | tee ${TRUS_FILE}
}
316
## Publish the digest of the test when running in production mode
## Globals: TRUS_PROD (read, 1 for production mode),
##          TRUS_FILE (read, digest), TRUS_HIST (read, trusting logfile),
##          TRUS_MAIL (read, recipients), TRUS_RSLT, TRUS_RORR, TRUS_BRAN,
##          TRUS_REFE, TRUS_CONF, TRUS_USER, TRUS_HPCC, TRUS_SCRA, TRUS_STOR,
##          TRUS_ARCH (read, used in the notification email)
## Outputs: digest appended to TRUS_HIST; trusting.mail written and sent
##          when the test has failed and recipients are given
prod_publish() {
    local appender
    local rev=$( awk '/NEMOGCM/ {print $NF}' model.log )

    ## Production mode (-p|--prod) only
    [ ${TRUS_PROD} -eq 1 ] || return 0

    ## Create or append trusting logfile: whole digest for a new logfile,
    ## last line only for an existing one
    appender='cat'
    if [ -f ${TRUS_HIST} ]; then appender='tail -1'; fi

    ${appender} ${TRUS_FILE} >> ${TRUS_HIST}

    ## Send mail only when FAILED
    [[ -n "${TRUS_MAIL}" && ${TRUS_RSLT} == 'FAILED' ]] || return 0

    ## Content
    cat > trusting.mail <<END_MAIL
Dear all,


The trusting sequence has not completed successfully on new configuration ${TRUS_CONF} based on ${TRUS_REFE}.

Here is the model summary:
$(cat model.log)

First checking would be on the trusting environment files:
${TRUS_USER}.cfg & ${TRUS_HPCC}.cfg

For more details, look into the testing folder at:
${TRUS_SCRA}

An archive has been created to share the questionable configuration for further studies:
${TRUS_STOR}/${TRUS_ARCH}

END_MAIL

    ## Send with detailed subject
    mail -s "[NEMO Trusting][$rev][${TRUS_BRAN}][${TRUS_REFE}] ${TRUS_RSLT} ${TRUS_RORR}" ${TRUS_MAIL} \
        < trusting.mail
}
362
## End the test: set final result and status, build the digest, publish it
## Arguments: $1 - error code (1-8, see the case statement below); any other
##                 value gives 'Unknown error' when the test has failed
## Globals:   TRUS_RSLT (read), TRUS_RORR (written, final status message),
##            TRUS_SCRA (read, testing directory), TRUS_TOUT (read),
##            TRUS_PROD, TRUS_ARCH, TRUS_STOR, TRUS_NGCM, TRUS_CONF (read),
##            file_rslt, file_stat (read)
## Outputs:   progress messages and digest on stdout; 2nd line of the result
##            and status messenger files rewritten; archive of the tested
##            configuration when failing in production mode
## Always terminates the shell with `exit 0`.
get_out() {
    local time_step=0

    TRUS_RORR=$1

    printf "\n\nEnd of test\n"

    ## In case of compilation error
    cd "${TRUS_SCRA}"

    if [ "${TRUS_RSLT}" == 'FAILED' ]; then
        echo 'Failure'

        ## Error identification
        case "${TRUS_RORR}" in
            ## Compilation
            1) TRUS_RORR='XIOS compilation failed'  ;;
            2) TRUS_RORR='NEMO compilation failed'  ;;
            ## Submission
            3) TRUS_RORR='Missing input files'      ;;
            4) TRUS_RORR='Job submission error'     ;;
            ## Computation
            5) TRUS_RORR='Crashed at time step'     ;;
            6) TRUS_RORR='Exceeded time limit'      ;;
            ## Results
            7) TRUS_RORR='Missing previous outputs' ;;
            8) TRUS_RORR='New outputs differ'       ;;
            ## Other: unquoted '*' so that it is a catch-all pattern and not
            ## a literal asterisk
            *) TRUS_RORR='Unknown error'            ;;
        esac

    else
        echo 'Success'
        TRUS_RORR='Code is reliable'
    fi

    ## Possible comments from ocean.output
    if [ "${TRUS_RORR}" == 'Crashed at time step' ]; then
        comments 'E R R O R'
        ## Pattern quoted to keep the shell from expanding it as a glob
        [ -e time.step ] && time_step=$( grep -o '[0-9]*' time.step )
        TRUS_RORR+=" ${time_step}"
    else
        comments 'W A R N I N G'
        [ "${TRUS_RORR}" == 'Exceeded time limit' ] && TRUS_RORR+=" $(( TRUS_TOUT / 3600 ))h"
    fi

    ## Last messenger files
    sed -i "2 s/.*/${TRUS_RSLT}/" "${file_rslt}"
    sed -i "2 s/.*/${TRUS_RORR}/" "${file_stat}"

    ## Save tested configuration if trusting failed in production mode (-p|--prod)
    if [[ ${TRUS_RSLT} == 'FAILED' && ${TRUS_PROD} -eq 1 ]]; then
        echo "Creating archive ${TRUS_ARCH} under ${TRUS_STOR}"
        tar -czf "${TRUS_STOR}/${TRUS_ARCH}"                 *                      \
            -C   "${TRUS_NGCM}/CONFIG/${TRUS_CONF}/MY_SRC"   .                      \
            -C   "${TRUS_NGCM}/CONFIG/${TRUS_CONF}"          "cpp_${TRUS_CONF}.fcm"
    fi

    ## Logfile construct & possible sending of notification email
    printf "\nTrusting digest:\n----------------\n"
    log_make
    prod_publish

    exit 0
}
Note: See TracBrowser for help on using the repository browser.