New URL for NEMO forge!   http://forge.nemo-ocean.eu

Since March 2022 along with NEMO 4.2 release, the code development moved to a self-hosted GitLab.
The present forge is now archived and remains online for historical reference.
trusting_func.sh in branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/inc – NEMO

source: branches/2015/dev_r5092_CNRS18_TRUST/NEMOGCM/TRUST/inc/trusting_func.sh @ 5988

Last change on this file since 5988 was 5988, checked in by nicolasmartin, 8 years ago

dev_r5092_CNRS18_TRUST Bug correction & updating trusting namelists

  • Property eol-style set to native
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Rev URL
File size: 12.7 KB
RevLine 
#!/bin/bash


## Messenger filenames: one text file per field of the trusting report,
## all suffixed with the current $PATTERNAME
FILE_DATE="mesg_01_date_${PATTERNAME}.txt"
FILE_RSLT="mesg_02_result_${PATTERNAME}.txt"
FILE_STAT="mesg_03_state_${PATTERNAME}.txt"
FILE_NEMO="mesg_04_nemo_${PATTERNAME}.txt"
FILE_XIOS="mesg_05_xios_${PATTERNAME}.txt"
FILE_CMPF="mesg_06_compiler_${PATTERNAME}.txt"
FILE_LMPI="mesg_07_mpi_${PATTERNAME}.txt"
FILE_NCDF="mesg_08_netcdf_${PATTERNAME}.txt"
FILE_INPT="mesg_09_inputs_${PATTERNAME}.txt"
FILE_TIME="mesg_10_time_${PATTERNAME}.txt"
FILE_MEMY="mesg_11_memory_${PATTERNAME}.txt"
FILE_NOTE="mesg_12_comments_${PATTERNAME}.txt"

## Trusting timestamped logfile & archive (same stem, different extension)
TRUS_FILE="trusting_${DATE}_${PATTERNAME}.txt"
TRUS_ARCH="trusting_${DATE}_${PATTERNAME}.tgz"
[5644]14
[5681]15
[5788]16## Functions in order of use
## Print a step banner: a blank line, 'Step.....', the title and a dashed
## underline of the same length as the title.
##   $1 - step title
print_step() {
    local title=$1 underline

    ## ${#title} counts characters; the former `echo | wc -c` also counted the
    ## trailing newline (underline one dash too long) and counted bytes
    printf -v underline '%*s' "${#title}" ''

    printf '\nStep.....\n%s\n%s\n' "$title" "${underline// /-}"
}
[5681]23
## (Re)create the twelve messenger files with their column header as first
## line, then seed the result and state files with a default second line.
## Globals: FILE_* (read, files are truncated), TRUS_RSLT (read)
init_files() {
    echo 'Date'               > "${FILE_DATE}"
    echo 'Result'             > "${FILE_RSLT}"
    echo 'State'              > "${FILE_STAT}"
    echo 'NEMOGCM rev.'       > "${FILE_NEMO}"
    echo 'XIOS rev.'          > "${FILE_XIOS}"
    echo 'Fortran compiler'   > "${FILE_CMPF}"
    echo 'MPI libs'           > "${FILE_LMPI}"
    echo 'NetCDF libs'        > "${FILE_NCDF}"
    echo 'Input files'        > "${FILE_INPT}"
    echo 'Elapsed time'       > "${FILE_TIME}"
    echo 'Memory usage (P/V)' > "${FILE_MEMY}"
    echo 'Comments'           > "${FILE_NOTE}"

    ## Current value of TRUS_RSLT with 'Unknown error' by default; these second
    ## lines are the ones rewritten in place by get_out()
    printf '%s\n' "${TRUS_RSLT}"  >> "${FILE_RSLT}"
    printf '%s\n' 'Unknown error' >> "${FILE_STAT}"
}
38
## Append the test date, converted to UTC, to the date messenger file.
## Globals: DATE (read, any string accepted by `date -d`), FILE_DATE (appended)
get_date() {
    local dat

    ## UTC time zone for timestamping
    dat=$( date -ud "${DATE}" +"%F %R %Z" )

    ## A line is written even if `date` failed, so the file keeps its 2nd line
    printf '%s\n' "$dat" >> "${FILE_DATE}"
}
46
## Log the XIOS revision and the highest 'Last Changed Rev' found among the
## NEMO directories, both in model.log and (as changeset links) in the
## messenger files.
## Globals: TRUS_CKOT (read, whitespace-separated directory list),
##          TRUS_XIOS, TRUS_NGCM, TRUS_SVNA (read), FILE_XIOS, FILE_NEMO (appended)
get_nemo_rev() {
    local dir rev_loc
    local rev=0

    ## Loop on essential NEMO directories (word splitting of the list is intended)
    for dir in ${TRUS_CKOT} ${TRUS_XIOS}; do

        ## For time being, just get revision from XIOS with no action on directory
        if [[ "$dir" == "${TRUS_XIOS}" ]]; then
            rev_loc=$( svn info "$dir" | awk '/Last Changed Rev/ {print $NF}' )
            echo "XIOS ${rev_loc}" \
                >> model.log
            echo "<a href=\"https://forge.ipsl.jussieu.fr/ioserver/changeset/${rev_loc}\" target=\"_blank\">${rev_loc}</a>" \
                >> "${FILE_XIOS}"
            continue
        fi

        ## TRUS_SVNA holds a command with its options: it must be split
        echo "$dir" && ${TRUS_SVNA} "${TRUS_NGCM}/$dir"
        rev_loc=$( svn info "${TRUS_NGCM}/$dir" | awk '/Last Changed Rev/ {print $NF}' )

        ## Keep last rev. nb; ignore a directory for which no revision could be read
        ## (an empty value used to raise a `[` syntax error)
        if [[ "${rev_loc}" =~ ^[0-9]+$ ]] && [ "${rev_loc}" -gt "$rev" ]; then
            rev=${rev_loc}
        fi
    done

    echo "NEMOGCM $rev" \
        >> model.log
    echo "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/$rev\" target=\"_blank\">$rev</a>" \
        >> "${FILE_NEMO}"
}
76
## Source the computing environment, then log the release of each declared
## software (compiler, MPI, NetCDF, CDO) in model.log; lines 3 to 5 of
## model.log are finally copied to the compiler, MPI and NetCDF messenger files.
## Globals: TRUS_ENVI, TRUS_HPCC, TRUS_CMPV, TRUS_MPIR, TRUS_CDFR, TRUS_CDOR,
##          PATH (read), WRAPPER_LDFLAGS (written on X64_ADA),
##          FILE_CMPF, FILE_LMPI, FILE_NCDF (appended)
get_soft_rel() {
    local soft_rel str

    ## Sourcing environment (exactly one file is sourced; the former
    ## `A && B || C` chain sourced a second file when the first returned non-zero)
    if [[ -n "${TRUS_ENVI}" ]]; then
        if [[ -e "${TRUS_ENVI}.env" ]] && declare -F | grep -q ' module'; then
            ## .env file if module function is available
            . "${TRUS_ENVI}.env"
        elif [[ -e "${TRUS_ENVI}.path" ]]; then
            ## .path file if existing, ...
            . "${TRUS_ENVI}.path"
        else
            ## ... if not the given file
            . "${TRUS_ENVI}"
        fi
    fi

    ## Problem with `prepend-path` of modulefile that use ':' instead of ' ' as delimiter
    if [[ "${TRUS_HPCC}" == 'X64_ADA' ]]; then
        WRAPPER_LDFLAGS='-L/smplocal/pub/IdrMemMPI/1.4/lib -lidrmem '${WRAPPER_LDFLAGS}
    fi

    ## Unset or empty names simply vanish from the unquoted list
    for str in ${TRUS_CMPV} ${TRUS_MPIR} ${TRUS_CDFR} ${TRUS_CDOR}; do

        ## Software release: next word after "$str" in $PATH (case-insensitive);
        ## $str is used as a regex on purpose. With -n/p nothing is returned
        ## when $str is absent (the whole $PATH used to be logged instead)
        soft_rel=$( printf '%s\n' "$PATH" | sed -n "s#.*$str\([0-9.a-z_]*\).*#\1#ip" )

        ## option --version would work for main compilers (gfortran, intel, pgfortran, ...)
        if [[ "$str" == "${TRUS_CMPV}" ]]; then
            soft_rel=$( "$str" --version | grep -m1 -oe '\<[0-9. ]*\>' )
        fi

        ## Cleaning characters string to display proper soft name
        str=$( printf '%s\n' "$str" | sed 's#\\##g; s#[/-]$##' )

        ## ${soft_rel} left unquoted so that several matches stay on one line
        echo "$str" ${soft_rel} \
            >> model.log
    done

    sed -n 3p model.log \
        >> "${FILE_CMPF}"
    sed -n 4p model.log \
        >> "${FILE_LMPI}"
    sed -n 5p model.log \
        >> "${FILE_NCDF}"
}
118
## Bring the input files into the current directory, uncompress any *.gz found
## and list the directory content in inputs_list.txt.
## Globals: TRUS_TARF (read, archive name; empty for personal inputs),
##          TRUS_FORC (read, directory holding the inputs)
## Calls get_out 3 when the copy/extraction fails.
get_inputs() {
    local status

    ## Extract archive or copy files in case of personal inputs
    ## (run directly: no command kept in a global string anymore)
    if [[ -z "${TRUS_TARF}" ]]; then
        cp "${TRUS_FORC}"/* . > /dev/null
    else
        tar -vxf "${TRUS_FORC}/${TRUS_TARF}" > /dev/null
    fi
    status=$?

    if [[ $status -ne 0 ]]; then
        get_out 3
    else
        echo 'Success'
    fi

    ## Does nothing when no compressed file is found
    find . -name '*.gz' -exec gzip -d {} \;

    ls -lh > inputs_list.txt
}
129
## Compare the input files of the current directory with their benchmark copy.
## Prints 'Same' or the `diff -q` messages, appends 'Same'/'Different' to the
## inputs messenger file and lists differing files in the temporary comments.
## Globals: TRUS_STOR (read, benchmark directory), FILE_INPT (appended),
##          FILE_NOTE (read, names the temp_ comments file)
diff_inputs() {
    local dif file
    local files_list='' mesg='Same'

    ## Simple diff
    for file in 'inputs_list.txt' *namelist_* *.xml cpp_*; do

        ## Pass over unmatched patterns, files missing here and useless file
        ## omission in benchmark directory: none of them counts as a difference
        [[ -e "$file" && -e "${TRUS_STOR}/$file" ]] || continue

        dif=$( diff -q "$file" "${TRUS_STOR}/$file" )

        if [[ -n "$dif" ]]; then
            mesg='Different'
            printf '%s\n' "$dif"
            files_list+="$file "
        fi
    done

    if [[ "$mesg" == 'Same' ]]; then echo "$mesg"; fi
    echo "$mesg" \
        >> "${FILE_INPT}"

    ## List different files for web comment
    if [[ -n "${files_list}" ]]; then
        echo "Inputs  : ${files_list}differ<br>" \
            >> "temp_${FILE_NOTE}"
    fi
}
153
## Wait for the submitted job: poll every 30 s while the status command prints
## something and the time limit is not reached, logging into computation.log.
## Globals: TRUS_JSTA (read, status command; empty output means job is over),
##          TRUS_TOUT (read, time limit in seconds),
##          TRUS_JINF, JOB_INFO, JOB_DELE (read)
## Calls get_out 6 when the time limit has been reached.
job_pending() {
    local outline time_elapsed=0 time_increment=30

    printf -v outline '%100s' ''
    outline=${outline// /#}

    sleep "${time_increment}"

    ## Append a log file while pending
    while [[ $( eval "${TRUS_JSTA}" ) && ${time_elapsed} -lt ${TRUS_TOUT} ]]; do
        printf "\n%s\n" "$outline" \
            >> computation.log

        ## NOTE(review): the guard tests TRUS_JINF but JOB_INFO is evaluated (same
        ## for JOB_DELE below); kept as found, presumably set by the caller - confirm
        if [[ -n "${TRUS_JINF}" ]]; then
            eval "${JOB_INFO}" \
                >> computation.log
        fi

        sleep "${time_increment}"
        time_elapsed=$(( time_elapsed + time_increment ))
    done

    sleep "${time_increment}"

    ## Kill remaining job & stop the test if it's too long
    ## (-ge: the counter steps by 30 and may jump over TRUS_TOUT)
    if [[ ${time_elapsed} -ge ${TRUS_TOUT} ]]; then
        eval "${JOB_DELE}" &> /dev/null
        get_out 6
    fi
}
174
## Compare ocean.output and the *.stat files with their benchmark copy.
## Any difference sets TRUS_RSLT to 'FAILED' and is listed in the temporary
## comments file; 'Same' is printed when nothing differs.
## Globals: TRUS_STOR (read, benchmark directory), TRUS_RSLT (written),
##          FILE_NOTE (read, names the temp_ comments file)
## Calls get_out 7 when a benchmark file is missing.
diff_results() {
    local file
    local files_list='' mesg='Same'

    ## Simple diff
    for file in 'ocean.output' *.stat; do

        ## No stat file at all: the pattern is left unexpanded, nothing to compare
        ## (it used to be reported as a missing benchmark file)
        if [[ "$file" == '*.stat' && ! -e "$file" ]]; then continue; fi

        ## Stop if no benchmark files (ocean.output, eventual stat files)
        if [[ ! -e "${TRUS_STOR}/$file" ]]; then
            TRUS_RSLT='FAILED'
            get_out 7
        fi

        ## Continue even if it differs
        if ! diff -q "$file" "${TRUS_STOR}/$file"; then
            TRUS_RSLT='FAILED'
            mesg='Different'
            files_list+="$file "
        fi
    done

    if [[ "$mesg" == 'Same' ]]; then echo "$mesg"; fi

    ## List different files for web comment
    if [[ -n "${files_list}" ]]; then
        echo "Results : ${files_list}differ<br>" \
            >> "temp_${FILE_NOTE}"
    fi
}
196
## Rebuild the decomposed restart files, then compare those of the benchmark
## time step with their benchmark copy using the $TRUS_CDOD command.
## Differences set TRUS_RSLT to 'FAILED' and are summed up in the temporary
## comments file; 'Same' is printed when no record differs.
## Globals: TRUS_STOR (read, benchmark directory), TRUS_NGCM, TRUS_NPRO (read),
##          TRUS_CDOD (read, diff command with its options), TRUS_RSLT (written),
##          FILE_NOTE (read, names the temp_ comments file)
## Calls get_out 7 (missing benchmark file) or get_out 8 (missing restart).
diff_restart() {
    local base_name comp dif file list_comp list_tmsp nb_dom nb_rec restarts time_step tmsp
    local files_list='' dif_sum=0

    ## Stop if no benchmark files (ie time.step)
    if [[ ! -e "${TRUS_STOR}/time.step" ]]; then
        TRUS_RSLT='FAILED'
        get_out 7
    fi
    ## Character class quoted: it must reach `tr` untouched by the shell
    time_step=$( tr -d '[:space:]' < "${TRUS_STOR}/time.step" )

    ## Find all restart files to rebuild (single scan, reused below)
    restarts=$( find . -regex ".*_restart.*[0-9]\.nc" )

    if [[ -z "$restarts" ]]; then
        TRUS_RSLT='FAILED'
        return 0
    fi

    base_name=$( printf '%s\n' "$restarts" | sed "s#^\./\(.*\)_[0-9]*_restart.*#\1#"       | sort -u )
    list_comp=$( printf '%s\n' "$restarts" | sed "s#^.*\(restart[a-z_]*\)_[0-9].*\.nc#\1#" | sort -u )
    list_tmsp=$( printf '%s\n' "$restarts" | sed "s#^.*\([0-9]\{8\}\)_restart.*#\1#"       | sort -u )

    ## Loop on each time step
    for tmsp in ${list_tmsp}; do

        for comp in ${list_comp}; do
            file=${base_name}_${tmsp}_${comp}
            nb_dom=$(( $( find . -name "${file}_[0-9]*.nc" | wc -l ) ))

            if   [[ ${nb_dom} -gt 1 ]]; then
                ## Possibility of remaining decomposed restarts (even after rebuild):
                ## they are removed only when the rebuild succeeded
                if "${TRUS_NGCM}/TOOLS/REBUILD_NEMO/rebuild_nemo" -t ${TRUS_NPRO} "$file" "${nb_dom}" \
                       > /dev/null; then
                    rm -f "${file}"_[0-9]*.nc \
                       > /dev/null
                fi

            elif [[ ${nb_dom} -eq 0 ]]; then
                TRUS_RSLT='FAILED'
                get_out 8
            fi

            ## Compare restart files at same time step only; `[` is kept because
            ## it reads the zero-padded time step as a decimal number
            [ "$tmsp" -eq "${time_step}" ] 2> /dev/null || continue

            ## Stop if no benchmark files (restart file)
            if [[ ! -e "${TRUS_STOR}/$file.nc" ]]; then
                TRUS_RSLT='FAILED'
                get_out 7
            fi

            ## UNIX `cmp` not suitable (timestamp in .nc file)
            dif=$( ${TRUS_CDOD} "$file.nc" "${TRUS_STOR}/$file.nc" 2> /dev/null \
                   | awk '/records/ {print $0}' | sed '2 s/^/,/' | tr -d '\n' )

            ## CDO can return void stdout with no difference
            [[ -n "$dif" ]] || continue
            nb_rec=$( printf '%s\n' "$dif" | awk '{print $1}' )

            if [[ ${nb_rec} -ne 0 ]]; then
                TRUS_RSLT='FAILED'
                files_list+="$comp "
                dif_sum=$(( dif_sum + nb_rec ))
                echo "$file.nc: $dif"
            fi

        done

    done

    ## List different files for web comment with sum of different records
    if [[ ${dif_sum} -ne 0 ]]; then
        echo "Restarts: ${files_list}${dif_sum} record(s) differ<br>" \
            >> "temp_${FILE_NOTE}"
    else
        echo 'Same'
    fi

}
275
## Print the elapsed time of the job and append it to the time messenger file.
## Does nothing when no timing command is configured.
## Globals: TRUS_JTIM (read, command evaluated to get the time),
##          FILE_TIME (appended)
get_time() {
    [[ -z "${TRUS_JTIM}" ]] && return

    ## Interest for checking unusual time computation
    local time_cpu
    time_cpu=$( eval "${TRUS_JTIM}" )

    printf "Elapsed time: "
    ## Left unquoted on purpose: the value is flattened to a single line
    echo ${time_cpu} | tee -a "${FILE_TIME}"
}
[5472]285
## Print the maximum physical/virtual memory used by the job and append it to
## the memory messenger file. Does nothing when neither command is configured.
## Globals: TRUS_JPME, TRUS_JVME (read, commands evaluated to get the values),
##          FILE_MEMY (appended)
get_memy() {
    [[ -z "${TRUS_JPME}" && -z "${TRUS_JVME}" ]] && return

    ## Interest for checking unusual memory usage
    local memory_pmax memory_vmax
    memory_pmax=$( eval "${TRUS_JPME}" )
    memory_vmax=$( eval "${TRUS_JVME}" )

    printf "Memory max usage (physical/virtual): "
    ## Values left unquoted on purpose: the report is flattened to a single line
    echo ${memory_pmax}' / '${memory_vmax} | tee -a "${FILE_MEMY}"
}
295
## Extract the first message of the given kind from ocean.output, print it and
## append it (followed by '<br>') to the temporary comments file.
##   $1 - message pattern, 'E R R O R' or 'W A R N I N G'
## Globals: FILE_NOTE (read, names the temp_ comments file)
comments() {
    local line='' state=$1
    local -a opts words

    if [[ -e ocean.output ]]; then
        if [[ "$state" == 'E R R O R' ]]; then
            ## Pattern anywhere in the line with the 4 following lines
            opts=( -A4 -e "$state" )
        else
            ## 'W A R N I N G' pattern by default: after one leading space,
            ## with the 2 following lines
            opts=( -A2 -e "^ $state" )
        fi

        ## Select first occurence for web comment
        line=$( grep -m1 "${opts[@]}" ocean.output | tr -d '\n' )
    fi

    if [[ -n "$line" ]]; then
        ## Blanks squeezed for the terminal, without expanding '*' and the like
        read -ra words <<< "$line"
        echo "${words[*]}"

        ## Message passed as data, never as printf format ('%' and '\' are common)
        printf '%s<br>' "$line" \
            >> "temp_${FILE_NOTE}"
    fi
}
312
## Finalize the comments messenger file, then merge all messenger files
## (mesg_*.txt of the current directory) column-wise with ';' as separator;
## the result is printed and written to the trusting logfile.
## Globals: FILE_NOTE (appended), TRUS_FILE (overwritten)
log_make() {
    ## Format comments for web: everything on one line, last '<br>' dropped
    if [[ -e "temp_${FILE_NOTE}" ]]; then
        tr -d '\n' < "temp_${FILE_NOTE}" | sed 's/<br>$//' \
            >> "${FILE_NOTE}"
    fi

    ## Construct txt file with all messenger files
    paste -d ';' mesg_*.txt | tee "${TRUS_FILE}"
}
321
## In production mode, append the trusting digest to the logfile kept in the
## benchmark directory and, when the test FAILED and recipients are set, send a
## notification e-mail.
## Globals: TRUS_PROD (read, 1 enables production mode), TRUS_STOR, TRUS_FILE,
##          PATTERNAME, TRUS_MAIL (read, recipients list), TRUS_RSLT, TRUS_RORR,
##          TRUS_CONF, TRUS_REFE, TRUS_BRAN, TRUS_USER, TRUS_HPCC, TRUS_SCRA,
##          TRUS_ARCH (read)
## Files:   reads model.log, writes trusting.mail in the current directory
prod_publish() {
    local cmd rev
    rev=$( awk '/NEMOGCM/ {print $NF}' model.log )

    ## Production mode (-p|--prod) only; an empty TRUS_PROD counts as off
    [[ ${TRUS_PROD} -eq 1 ]] || return 0

    ## Create or append trusting logfile: the header line is only copied once
    if [[ -f "${TRUS_STOR}/trusting_$PATTERNAME.txt" ]]; then cmd='tail -1'; else cmd='cat'; fi

    $cmd "${TRUS_FILE}" \
        >> "${TRUS_STOR}/trusting_$PATTERNAME.txt"

    ## Send mail only when FAILED
    if [[ -n "${TRUS_MAIL}" && "${TRUS_RSLT}" == 'FAILED' ]]; then

        ## Content
        cat > trusting.mail <<END_MAIL
Dear all,


The trusting sequence has not completed successfully on new configuration ${TRUS_CONF} based on ${TRUS_REFE}.

Here is the model summary:
$( cat model.log )

First checking would be on the trusting environment files:
${TRUS_USER}.cfg & ${TRUS_HPCC}.cfg

For more details, look into the testing folder at:
${TRUS_SCRA}

An archive has been created to share the questionable configuration for further studies:
${TRUS_STOR}/${TRUS_ARCH}

END_MAIL

        ## Send with detailed subject (recipients list is split on purpose)
        mail -s "[NEMO Trusting][$rev][${TRUS_BRAN}][${TRUS_REFE}] ${TRUS_RSLT} ${TRUS_RORR}" ${TRUS_MAIL} \
            < trusting.mail
    fi
}
367
## End the test: translate the error code into a message, update the result
## and state messenger files, archive the configuration when a production test
## failed, build the digest, publish it and exit.
##   $1 - error code (1-8, see the case below); ignored unless TRUS_RSLT is 'FAILED'
## Globals: TRUS_RORR (written), TRUS_RSLT, TRUS_SCRA, TRUS_TOUT, TRUS_PROD,
##          TRUS_STOR, TRUS_ARCH, TRUS_NGCM, TRUS_CONF, FILE_RSLT, FILE_STAT (read)
## Never returns: the shell always exits with status 0.
get_out() {
    local time_step=0

    TRUS_RORR=$1

    printf "\n\nEnd of test\n"

    ## In case of compilation error
    cd "${TRUS_SCRA}" || echo "Cannot change to directory '${TRUS_SCRA}'" >&2

    if [[ "${TRUS_RSLT}" == 'FAILED' ]]; then
        echo 'Failure'

        ## Error identification
        case "${TRUS_RORR}" in
            ## Compilation
            '1') TRUS_RORR='XIOS compilation failed' ;;
            '2') TRUS_RORR='NEMO compilation failed' ;;
            ## Submission
            '3') TRUS_RORR='Missing input files'     ;;
            '4') TRUS_RORR='Job submission error'    ;;
            ## Computation
            '5') TRUS_RORR='Crashed at time step'    ;;
            '6') TRUS_RORR='Exceeded time limit'     ;;
            ## Results
            '7') TRUS_RORR='Missing previous outputs';;
            '8') TRUS_RORR='New outputs differ'      ;;
            ## Other (unquoted pattern: a quoted '*' only matches a literal asterisk)
            *)   TRUS_RORR='Unknown error'           ;;
        esac

    else
        echo 'Success'
        TRUS_RORR='Code is reliable'
    fi

    ## Eventual comments from ocean.output
    if [[ "${TRUS_RORR}" == 'Crashed at time step' ]]; then
        comments 'E R R O R'
        ## Pattern quoted so that the shell cannot expand it to file names
        if [[ -e time.step ]]; then time_step=$( grep -o '[0-9]*' time.step | head -n 1 ); fi
        TRUS_RORR+=" $time_step"
    else
        comments 'W A R N I N G'
        if [[ "${TRUS_RORR}" == 'Exceeded time limit' ]]; then
            TRUS_RORR+=" $(( TRUS_TOUT / 3600 ))h"
        fi
    fi

    ## Last messenger files: 2nd line replaced by the final result and state
    #export TRUS_RORR
    sed -i "2 s/.*/$TRUS_RSLT/" "${FILE_RSLT}"
    sed -i "2 s/.*/$TRUS_RORR/" "${FILE_STAT}"

    ## Save tested configuration if trusting failed in production mode (-p|--prod)
    if [[ "${TRUS_RSLT}" == 'FAILED' && ${TRUS_PROD} -eq 1 ]]; then
        echo "Creating archive ${TRUS_ARCH} under ${TRUS_STOR}"
        tar -czf "${TRUS_STOR}/${TRUS_ARCH}"               *                      \
            -C   "${TRUS_NGCM}/CONFIG/${TRUS_CONF}/MY_SRC" .                      \
            -C   "${TRUS_NGCM}/CONFIG/${TRUS_CONF}"        "cpp_${TRUS_CONF}.fcm"
    fi

    ## Logfile construct & eventual sending of notification email
    printf "\nTrusting digest:\n----------------\n"
    log_make
    prod_publish

    exit 0
}
Note: See TracBrowser for help on using the repository browser.