Changeset 5509


Ignore:
Timestamp:
2015-06-30T10:49:09+02:00 (5 years ago)
Author:
nicolasmartin
Message:

dev_r5092_CNRS_SETTE Error management redefinition

Location:
branches/2015/dev_r5092_CNRS_SETTE/NEMOGCM/TRUST
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • branches/2015/dev_r5092_CNRS_SETTE/NEMOGCM/TRUST/trusting.sh

    r5485 r5509  
    66NEMO_TRUS=$( pwd $( dirname $0 ) ) 
    77DEBUG=0; PUBLISH=0 
    8 STATUS='FAILED'; STEP='' # 'FAILED' by default 
     8ST='FAILED'; ERR=0 # 'FAILED' by default 
    99 
    1010# Get options for replacing some initials settings 
     
    6464cd ${DIR_XIOS} 
    6565#./make_xios --full --arch $ARCH -job $NPROC >& /dev/null 
    66 [ ! -e ${DIR_XIOS}/lib/libxios.a ] && get_out $STATUS $STEP 
     66[ ! -e ${DIR_XIOS}/lib/libxios.a ] && get_out $ST ERR=1 
    6767 
    6868# NEMO config compiled from scratch 
     
    7272#[ -d ${TEST_CONF} ] && ./makenemo -n ${TEST_CONF} clean 
    7373./makenemo -n ${TEST_CONF} -r ${REFE_CONF} -m $ARCH -j $NPROC >& /dev/null 
    74 [ ! -e ${TEST_CONF}/BLD/bin/nemo.exe ] && get_out $STATUS $STEP 
     74[ ! -e ${TEST_CONF}/BLD/bin/nemo.exe ] && get_out $ST ERR=2 
    7575 
    7676# Get namelists, xml & forced files for running 
     
    8080find ${NEMO_CONF}/${TEST_CONF}/EXP00 -regex '.*\(cfg\|opa\|ref\|xml\)' -exec cp {} . \; 
    8181get_inputs 
    82 [ $? -ne 0 ] && get_out $STATUS $STEP 
     82[ $? -ne 0 ] && get_out $ST ERR=3 
    8383[ $( find . -name '*.gz' -print -quit ) ] && gunzip *.gz 
    8484 
     
    9292print_step 'Submitting job' 
    9393JOB_ID=$( eval ${JOB_SUBM} ) 
    94 [ $? -ne 0 ] && get_out $STATUS $STEP 
     94[ $? -ne 0 ] && get_out $ST ERR=4 
    9595print_step 'Holding-Running job' 
    9696job_pending 
     
    102102if   [[ ! -e time.step || $( grep -q 'E R R O R' ocean.output ) ]]; then 
    103103    comments 'E R R O R' 
    104     get_out $STATUS $STEP 
     104    get_out $ST ERR=5 
    105105else 
    106106    # Get time computation 
     
    110110fi 
    111111 
    112 STATUS='OK' # 'OK' by default 
     112ST='OK' # 'OK' by default 
    113113# Inspect output text files 
    114114#--------------------------------------------------- 
    115115print_step 'Test ASCII output files diff' 
    116116diff_textfiles 
    117 [ "$STATUS" == 'FAILED' ] && get_out $STATUS $STEP 
     117[ "$ST" == 'FAILED' ] && get_out $ST ERR=8 
    118118 
    119119# Inspect output NetCDF files 
     
    121121print_step 'Test last restart NetCDF files diff' 
    122122diff_restart 
    123 [ "$STATUS" == 'FAILED' ] && get_out $STATUS $STEP 
     123[ "$ST" == 'FAILED' ] && get_out $ST ERR=10 
    124124 
    125125# Get comments (ocean.output & diff model.log) 
     
    129129# End 
    130130#--------------------------------------------------- 
    131 [ "$STATUS" == 'OK' ] && STEP='Code is reliable' 
    132 get_out $STATUS $STEP 
     131[ "$ST" == 'OK' ] && ERR='Code is reliable' 
     132get_out $ST $ERR 
  • branches/2015/dev_r5092_CNRS_SETTE/NEMOGCM/TRUST/trusting_func.sh

    r5485 r5509  
    33 
    44comments() { 
    5     state=$1 
     5    state=$1; LAST_TIME_STEP=0 
    66 
    77    if [ "$state" == 'E R R O R' ]; then 
    8    if [ -e time.step ]; then 
    9        last_time_step=$( cat time.step | tr -d [:space:] ) 
    10        STEP='nemo.exe crashed at '${last_time_step} && export STEP 
    11    else 
    12        STEP='nemo.exe crashed at initialization'    && export STEP 
    13    fi 
     8   [ -e time.step ] && LAST_TIME_STEP=$( cat time.step | tr -d [:space:] ) 
     9   export ${LAST_TIME_STEP} ERR=1 
    1410    fi 
    1511 
     
    2218} 
    2319 
     20 
    2421diff_inputs() { 
    2522    mesg='Same' 
    26  
    2723    for file in inputs_list.txt $( ls namelist_* ) $( ls *.xml ); do 
    28    diff $file ${REFE_DIR}/$file 
     24   diff -q $file ${REFE_DIR}/$file 
    2925   [ $? -ne 0 ] && mesg='Different' 
    3026    done 
     
    3329} 
    3430 
     31diff_results() { 
     32    for file in ocean.output $( ls *.stat ); do 
     33   [ ! -e ${REFE_DIR}/$file ] && export ST='FAILED' && get_out $ST ERR=7 
     34   diff -q $file ${REFE_DIR}/$file 
     35   [ $? -ne 0 ] && export ST='FAILED' 
     36    done 
     37} 
     38 
    3539diff_restart() { 
    36  
    37     if   [ ! -e  ${REFE_DIR}/time.step ]                 ; then 
    38    echo "Seems that there is no restart files for comparison: no time.step in ${REFE_DIR}" 
    39    return 
    40     elif [ $( diff -q time.step ${REFE_DIR}/time.step ) ]; then 
    41    printf "Seems that the trusting test ends at a different time step: " 
    42    printf "${TEST_DIR} $( cat ${TEST_DIR}/time.step | tr -d [:space:] ) != " 
    43    printf "${REFE_DIR} $( cat ${REFE_DIR}/time.step | tr -d [:space:] )  \n" 
    44    return 
    45     fi 
    46  
    47     last_time_step=$( cat ${REFE_DIR}/time.step | tr -d [:space:] ) 
    48     echo 'Last time step of standard run: '${last_time_step} 
    49  
    50     if [ $( find -name "*${last_time_step}_restart*.nc" -print -quit ) ]; then 
    51    base_name=$( find -name "*${last_time_step}_restart*.nc" -print -quit | awk -F/ '{print $NF}' \ 
    52                 | sed "s/^\(.*\)$last_time_step\_restart.*/\1$last_time_step\_/"                  ) 
     40    [ ! -e  ${REFE_DIR}/time.step ] && export ST='FAILED' && get_out $ST ERR=9 
     41 
     42    export LAST_TIME_STEP=$( cat ${REFE_DIR}/time.step | tr -d [:space:] ) 
     43    echo 'Last time step of standard run: '${LAST_TIME_STEP} 
     44 
     45    if [    $( find -name "*${LAST_TIME_STEP}_restart*.nc" -print -quit ) \ 
     46    && $( diff -q     time.step ${REFE_DIR}/time.step              )  ]; then 
     47   base_name=$( find -name "*${LAST_TIME_STEP}_restart*.nc" -print -quit | awk -F/ '{print $NF}' \ 
     48                | sed "s/^\(.*\)$LAST_TIME_STEP\_restart.*/\1$LAST_TIME_STEP\_/"                  ) 
    5349 
    5450   for comp in restart restart_ice restart_trc; do 
     
    6662 
    6763      if [ ! -z "${nc_diff}" ]; then 
    68           export STATUS='FAILED' STEP="$file.nc different" 
     64          export ST='FAILED' 
    6965          printf "$CDO ${nc_diff}\n" 
    7066      else 
     
    7369 
    7470       else 
    75       printf "No previous $file.nc found for comparison\n" 
     71      export ST='FAILED' 
    7672       fi 
    7773 
     
    7975 
    8076    else 
    81    export STATUS='FAILED'; STEP='No restart files found at last time step' 
    82    printf "No $file.nc found for comparison\n" 
    83     fi 
    84  
    85 } 
    86  
    87 diff_textfiles() { 
    88     for file in ocean.output $( ls *.stat ); do 
    89  
    90    if [ -e ${REFE_DIR}/$file ]; then 
    91        diff $file ${REFE_DIR}/$file 
    92        [ $? -ne 0 ] && export STATUS='FAILED' STEP="$file different" 
    93    else 
    94        printf "No previous $file found for comparison\n" 
    95    fi 
    96  
    97     done 
     77   export ST='FAILED' 
     78    fi 
     79 
    9880} 
    9981 
     
    11698get_soft_rel() { 
    11799    for rel in $CDO $COMPILER $MPI $NETCDF; do 
    118                                  arch_rel=$( echo $LOADEDMODULES | sed  "s#.*$rel/\([^:]*\).*#\1#" ) 
    119    [ -z "${arch_rel}"  ] && arch_rel=$( echo $PATH          | sed  "s#.*$rel/\([^/]*\).*#\1#" ) 
     100                                     arch_rel=$( echo $LOADEDMODULES | sed  "s#.*$rel/\([^:]*\).*#\1#" ) 
     101   [ -z "${arch_rel}"  ]     && arch_rel=$( echo $PATH          | sed  "s#.*$rel/\([^/]*\).*#\1#" ) 
    120102   [ "$rel" == "$COMPILER" ] && arch_rel=$( $rel --version      | grep -m1 -o ' [0-9.]* '         ) 
    121103   echo $rel ${arch_rel} >> model.log 
     
    133115    if [[    $( echo ${NEMO_VERS} | grep  "HEAD\|up\|update"   ) \ 
    134116     || $( echo ${NEMO_VERS} | tr -d '[:alpha:][:punct:]' )  ]]; then 
    135    rev=$( echo ${NEMO_VERS} | tr -d '[:alpha:][:punct:]' ) || rev='HEAD' 
     117    rev=$( echo ${NEMO_VERS} | tr -d '[:alpha:][:punct:]' ) || rev='HEAD' 
    136118   svn_cmd='svn update -r '$rev  
    137119    else 
     
    161143 
    162144    echo   'NEMOGCM '$rev   >> model.log 
    163 #   printf "NEMOGCM rev.\n$rev\n" >  mesg_04_nemogcm_${CFG_USER}_${CFG_ARCH}.txt 
    164145    printf "NEMOGCM rev.\n" >  mesg_04_nemogcm_${CFG_USER}_${CFG_ARCH}.txt 
    165146    printf "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/$rev\" target=\"_blank\">$rev</a>" \ 
     
    181162 
    182163  # Send mail only when FAILED 
    183   if [[ ! -z $EMAIL && "$STATUS" == 'FAILED' ]]; then 
     164  if [[ ! -z $EMAIL && "$ST" == 'FAILED' ]]; then 
    184165      cat << END_MAIL > trusting.mail 
    185166XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 
     
    194175 
    195176END_MAIL 
    196 #`tail -n 1 ${TEST_DIR}/mesg_03_step_${CFG_USER}_${CFG_ARCH}.txt` 
     177#`tail -n 1 ${TEST_DIR}/mesg_03_error_${CFG_USER}_${CFG_ARCH}.txt` 
    197178      if [ -e ${TEST_DIR}/trusting_${DATESTR}_${CFG_USER}_${CFG_ARCH}.txt ]; then 
    198179     cat ${TEST_DIR}/trusting_${DATESTR}_${CFG_USER}_${CFG_ARCH}.txt  >> trusting.mail 
    199180      fi 
    200       mail -s "[trusting ${REFE_CONF}] $STATUS $STEP" $EMAIL  <  trusting.mail 
     181      mail -s "[trusting ${REFE_CONF}] $ST $ERR" $EMAIL  <  trusting.mail 
    201182  fi 
    202183 
     
    204185} 
    205186 
    206 print_step() { 
    207     [ ! -z "$1" ] && STEP=$1 
    208     export STEP && printf "Step.....\n$STEP\n" 
    209 } 
     187print_step() { printf "Step.....\n$1\n"; } 
    210188 
    211189get_out() { 
     190    printf "Status\n$ST\n"  > mesg_02_status_${CFG_USER}_${CFG_ARCH}.txt 
     191 
    212192    # Save tested configuration if trusting failed 
    213     if [ "$STATUS" == 'FAILED' ]; then 
     193    if [ "$ST" == 'FAILED' ]; then 
    214194   cd ${TEST_DIR} 
    215195   printf "Input files\n\n"   > mesg_09_inputfiles_${CFG_USER}_${CFG_ARCH}.txt 
     
    220200   fi 
    221201 
     202        # Error identification 
     203   case ERR in 
     204       # Compilation 
     205       1) ERR='XIOS compilation failed'            ;;  2) ERR='NEMO compilation failed'            ;; 
     206       # Submission 
     207       3) ERR='Missing input files'                 ;;  4) ERR='Job submission error'               ;; 
     208       # Running 
     209       5) ERR='nemo.exe crashed at '${LAST_TIME_ERR};;  6) ERR='Exceeded time limit '${TIME_LIMI}'h';; 
     210       # Results 
     211       7) ERR='Missing previous outputs '           ;;  8) ERR='New outputs  differ/missing'        ;; 
     212       9) ERR='Missing previous restarts'           ;; 10) ERR='New restarts differ/missing'        ;; 
     213       # Undefined 
     214       *) ERR='Unknown error'                       ;; 
     215   esac 
     216 
    222217   #tar -czf ${REFE_DIR}/trusting_${DATESTR}_${CFG_USER}_${CFG_ARCH}.tar.gz * 
    223218    fi 
    224219 
    225     printf "Status\n$STATUS\n"  > mesg_02_status_${CFG_USER}_${CFG_ARCH}.txt 
    226     printf "Step.....\n$STEP\n" > mesg_03_step_${CFG_USER}_${CFG_ARCH}.txt 
    227  
    228     mesg_make; mesg_publish 
     220    printf "Result.....\n$ERR\n" > mesg_03_result_${CFG_USER}_${CFG_ARCH}.txt 
     221 
     222    mesg_make 
     223    mesg_publish 
    229224 
    230225    exit 1 
     
    233228job_pending() { 
    234229    time_elapsed=0; time_increment=30 
     230 
    235231    sleep ${time_increment} 
    236  
    237232    while [[ $( ${JOB_LIST} | grep ${JOB_ID} ) && ${time_elapsed} -lt ${TIME_LIMI} ]]; do 
    238233   printf "\n####################################################" >> computation.log 
    239234   ${JOB_INFO} ${JOB_ID}                                           >> computation.log 
    240235   sleep ${time_increment} 
    241    let time_elapsed+=${time_increment} 
    242     done 
    243  
     236#  let time_elapsed+=${time_increment} 
     237   time_elapsed=$(( ${time_elapsed} + ${time_increment} )) 
     238    done 
    244239    sleep ${time_increment} 
    245240 
    246241    if [ ${time_elapsed} -eq ${TIME_LIMI} ]; then 
    247    STEP='Exceeded time limit' 
    248    [ $( ${JOB_LIST} | grep ${JOB_ID} ) ] && STEP='Job not finished on time: '$(( ${TIME_LIMI}/3600 ))'h' 
    249    print_step 
    250242   ${JOB_DELE} ${JOB_ID} &> /dev/null 
    251    get_out $STATUS $STEP 
    252     fi 
    253 } 
     243   TIME_LIMIT=$(( ${TIME_LIMI} / 3600 )) 
     244   export ${TIME_LIMIT} 
     245   get_out $ST ERR=6 
     246} 
Note: See TracChangeset for help on using the changeset viewer.