1 | #!/bin/bash |
---|
2 | |
---|
3 | |
---|
## Messenger filenames
## One file per column of the trusting summary. The two-digit prefix fixes the
## column order when the files are globbed as mesg_*.txt (see log_make).
file_date=mesg_01_date.txt
file_rslt=mesg_02_result.txt
file_stat=mesg_03_status.txt
file_nemo=mesg_04_nemo.txt
file_xios=mesg_05_xios.txt
file_cmpf=mesg_06_compiler.txt
file_lmpi=mesg_07_mpi.txt
file_ncdf=mesg_08_netcdf.txt
file_inpt=mesg_09_inputs.txt
file_time=mesg_10_time.txt
file_memy=mesg_11_memory.txt
file_note=mesg_12_comments.txt
---|
11 | |
---|
12 | |
---|
13 | ## Functions in order of use |
---|
## Print a step banner: blank line, 'Step.....', the title, then a dashed rule.
## Arguments: $1 - step title
## Outputs:   banner on stdout
## The rule is as wide as the byte count of the title plus the newline that
## echo appends (so one dash longer than the title).
print_step() {
    local title=$1
    local width rule

    width=$( echo "$title" | wc -c )

    ## Blank-padded string of the wanted width, then blanks turned into dashes
    printf -v rule "%${width}s" ''

    printf "\nStep.....\n%s\n%s\n" "$title" ${rule// /-}
}
---|
20 | |
---|
## Create (or truncate) every messenger file with its column title on line 1,
## then append the default second line of the result and status files.
## Globals: file_* (read), TRUST_FLAG_RESULT (read)
init_files() {
    local idx
    local -a titles=( 'Date'             'Result'        'Status'
                      'NEMOGCM rev.'     'XIOS rev.'     'Fortran compiler'
                      'MPI libs'         'NetCDF libs'   'Input files'
                      'Elapsed time'     'Memory (Phy./Virt.)' 'Comments' )
    local -a targets=( "${file_date}" "${file_rslt}" "${file_stat}"
                       "${file_nemo}" "${file_xios}" "${file_cmpf}"
                       "${file_lmpi}" "${file_ncdf}" "${file_inpt}"
                       "${file_time}" "${file_memy}" "${file_note}" )

    for idx in "${!targets[@]}"; do
        echo "${titles[idx]}" > "${targets[idx]}"
    done

    ## Current result flag with 'Unknown error' status by default
    echo ${TRUST_FLAG_RESULT} >> ${file_rslt}
    echo 'Unknown error'      >> ${file_stat}
}
---|
41 | |
---|
## Append the test date, converted to UTC, to the date messenger file.
## Globals: TRUST_TEST_DATE (read, any string accepted by 'date -d'),
##          file_date (appended)
get_date() {
    local stamp

    ## UTC time zone for timestamping: 'YYYY-MM-DD HH:MM ZONE'
    stamp=$( date -u -d "${TRUST_TEST_DATE}" +'%F %R %Z' )

    echo $stamp >> ${file_date}
}
---|
49 | |
---|
## Record the SVN revisions of NEMO and XIOS.
## Globals: TRUST_SVN_CO (read, list of NEMO sub-directories),
##          TRUST_IO_XIOS (read, XIOS working copy),
##          TRUST_DIR_NEMOGCM (read), TRUST_SVN_ACTION (read, optional command
##          run on each NEMO directory), file_xios / file_nemo (appended)
## Outputs: each processed NEMO directory name on stdout; 'XIOS <rev>' and
##          'NEMOGCM <rev>' appended to model.log; an HTML changeset link
##          appended to each of the two messenger files
get_nemo_rev() {
    local dir rev_loc
    local rev=0

    ## Loop on essential NEMO directories
    for dir in ${TRUST_SVN_CO} ${TRUST_IO_XIOS}; do

        ## For time being, just get revision from XIOS with no action on directory
        if [ "$dir" == "${TRUST_IO_XIOS}" ]; then
            rev_loc=$( svn info "$dir" | awk '/Last Changed Rev/ {print $NF}' )
            echo "XIOS ${rev_loc}" \
                >> model.log
            ## The redirection must belong to the echo: on its own line it only
            ## touches the file and the link is lost on stdout
            echo "<a href=\"https://forge.ipsl.jussieu.fr/ioserver/changeset/${rev_loc}\" target=\"_blank\">${rev_loc}</a>" \
                >> "${file_xios}"
            continue
        fi

        echo "$dir"
        ## Without an action the directory itself would be run as a command
        if [ -n "${TRUST_SVN_ACTION}" ]; then
            ${TRUST_SVN_ACTION} "${TRUST_DIR_NEMOGCM}/$dir"
        fi

        rev_loc=$( svn info "${TRUST_DIR_NEMOGCM}/$dir" \
                   | awk '/Last Changed Rev/ {print $NF}' )

        ## Keep last rev. nb (ignore a missing or non-numeric answer from svn)
        if [[ "${rev_loc}" =~ ^[0-9]+$ ]] && [ "${rev_loc}" -gt "$rev" ]; then
            rev=${rev_loc}
        fi
    done

    echo "NEMOGCM $rev" \
        >> model.log
    echo "<a href=\"https://forge.ipsl.jussieu.fr/nemo/changeset/$rev\" target=\"_blank\">$rev</a>" \
        >> "${file_nemo}"
}
---|
80 | |
---|
## Record the release of the Fortran compiler, MPI, NetCDF and CDO.
## Globals: TRUST_JOB_ENV (read, environment file that is sourced),
##          TRUST_COMPILE_FORTRAN / TRUST_COMPILE_MPI / TRUST_COMPILE_NETCDF /
##          TRUST_IO_CDO (read, name patterns), PATH (read),
##          file_cmpf / file_lmpi / file_ncdf (appended)
## Outputs: one line per non-empty tool appended to model.log; the same line
##          appended to the matching messenger file (CDO has none)
## Each messenger file is fed from the value computed for its own tool rather
## than from a line number of model.log, so an empty tool variable or stale
## lines in model.log cannot shift a release into the wrong column.
get_soft_rel() {
    local ver str idx
    local -a softs targets

    ## Sourcing environment
    . "${TRUST_JOB_ENV}"

    softs=(   "${TRUST_COMPILE_FORTRAN}" "${TRUST_COMPILE_MPI}"
              "${TRUST_COMPILE_NETCDF}"  "${TRUST_IO_CDO}" )
    targets=( "${file_cmpf}" "${file_lmpi}" "${file_ncdf}" '' )

    for idx in "${!softs[@]}"; do
        str=${softs[idx]}
        [ -z "$str" ] && continue

        ## Extract version number after searching pattern in PATH env. variable
        ## ('-n ... p': print nothing, instead of the whole PATH, if no match)
        ver=$( printf '%s\n' "$PATH" | sed -n "s|.*\($str[0-9.]*\).*|\1|p" )

        ## option --version would work for main Fortran compilers and CDO
        ## (exact comparison: a regex alternation with an empty CDO name
        ##  would match every tool)
        if [[ "$str" == "${TRUST_COMPILE_FORTRAN}" || "$str" == "${TRUST_IO_CDO}" ]]; then
            ver=$( $str --version 2>&1 | grep -m1 -oe '\<[0-9. ]*\>' \
                   | xargs echo $str )
        fi

        ## Cleaning characters string to display proper soft name
        ver=$( echo $ver | sed 's|[/-]| |g' )

        echo $ver \
            >> model.log

        if [ -n "${targets[idx]}" ]; then
            echo $ver \
                >> "${targets[idx]}"
        fi
    done
}
---|
117 | |
---|
## Fetch the input files into the current directory and list them.
## Globals: TRUST_IO_FORC_PATH (read, directory holding the inputs),
##          TRUST_IO_FORC_TAR  (read, optional list of archives in that
##          directory)
## Outputs: listing appended to inputs_list.txt; files extracted or copied
##          into the current directory; every *.gz found below it is
##          decompressed
## The paths are evaluated with eval, as before, so that shell syntax stored
## in the two variables is still expanded.
## NOTE(review): eval executes whatever these variables contain - they must
## come from a trusted configuration.
get_inputs() {
    local archive

    if [ -n "${TRUST_IO_FORC_TAR}" ]; then
        ## List content of all archives first, then extract them
        eval "
            for archive in ${TRUST_IO_FORC_TAR}; do
                tar -tvf ${TRUST_IO_FORC_PATH}/\$archive >> inputs_list.txt
            done
            for archive in ${TRUST_IO_FORC_TAR}; do
                tar -vxf ${TRUST_IO_FORC_PATH}/\$archive > /dev/null
            done
        "
    else
        ## List & copy files without archive
        ## (a command kept in a string and run as \${var} would pass '>>' as a
        ##  plain argument and look for a command literally named '\cp')
        eval "ls -lh ${TRUST_IO_FORC_PATH}/* >> inputs_list.txt"
        eval "command cp ${TRUST_IO_FORC_PATH}/* ."
    fi

    ## Decompress any gzipped input (no-op when there is none)
    find . -name '*.gz' -exec gzip -d {} \;
}
---|
157 | |
---|
## Compare the input files of the run with those of the benchmark directory.
## Globals: TRUST_TEST_BENCHMARK (read), file_inpt (appended),
##          file_note (read, names the temp_<file_note> comment file)
## Outputs: 'diff -q' message of each differing file, or 'Same', on stdout;
##          'Same' or 'Different' appended to the inputs messenger file; list
##          of differing files appended to the temporary comment file
diff_inputs() {
    local report candidate
    local changed='' verdict='Same'

    ###################################
    ## Think of copying initial test ##
    ###################################

    ## Simple diff
    for candidate in cpp_* 'inputs_list.txt' *namelist_* *.xml; do

        ## A file without benchmark counterpart is passed over ('0' marker)
        if [ -e ${TRUST_TEST_BENCHMARK}/$candidate ]; then
            report=$( diff -q $candidate ${TRUST_TEST_BENCHMARK}/$candidate )
        else
            report=0
        fi

        ## Nothing to report: identical file or useless omission in benchmark
        case "$report" in
            ''|'0') continue ;;
        esac

        verdict='Different'
        echo $report
        changed+=$candidate' '
    done

    if [ $verdict == 'Same' ]; then
        echo $verdict
    fi
    echo $verdict >> ${file_inpt}

    ## List different files for web comment
    [ -n "${changed}" ] && echo 'Inputs : '${changed}'differ<br>' \
        >> temp_${file_note}
}
---|
194 | |
---|
## Wait for the submitted job, polling its state every 30 s.
## Globals: TRUST_JOB_STATE (read, command printing something while the job
##          is alive), TRUST_JOB_INFO (read, optional command whose output is
##          logged), TRUST_JOB_TIMEOUT (read, seconds), TRUST_JOB_KILL (read,
##          command killing the job)
## Outputs: a separator of 100 '#' and the job information appended to
##          computation.log at each poll
## Calls get_out 6 when the job is still alive once the time limit is reached.
job_pending() {
    local outline=$( printf "%100s" ) time_elapsed=0 time_increment=30
    local timed_out=0

    sleep ${time_increment}

    ## Append a log file while pending
    while [[ $( eval ${TRUST_JOB_STATE} ) ]]; do

        ## Job still alive at the limit: this, and only this, is a timeout.
        ## Comparing the counter with '-eq' after the loop misses it when the
        ## limit is not a multiple of the increment, and fires by mistake when
        ## the job ends exactly as the counter reaches the limit.
        if [[ ${time_elapsed} -ge ${TRUST_JOB_TIMEOUT} ]]; then
            timed_out=1
            break
        fi

        printf "\n%s\n" ${outline// /#} \
            >> computation.log
        [ -n "${TRUST_JOB_INFO}" ] && eval ${TRUST_JOB_INFO} \
            >> computation.log
        sleep ${time_increment}
        time_elapsed=$(( time_elapsed + time_increment ))
    done

    sleep ${time_increment}

    ## Kill remaining job & stop the test if it's too long
    if [ ${timed_out} -eq 1 ]; then
        eval ${TRUST_JOB_KILL} &> /dev/null
        get_out 6
    fi

}
---|
220 | |
---|
## Compare ocean.output and the *.stat files with the benchmark ones.
## Globals: TRUST_TEST_BENCHMARK (read),
##          file_note (read, names the temp_<file_note> comment file)
## Outputs: 'diff -q' messages, or 'Same', on stdout; list of differing files
##          appended to the temporary comment file
## Calls get_out 7 when a file to compare has no benchmark counterpart.
diff_results() {
    local file
    local files_list='' mesg='Same'

    ###################################
    ## Think of copying initial test ##
    ###################################

    ## Simple diff
    for file in 'ocean.output' *.stat; do

        ## Without any stat file the glob is left as the literal '*.stat':
        ## it is not a file and must not be looked up in the benchmark
        [[ "$file" == '*.stat' && ! -e "$file" ]] && continue

        ## Stop if no minimal benchmark files (ocean.output, eventual stat files)
        [ ! -e "${TRUST_TEST_BENCHMARK}/$file" ] && get_out 7

        ## Continue even if it differs
        if ! diff -q "$file" "${TRUST_TEST_BENCHMARK}/$file"; then
            mesg='Different'
            files_list+=$file' '
        fi

    done

    [ "$mesg" == 'Same' ] && echo $mesg

    ## List different files for web comment
    [ -n "${files_list}" ] && echo 'Results : '${files_list}'differ<br>' \
        >> temp_${file_note}
}
---|
247 | |
---|
## Rebuild the decomposed restart files and compare them with the benchmark.
## Globals: TRUST_CFG_NEW (read, configuration name used in restart names),
##          TRUST_DIR_NEMOGCM (read, holds TOOLS/REBUILD_NEMO/rebuild_nemo),
##          TRUST_COMPILE_NPROC (read, passed to rebuild_nemo -t),
##          TRUST_TEST_BENCHMARK (read),
##          file_note (read, names the temp_<file_note> comment file)
## Outputs: one line per differing restart, or 'Same', on stdout; sum of
##          differing records appended to the temporary comment file;
##          cdo_diff.out holds the last 'cdo diffn' report
## Calls get_out X when a restart cannot be rebuilt.
## Does nothing when no restart file is found.
diff_restarts() {
    local dif filebase filebases ndomain count
    local files_list='' dif_sum=0

    ## Find all restart files to rebuild
    [ -n "$( find . -regex ".*_restart.*[0-9]\.nc" -print -quit )" ] || return 0

    ###############################################################
    ## Think to set the configuration name in the 'namelist_cfg' ##
    ###############################################################
    filebases=$( find . -regextype sed -regex ".*${TRUST_CFG_NEW}.*_[0-9]\{4\}\.nc" \
                 | sed 's/\(.*\)_.*/\1/' | sort -u )

    for filebase in $filebases; do

        ndomain=$( find . -regex ".*${filebase}_[0-9]*.nc" \
                   | wc -l | awk '{print $1}' )

        [ "$ndomain" -eq 0 ] && get_out X

        #####################################################
        ## Handle 2 possibilities of 'rebuild_nemo' origin ##
        #####################################################

        ## Possibility of remaining decomposed restarts (even after rebuild)
        if ${TRUST_DIR_NEMOGCM}/TOOLS/REBUILD_NEMO/rebuild_nemo \
               -t ${TRUST_COMPILE_NPROC} $filebase $ndomain \
               > /dev/null; then
            rm -f ${filebase}_[0-9]*.nc \
                > /dev/null
        else
            get_out X
        fi

        ## Restart without benchmark counterpart is passed over
        [ -e "${TRUST_TEST_BENCHMARK}/$filebase.nc" ] || continue

        cdo diffn $filebase.nc ${TRUST_TEST_BENCHMARK}/$filebase.nc \
            > cdo_diff.out 2> /dev/null

        ## Identical if cdo_diff.out is zero size
        [ ! -s cdo_diff.out ] && continue

        dif=$( grep -om1 '[0-9]* of [0-9]* records' cdo_diff.out )

        if [ -n "$dif" ]; then
            files_list+=$filebase' ' && echo $filebase'.nc: '$dif

            ## Leading number of 'N of M records' (the former sed expression
            ## lacked its 's' command, so the sum was never updated); 10#
            ## keeps a zero-padded count from being read as octal
            count=${dif%% *}
            dif_sum=$(( dif_sum + 10#${count:-0} ))
        fi

    done

    ## List modified restart(s) for web comment with sum of differences
    if [ ${dif_sum} -ne 0 ]; then
        echo 'Restarts: '${files_list}${dif_sum}' record(s) differ<br>' \
            >> temp_${file_note}
    else
        echo 'Same'
    fi

}
---|
315 | |
---|
## Report the elapsed time of the job.
## Globals: TRUST_JOB_TIME (read, command printing the elapsed time; nothing
##          is done when empty), file_time (appended)
## Outputs: 'Elapsed time: <value>' on stdout, <value> appended to the time
##          messenger file
get_time() {
    if [ -n "${TRUST_JOB_TIME}" ]; then

        ## Interest for checking unusual time computation
        local elapsed=$( eval ${TRUST_JOB_TIME} )

        printf "Elapsed time: "
        echo ${elapsed} | tee -a ${file_time}
    fi
}
---|
325 | |
---|
## Report the maximum memory usage of the job.
## Globals: TRUST_JOB_RAM_P / TRUST_JOB_RAM_V (read, commands printing the
##          physical / virtual memory; nothing is done when both are empty),
##          file_memy (appended)
## Outputs: 'Memory max usage (physical/virtual): <P> / <V>' on stdout,
##          '<P> / <V>' appended to the memory messenger file
get_memy() {
    if [[ -n "${TRUST_JOB_RAM_P}" || -n "${TRUST_JOB_RAM_V}" ]]; then

        ## Interest for checking unusual memory usage
        local ram_phys=$( eval ${TRUST_JOB_RAM_P} )
        local ram_virt=$( eval ${TRUST_JOB_RAM_V} )

        printf "Memory max usage (physical/virtual): "
        echo ${ram_phys}' / '${ram_virt} | tee -a ${file_memy}
    fi
}
---|
336 | |
---|
## Pick the first warning or error message of ocean.output as a comment.
## Arguments: $1 - pattern looked for: 'E R R O R' (anywhere in a line, 4
##                 following lines kept) or any other string, by default
##                 'W A R N I N G' (after one leading blank at the start of
##                 a line, 2 following lines kept)
## Globals:   file_note (read, names the temp_<file_note> comment file)
## Outputs:   the message on one line on stdout, the same followed by '<br>'
##            appended to the temporary comment file
## Returns:   non-zero when no message is found
comments() {
    local line='' state=$1

    if [ -e ocean.output ]; then

        ## Select first occurrence for web comment
        if [ "$state" == 'E R R O R' ]; then
            line=$( grep -m1 -A4 -e "$state" ocean.output | tr -d '\n' )
        else
            line=$( grep -m1 -A2 -e "^ $state" ocean.output | tr -d '\n' )
        fi
    fi

    ## 'set -f': blanks are squeezed on display but a '*' of the message must
    ## not be expanded into file names. '%s': the message is data, not a
    ## printf format ('%' and '\' are kept as they are).
    [ -n "$line" ] && ( set -f; echo $line; printf '%s<br>' "$line" \
        >> temp_${file_note} )
}
---|
353 | |
---|
## Build the trusting summary from the messenger files.
## Globals: file_note (appended; also names the temp_<file_note> comment
##          file), TRUST_TEST_SUMMARY (written)
## Outputs: all mesg_*.txt files pasted side by side with ';' as separator,
##          on stdout and into the summary file
log_make() {
    local note_tmp=temp_${file_note}

    ## Format comments for web: everything on one line, last '<br>' dropped
    if [ -e ${note_tmp} ]; then
        tr -d '\n' < ${note_tmp} | sed 's/<br>$//' \
            >> ${file_note}
    fi

    ## Construct txt file with all messenger files
    paste -d ';' mesg_*.txt | tee ${TRUST_TEST_SUMMARY}
}
---|
364 | |
---|
## Publish the summary in production mode (-p|--prod).
## Globals: TRUST_FLAG_PROD (read, publishing is done only when equal to 1),
##          TRUST_TEST_SUMMARY (read), TRUST_TEST_LOG (appended),
##          TRUST_TEST_MAILING (read, recipients; no mail when empty),
##          TRUST_FLAG_RESULT / TRUST_FLAG_ERROR (read),
##          TRUST_CFG_NEW / TRUST_CFG_REF / TRUST_SVN_BRANCH / TRUST_MAIN_USER /
##          TRUST_MAIN_HPCC / TRUST_TEST_DIR / TRUST_TEST_BENCHMARK /
##          TRUST_TEST_BACKUP (read, mail content)
## Outputs: summary appended to the trusting logfile (whole summary for a new
##          logfile, last line only otherwise); trusting.mail written and
##          sent when the result is 'FAILED'
prod_publish() {
    local cmd subject

    ## Production mode only; an unset or non-numeric flag means 'off'
    [ "${TRUST_FLAG_PROD:-0}" -eq 1 ] 2> /dev/null || return 0

    ## Create or append trusting logfile
    if [ -f "${TRUST_TEST_LOG}" ]; then cmd='tail -1'; else cmd='cat'; fi

    $cmd "${TRUST_TEST_SUMMARY}" \
        >> "${TRUST_TEST_LOG}"

    ## Send mail only when FAILED
    if [[ -n "${TRUST_TEST_MAILING}" \
          && "${TRUST_FLAG_RESULT}" == 'FAILED' ]]; then

        ## Content
        cat > trusting.mail <<END_MAIL
Dear all,


The following trusting sequence has not completed successfully:

Testing configuration ${TRUST_CFG_NEW} based on ${TRUST_CFG_REF}.
User installation ${TRUST_MAIN_USER}
HPC environment ${TRUST_MAIN_HPCC}

Here is the running environment summary:
$(cat model.log)

For more details, look into the testing folder at:
${TRUST_TEST_DIR}
An archive is also available to share the questionable configuration:
${TRUST_TEST_BENCHMARK}/${TRUST_TEST_BACKUP}

END_MAIL

        ## Send with detailed subject
        subject="[NEMO Trusting][${TRUST_CFG_REF}][${TRUST_SVN_BRANCH}] ${TRUST_FLAG_RESULT} ${TRUST_FLAG_ERROR}"

        ## Recipients left unquoted on purpose: the list is split on blanks
        mail -s "$subject" ${TRUST_TEST_MAILING} \
            < trusting.mail
    fi
}
---|
413 | |
---|
## End the test: set the final status, write the summary, then exit.
## Arguments: $1 - error code: 1-2 compilation, 3-4 submission, 5-6 computing,
##                 7-8 results; any other value gives 'Unknown error'
## Globals:   TRUST_FLAG_ERROR (written, final status message),
##            TRUST_FLAG_RESULT (read, 'FAILED' or anything else for success),
##            TRUST_RESULT (read, optional: replaces TRUST_FLAG_RESULT in the
##            result messenger file when set),
##            TRUST_TEST_DIR, TRUST_JOB_TIMEOUT, TRUST_FLAG_PROD,
##            TRUST_TEST_BACKUP, TRUST_TEST_BENCHMARK, TRUST_DIR_NEMOGCM,
##            TRUST_CFG_NEW (read), file_rslt / file_stat (line 2 rewritten)
## Calls comments, log_make and prod_publish.
## Always exits the script with status 0, whatever the result.
get_out() {
    local time_step=0 result status

    TRUST_FLAG_ERROR=$1

    printf "\n\nEnd of test\n"

    ## In case of compilation error
    cd "${TRUST_TEST_DIR}" \
        || echo "Cannot change to testing directory '${TRUST_TEST_DIR}'" >&2

    if [ "${TRUST_FLAG_RESULT}" == 'FAILED' ]; then
        echo 'Failure'

        ## Error identification
        case "${TRUST_FLAG_ERROR}" in
            ## Compilation
            1) TRUST_FLAG_ERROR='XIOS compilation failed'  ;;
            2) TRUST_FLAG_ERROR='NEMO compilation failed'  ;;
            ## Submission
            3) TRUST_FLAG_ERROR='Missing input files'      ;;
            4) TRUST_FLAG_ERROR='Job submission error'     ;;
            ## Computing
            5) TRUST_FLAG_ERROR='Crashed at time step'     ;;
            6) TRUST_FLAG_ERROR='Exceeded time limit'      ;;
            ## Results
            7) TRUST_FLAG_ERROR='Missing previous outputs' ;;
            8) TRUST_FLAG_ERROR='New outputs differ'       ;;
            ## Other (unquoted '*': a quoted one only matches a literal star)
            *) TRUST_FLAG_ERROR='Unknown error'            ;;
        esac

    else
        echo 'Success' && TRUST_FLAG_ERROR='Code is reliable'
    fi

    ## Eventual comments from ocean.output
    if [ "${TRUST_FLAG_ERROR}" == 'Crashed at time step' ]; then
        comments 'E R R O R'
        [ -e time.step ] && time_step=$( cat time.step )
        TRUST_FLAG_ERROR+=' '$time_step
    else
        comments 'W A R N I N G'

        if [ "${TRUST_FLAG_ERROR}" == 'Exceeded time limit' ]; then
            TRUST_FLAG_ERROR+=' '$(( ${TRUST_JOB_TIMEOUT}/3600 ))'h'
        fi

    fi

    ## Last messenger files
    ## TRUST_RESULT is not set anywhere in this file: fall back on the result
    ## flag instead of blanking the line. '\', '/' and '&' are escaped as they
    ## are special in the replacement part of the sed 's' command.
    result=$( printf '%s' "${TRUST_RESULT:-${TRUST_FLAG_RESULT}}" \
              | sed 's|[\\/&]|\\&|g' )
    status=$( printf '%s' "${TRUST_FLAG_ERROR}" \
              | sed 's|[\\/&]|\\&|g' )
    sed -i "2 s/.*/${result}/" "${file_rslt}"
    sed -i "2 s/.*/${status}/" "${file_stat}"

    ## Save tested configuration if trusting failed in production mode (-p|--prod)
    if [[ "${TRUST_FLAG_RESULT}" == 'FAILED' && "${TRUST_FLAG_PROD:-0}" -eq 1 ]]; then
        echo 'Creating archive '${TRUST_TEST_BACKUP}' under '${TRUST_TEST_BENCHMARK}
        tar -czf "${TRUST_TEST_BENCHMARK}/${TRUST_TEST_BACKUP}" * \
            -C "${TRUST_DIR_NEMOGCM}/CONFIG/${TRUST_CFG_NEW}/MY_SRC" . \
            -C "${TRUST_DIR_NEMOGCM}/CONFIG/${TRUST_CFG_NEW}" \
            "cpp_${TRUST_CFG_NEW}.fcm"
    fi

    ## Logfile construct & eventual sending of notification email
    printf "\nTrusting digest:\n----------------\n"
    log_make
    prod_publish

    exit 0
}
---|