#!/bin/bash
#
# BENCH launching script for beaufix (Meteo-France), curie and irene (TGCC)
# and jean-zay.
# To be modified for other machines.
#
# Usage: <this script> cores ncore_node resolution dateref
#   cores       process count as an expression evaluated by bc; the job
#               scripts also split it around '*' into jpni and jpnj
#               (e.g. "4*8" gives jpni=4, jpnj=8, 32 processes)
#   ncore_node  number of cores per node, used to derive the node count
#   resolution  configuration tag; "1", "025" and "12" have a time limit
#   dateref     tag appended to the experiment directory and output names
#
# The machine is selected from the host name with all digits removed.
#
set -u
#set -vx
#
if [ $# -lt 4 ]; then
  echo "Usage: ${0##*/} cores ncore_node resolution dateref" >&2
  exit 1
fi

cores=$1
ncore_node=$2
resolution=$3
dateref=$4
machine=$( hostname | sed -e "s/[0-9]*//g" )
#
# number of processes for each executable
# (quoted so that a value such as 4*8 is never glob-expanded by the shell)
nproc_exe1=$( echo "$cores" | bc )
nproc=$nproc_exe1
# number of nodes, rounded up
nnode=$(( ( nproc + ncore_node - 1 ) / ncore_node ))

nproc5=$( printf "%05d\n" "${nproc_exe1}" )

# Domain decomposition written into the generated job scripts:
# text before the first '*' of cores, and text after the last '*'.
jpni=${cores/\**/}
jpnj=${cores/?*\*/}

# Time limit of the job (seconds), by resolution and number of processes.
# Other resolutions leave timejob unset; the machines that need it check below.
case ${resolution} in
  "1")
    if [ "$nproc_exe1" -lt 50 ]; then
      timejob=3600
    elif [ "$nproc_exe1" -lt 100 ]; then
      timejob=1800
    elif [ "$nproc_exe1" -lt 200 ]; then
      timejob=900
    else
      timejob=600
    fi
    ;;
  "025")
    if [ "$nproc_exe1" -lt 50 ]; then
      timejob=15000
    elif [ "$nproc_exe1" -lt 100 ]; then
      timejob=7000
    elif [ "$nproc_exe1" -lt 200 ]; then
      timejob=3600
    elif [ "$nproc_exe1" -lt 400 ]; then
      timejob=2000
    elif [ "$nproc_exe1" -lt 800 ]; then
      timejob=1000
    else
      timejob=600
    fi
    ;;
  "12")
    if [ "$nproc_exe1" -lt 200 ]; then
      timejob=30000
    elif [ "$nproc_exe1" -lt 400 ]; then
      timejob=15000
    elif [ "$nproc_exe1" -lt 800 ]; then
      # NOTE(review): larger than the 15000 of the bracket above - confirm
      timejob=20000
    elif [ "$nproc_exe1" -lt 1600 ]; then
      timejob=15000
    elif [ "$nproc_exe1" -lt 3200 ]; then
      timejob=7500
    elif [ "$nproc_exe1" -lt 10000 ]; then
      timejob=5000
    elif [ "$nproc_exe1" -lt 20000 ]; then
      timejob=2500
    else
      timejob=1200
    fi
    ;;
esac


if [ "$machine" == "beaufixlogin" ]; then

  ######################################################################
  ### beaufixlogin
  ######################################################################

  # NOTE(review): the generated job uses a hard-coded --time=00:1:00 and
  # hard-coded /scratch/work/cglo315 paths; timejob is not used here.
  mkdir -p Log || exit 1

  cat > Log/run_bench << EOF
#!/bin/bash
#SBATCH --time=00:1:00
#SBATCH -p normal64 # partition/queue
#SBATCH --job-name=bench # job name
#SBATCH -N $nnode # number of nodes
#SBATCH -n $nproc # number of procs
#SBATCH -o /scratch/work/cglo315/ESIWACE/dev_r9759_HPC09_ESIWACE/tests/LBENCH_RN/EXP00/Log/job.out%j
#SBATCH -e /scratch/work/cglo315/ESIWACE/dev_r9759_HPC09_ESIWACE/tests/LBENCH_RN/EXP00/Log/job.out%j
#SBATCH --exclusive

module unload intelmpi intel grib_api
module load intel/16.1.150 intelmpi/5.1.2.150

itac=0
xpmpi=0

if [ \$xpmpi == 1 ]; then
  module load bullxde
  module load xPMPI/1.1_intelmpi
fi

[ \$itac == 1 ] && module load itac/2017.2.028

set -vx

cd \${TMPDIR}
cp /scratch/work/cglo315/ESIWACE/dev_r9759_HPC09_ESIWACE/tests/LBENCH_RN/EXP00/* .

# Best decompositions BENCH-1
jpni=${jpni}
jpnj=${jpnj}

sed -e "s/jpni *=.*/jpni = \${jpni}/" -e "s/jpnj *=.*/jpnj = \${jpnj}/" namelist_cfg_orca${resolution}_like > namelist_cfg

export OMP_NUM_THREADS=1
ulimit -s unlimited
#
if [ \$itac == 1 ]; then
  source /opt/softs/intel/2017/update_1/itac_latest/bin/itacvars.sh
  time mpirun -ordered-output -prepend-rank -trace -np $nproc_exe1 ./nemo > jobout 2>joberr
else
  time mpirun -ordered-output -prepend-rank -np $nproc_exe1 ./nemo > jobout_${resolution}_${nproc5}
fi
/opt/softs/bin/ja

if [ \$xpmpi == 1 ]; then
  module unload xPMPI/1.1_intelmpi
  module unload bullxde
fi
#
EOF


  ### 4. Execute the model

  echo 'Submitting the job to queue using sbatch'
  if ! sbatch Log/run_bench; then
    echo "sbatch Log/run_bench failed" >&2
    exit 1
  fi
  squeue -u cglo315

  echo 'is executed or submitted to queue.'

elif [[ "$machine" == "curie" || "$machine" == "irene" ]]; then

  ######################################################################
  ### curie or irene
  ######################################################################

  if [ "$machine" == "curie" ]; then
    queuename=standard
  else
    queuename=skylake
  fi

  # The job header needs a time limit; stop with a clear message if the
  # resolution has none instead of failing while the job file is written.
  : "${timejob:?no time limit defined for resolution '${resolution}' (expected 1, 025 or 12)}"

  EXPjob=../EXP_${resolution}_${nproc5}_${dateref}
  mkdir -p "${EXPjob}" || exit 1
  # never write/submit the job from the wrong directory
  cd "${EXPjob}" || exit 1
  jobname=jobbench
  cat > "$jobname" << EOF
#!/bin/bash
#MSUB -r bench${nproc5}
#MSUB -n ${nproc_exe1}
#MSUB -T $timejob
#MSUB -e bench_${resolution}_${nproc5}_%I.eo
#MSUB -o bench_${resolution}_${nproc5}_%I.eo
#MSUB -j oe
#MSUB -x
#MSUB -q ${queuename}
#MSUB -A gen6895
#==========================================
set -u
#

cd \${BRIDGE_MSUB_PWD}

for ff in \${BRIDGE_MSUB_PWD}/../EXPREF/namelist_*cfg \${BRIDGE_MSUB_PWD}/../EXPREF/namelist_*ref \${BRIDGE_MSUB_PWD}/../BLD/bin/nemo.exe
do
  cp \$ff .
done

jpni=${jpni}
jpnj=${jpnj}

sed -e "s/jpni *=.*/jpni = \${jpni}/" \
    -e "s/jpnj *=.*/jpnj = \${jpnj}/" \
    -e "s/ln_timing *= *.false./ln_timing = .true./" \
    \${BRIDGE_MSUB_PWD}/../EXPREF/namelist_cfg_orca${resolution}_like > namelist_cfg

time ccc_mprun -n \${BRIDGE_MSUB_NPROC} ./nemo.exe > jobout_${resolution}_${nproc5}_${dateref} 2>&1

EOF

  ccc_msub "$jobname"

elif [ "$machine" == "jean-zay" ]; then

  ######################################################################
  ### Jean-Zay
  ######################################################################

  : "${timejob:?no time limit defined for resolution '${resolution}' (expected 1, 025 or 12)}"

  # timejob (seconds) converted to the HH:MM:SS form used by --time
  hh=$( printf "%02d\n" $(( timejob / 3600 )) )
  mm=$( printf "%02d\n" $(( ( timejob % 3600 ) / 60 )) )
  ss=$( printf "%02d\n" $(( timejob % 60 )) )

  EXPjob=../EXP_${resolution}_${nproc5}_${dateref}
  mkdir -p "${EXPjob}" || exit 1
  # never write/submit the job from the wrong directory
  cd "${EXPjob}" || exit 1
  jobname=jobbench
  cat > "$jobname" << EOF
#!/bin/bash
#SBATCH --job-name=Seq # nom du job
#SBATCH --partition=cpu_gct3 # demande d'allocation sur la partition CPU
#SBATCH --nodes=${nnode} # nombre de noeuds
#SBATCH --ntasks-per-node=${ncore_node} # nombre de taches MPI par noeud
#SBATCH --ntasks-per-core=1 # 1 processus MPI par coeur physique (pas d'hyperthreading)
#SBATCH --time=${hh}:${mm}:${ss} # temps d execution maximum demande (HH:MM:SS)
#SBATCH --output=bench_${resolution}_${nproc5}_%j.eo # nom du fichier de sortie
#SBATCH --error=bench_${resolution}_${nproc5}_%j.eo # nom du fichier d'erreur (ici en commun avec la sortie)
#==========================================
set -u
#set -xv
#
#cd \${SLURM_SUBMIT_DIR}
cd \${JOBSCRATCH}
pwd

for ff in \${SLURM_SUBMIT_DIR}/../EXPREF/namelist_*cfg \${SLURM_SUBMIT_DIR}/../EXPREF/namelist_*ref \${SLURM_SUBMIT_DIR}/../BLD/bin/nemo.exe
do
  cp \$ff .
done

jpni=${jpni}
jpnj=${jpnj}

sed -e "s/jpni *=.*/jpni = \${jpni}/" \
    -e "s/jpnj *=.*/jpnj = \${jpnj}/" \
    -e "s/ln_timing *= *.false./ln_timing = .true./" \
    \${SLURM_SUBMIT_DIR}/../EXPREF/namelist_cfg_orca${resolution}_like > namelist_cfg

ls -l

echo
echo
echo " =========== start the model ==========="
echo
echo

time srun --mpi=pmi2 --cpu-bind=cores -K1 -n ${nproc} ./nemo.exe > jobout_${resolution}_${nproc5}_${dateref} 2>&1

ls -l

if [ "\$( pwd )" != "\${SLURM_SUBMIT_DIR}" ]
then
  rsync -av namelist_cfg time.step ocean.output jobout_${resolution}_${nproc5}_${dateref} communication_report.txt layout.dat timing.output output.namelist* \${SLURM_SUBMIT_DIR}
fi

EOF

  sbatch "$jobname"

else

  # No submission recipe matches this host: say so instead of exiting silently.
  echo "Unknown machine '${machine}': no job submitted (add a section for it)" >&2
  exit 1

fi