#!/bin/bash
#
# cascade_job_resubmit.sh - resubmit cluster jobs that hang, failed or never ran.
#
# Call: ./cascade_job_resubmit.sh method [maxID] [-n]
#
#   qstat    Kill every job listed by `qstat -u $USER` and start its tasks again.
#   find     Restart the jobs whose *.root output is smaller than 2 MiB.
#   missing  Restart the jobs in 1..maxID (default 100) without any *.root output.
#   -n       Dry run: print what would be done, but neither kill nor submit.
#            May be given before or after maxID.
#
# Every mode records the IDs it handled in cascade_job_resubmit_<method>.txt
# in the current working directory.

# The script that is called to submit jobs. It needs to accept
# a job number as the first parameter, so
#     ./cascade_job_100k.sh 42
# for instance. In "missing" mode it is also handed ranges such as "3-7".
JOB_SCRIPT="./cascade_job_100k.sh"

# Where is the data being stored?
DATA_FOLDER="../data/ana"

# The data should have this format:
#     ${DATA_PREFIX}N_reco_complete.root
# where N is the run ID. Everything from the first underscore after N up to
# the .root is ignored, so also
#     ${DATA_PREFIX}N_very_interesting_data.root
# is valid.
DATA_PREFIX="cascade_"

#######################################
# Print the run ID encoded in a data file name.
# Globals:   DATA_PREFIX (read)
# Arguments: $1 - path of a data file
# Outputs:   the run ID on stdout
# Returns:   0 if a numeric ID was found, 1 otherwise
#######################################
extract_run_id() {
  # Only look at the file name so that directory names cannot be mistaken
  # for the prefix.
  local name=${1##*/}
  [[ $name == *"${DATA_PREFIX}"* ]] || return 1
  local rest=${name#*"${DATA_PREFIX}"}
  local id=${rest%%_*}
  # Anything that is not a plain number would break the arithmetic later on.
  [[ $id =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$id"
}

#######################################
# Print the run IDs of the *.root files below DATA_FOLDER, one per line.
# Files whose name does not carry a numeric run ID are skipped.
# Globals:   DATA_FOLDER (read)
# Arguments: optional extra tests handed to find (e.g. -size ...)
# Outputs:   run IDs on stdout, unsorted, possibly with duplicates
#######################################
list_run_ids() {
  local file
  # NUL-delimited so that unusual characters in paths cannot split a name.
  while IFS= read -r -d '' file; do
    extract_run_id "$file"
  done < <(find "${DATA_FOLDER}" -name "*.root" -type f "$@" -print0)
}

#######################################
# Print "A" for a single ID or "A-B" for a range of IDs.
# Arguments: $1 - first ID, $2 - last ID
#######################################
print_range() {
  if (( $1 == $2 )); then
    printf '%s\n' "$1"
  else
    printf '%s-%s\n' "$1" "$2"
  fi
}

#######################################
# Print the IDs in 1..max that are absent from a list of completed IDs.
# Consecutive missing IDs are collapsed into one "A-B" range.
# Arguments: $1 - highest ID that should exist
#            $2.. - completed IDs in ascending numeric order
# Outputs:   one ID or range per line on stdout
#######################################
find_gaps() {
  local max_id=$1
  local last=0
  local id
  shift
  for id in "$@"; do
    # Force base 10; a leading zero would otherwise be read as octal.
    id=$(( 10#$id ))
    if (( id > max_id )); then
      break
    fi
    # Skips ID 0 and duplicates such as "7" and "07".
    if (( id <= last )); then
      continue
    fi
    if (( id > last + 1 )); then
      print_range "$(( last + 1 ))" "$(( id - 1 ))"
    fi
    last=$id
  done
  # Catch the IDs between the last completed one and the maximum.
  if (( last < max_id )); then
    print_range "$(( last + 1 ))" "$max_id"
  fi
}

#######################################
# Call the job script once per ID, or only print the calls in a dry run.
# Globals:   JOB_SCRIPT (read)
# Arguments: $1 - 1 for a dry run, 0 otherwise
#            $2.. - IDs (or ID ranges) to submit
#######################################
submit_jobs() {
  local dry_run=$1
  local id
  shift
  for id in "$@"; do
    if (( dry_run )); then
      echo "$JOB_SCRIPT $id"
    else
      "$JOB_SCRIPT" "$id"
    fi
  done
}

#######################################
# Write IDs to a file, one per line; no IDs yields an empty file.
# Arguments: $1 - target file, $2.. - IDs
#######################################
write_id_list() {
  local target=$1
  shift
  if (( $# > 0 )); then
    printf '%s\n' "$@" > "$target"
  else
    : > "$target"
  fi
}

#######################################
# Mode "qstat": restart all currently running jobs which are hanging.
# Reads the job ID from column 1 and the task ID from column 10 of the
# `qstat -u $USER` listing, skipping its first two lines.
# NOTE(review): column 10 is presumably the array task ID - confirm against
# the qstat output format of the cluster in use.
# Arguments: $1 - 1 for a dry run, 0 otherwise
#######################################
resubmit_running() {
  local dry_run=$1
  local listing id
  local -a job_ids=() task_ids=()

  listing=$(qstat -u "$USER")
  while IFS= read -r id; do
    [[ -n $id ]] && job_ids+=("$id")
  done < <(printf '%s\n' "$listing" | awk 'NR > 2 {print $1}' | sort -u)
  while IFS= read -r id; do
    [[ -n $id ]] && task_ids+=("$id")
  done < <(printf '%s\n' "$listing" | awk 'NR > 2 {print $10}')

  # Kill the currently running jobs. A dry run only announces the kill;
  # it must never touch running jobs.
  for id in "${job_ids[@]}"; do
    echo "kill job $id"
    if (( ! dry_run )); then
      qdel "$id"
    fi
  done

  submit_jobs "$dry_run" "${task_ids[@]}"
  write_id_list cascade_job_resubmit_qstat.txt "${task_ids[@]}"
}

#######################################
# Mode "find": restart the jobs whose output file is below 2 MiB.
# Arguments: $1 - 1 for a dry run, 0 otherwise
#######################################
resubmit_small() {
  local dry_run=$1
  local id
  local -a ids=()

  # The limit is given in bytes on purpose: find rounds sizes up to whole
  # units before comparing, so "-size -2M" would only match files of at
  # most 1 MiB.
  while IFS= read -r id; do
    ids+=("$id")
  done < <(list_run_ids -size -2097152c | sort -u)

  submit_jobs "$dry_run" "${ids[@]}"
  write_id_list cascade_job_resubmit_find.txt "${ids[@]}"
}

#######################################
# Mode "missing": find gaps in the processed job IDs and restart those.
# Arguments: $1 - 1 for a dry run, 0 otherwise
#            $2 - highest ID that should have run
#######################################
resubmit_missing() {
  local dry_run=$1
  local max_id=$2
  local line
  local -a done_ids=() gaps=()

  # IDs that ran through, in ascending numeric order.
  while IFS= read -r line; do
    done_ids+=("$line")
  done < <(list_run_ids | sort -n -u)

  while IFS= read -r line; do
    gaps+=("$line")
  done < <(find_gaps "$max_id" "${done_ids[@]}")

  submit_jobs "$dry_run" "${gaps[@]}"
  write_id_list cascade_job_resubmit_missing.txt "${gaps[@]}"
}

#######################################
# Print hints on usage.
#######################################
print_usage() {
  echo "cascade_job_resubmit.sh – Resubmit failed jobs on prometheus."
  echo
  echo "Call: ./cascade_job_resubmit.sh method [maxID] [-n]"
  echo " methods:"
  echo " qstat - Kill and restart all jobs still running."
  echo " find - Find failed jobs by checking for files < 2MB and restart."
  echo " missing - Check for missing files and restart those jobs."
  echo " The parameter 'maxID' sets the ID until jobs should have run."
  echo " -n - Dry run: only print what would be done."
}

#######################################
# Entry point: parse the arguments and dispatch to the chosen method.
# Arguments: $1 - method (qstat | find | missing); anything else prints usage
#            $2.. - optional maxID (digits only) and/or -n, in any order
#######################################
main() {
  local method=${1:-}
  local dry_run=0
  local max_id=100
  local arg

  if (( $# > 0 )); then
    shift
  fi
  for arg in "$@"; do
    if [[ $arg == "-n" ]]; then
      dry_run=1
    elif [[ $arg =~ ^[0-9]+$ ]]; then
      max_id=$(( 10#$arg ))
    else
      printf 'warning: ignoring unrecognized argument: %s\n' "$arg" >&2
    fi
  done

  case "$method" in
    qstat) resubmit_running "$dry_run" ;;
    find) resubmit_small "$dry_run" ;;
    missing) resubmit_missing "$dry_run" "$max_id" ;;
    *) print_usage ;;
  esac
}

main "$@"