#!/bin/bash
#
# Author: J.G. Messchendorp, messchendorp@kvi.nl
#
# USAGE: runmpi [arg1] [arg2]
#
#   with
#
#   [arg1] ($1): machinelist (passed to mpirun via -p4pg)
#   [arg2] ($2): job description file (passed to boss_worker_mpi via -j)
#
# Validates the two arguments, locates "mpirun" and "boss_worker_mpi" on
# PATH, (re)creates "<jdl>.log", and launches the MPI session with nohup
# in the background, redirecting its stdout/stderr to a log file in /tmp.
#
# Exit status:
#   0  the MPI session was launched (its own outcome is NOT waited for)
#   1  wrong argument count, missing program, missing input file, or the
#      /tmp log file could not be created
#   N  status of "touch" if "<jdl>.log" could not be created
#

RETVAL=0
BOSSWORKERMPI="boss_worker_mpi"
MPIRUN="mpirun"
JDL=$2
MACHINES=$1

# Print an error message on stderr and terminate with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the full help text on stderr. The delimiter is quoted so that the
# text is emitted literally (no parameter or command expansion).
usage() {
  cat >&2 <<'EOF'

 Not enough or too many arguments supplied.

 Usage: runmpi [procgrp] [jdl]

 [procgrp] - This file should contain a list of names of the computer nodes
 which you would like to participate as boss/workers. Each line
 starts with the name of a computer node (internet address),
 followed by a number which is "0" or "1". In case of "0",
 no other processes are to be started on that node than the one
 started by the user, e.g. useful for the boss process with rank 0.
 The entries with "1" reflect the worker processes The third entry
 on each line gives the path and filename of the "boss_worker_mpi"
 program and the last, optional, entry the corresponding username
 for that node. Note that with this, one can setup a session on
 a heterogeneous network. For clarification, the contents of an
 example "procgrp" file could look as follows:

 node01.kvi.nl 0 /home/johan/bin/boss_worker_mpi johan
 node02.kvi.nl 1 /home/johan/bin/boss_worker_mpi johan
 node02.kvi.nl 1 /home/johan/bin/boss_worker_mpi johan
 lxi002.gsi.de 1 /u/jmessch/bin/boss_worker_mpi jmessch
 lxi003.gsi.de 1 /u/jmessch/bin/boss_worker_mpi jmessch

 This will start one boss and four workers processes. The boss (rank 0)
 will run on "node01.kvi.nl". The first two workers will run on
 "node02.kvi.nl", and the other two on "lxi002.gsi.de" and on
 "lxi003.gsi.de", respectively. Note that the location of the MPI
 program and the username are indicated as well, which could differ.
 The names of the nodes should be valid internet addresses and
 accessible via "ssh" or "rsh", depending on the MPI configuration.
 More detailed information about the "procgrp" configuration can be
 found in any MPI manual (look for option -p4pg).

 [jdl] - This is the job description file (jdl), which contains the
 information about the jobs and other parameter settings.
 Checkout the README file in the "mpiTools" directory for a
 detailed description.

EOF
}

#
# Check the command line arguments
#
if [ "$#" -ne 2 ]; then
  usage
  RETVAL=1
  exit "$RETVAL"
fi

#
# Try to find the MPIRUN program.
# "command -v" is a shell builtin: it prints nothing when the program is
# missing, unlike some "which" implementations that write a message to
# stdout and would defeat the emptiness test below.
#
MPIRUNPROG=$(command -v "$MPIRUN")
if [ -z "$MPIRUNPROG" ]; then
  die " Could not find $MPIRUN, please make sure your PATH is set properly!"
fi

#
# Try to find the MPI program
#
BOSSWORKERPROG=$(command -v "$BOSSWORKERMPI")
if [ -z "$BOSSWORKERPROG" ]; then
  die " Could not find $BOSSWORKERMPI, please make sure your PATH is set properly!"
fi

#
# Check whether list of machines exists.
# The expansion must be quoted: with an empty argument an unquoted test
# degenerates to "[ ! -f ]", which is false, so the check would be skipped.
#
if [ ! -f "$MACHINES" ]; then
  die " The machinelist file $MACHINES does not exist!"
fi

#
# Check whether JDL file exists
#
if [ ! -f "$JDL" ]; then
  die " The JDL file $JDL does not exist!"
fi

#
# Define the logfile.
# mktemp creates the file atomically under an unpredictable name, so a
# pre-planted file or symlink in /tmp cannot be overwritten. The Xs are
# kept at the very end of the template for portability across mktemp
# implementations.
#
LOGFILE=$(mktemp "/tmp/runmpi_$$.log.XXXXXX") \
  || die " Could not create a log file in /tmp"
echo " Detailed LOG information can be found in \"$LOGFILE\""

#
# Print stuff
#
echo " The job description file is \"$JDL\""
echo " The list of computer nodes are given in \"$MACHINES\""

#
# Remove the old logfile and start a new one.
# rm followed by touch (rather than truncation) gives a brand-new file,
# as the original script did.
#
rm -f -- "$JDL.log" > /dev/null 2>&1
touch -- "$JDL.log"
RETVAL=$?
if [ "$RETVAL" -ne 0 ]; then
  echo " Error creating $JDL.log" >&2
  # Do not leave the empty /tmp log behind when nothing will be started.
  rm -f -- "$LOGFILE"
  exit "$RETVAL"
fi

#
# Start the MPI session in NOHUP-background mode (CTRL-C safe!)
#
echo " Starting MPI session in NOHUP and background mode."
nohup "$MPIRUNPROG" -p4pg "$MACHINES" "$BOSSWORKERPROG" -j "$JDL" -v > "$LOGFILE" 2>&1 &

#
# Tell the user how to follow the session. The PID of the background MPI
# session is not captured or reported by this script.
#
echo " You can monitor the output by \"tail -f $JDL.log\""
echo " ...or for more details by \"tail -f $LOGFILE\""

#
# The end. RETVAL still holds the status of the "touch" above (0 here);
# the outcome of the background MPI session is not reflected.
#
exit "$RETVAL"