Here is a set of scripts I wrote to manage jobs on a Sun Grid Engine
cluster. They allow one-line job submission, including looping of
repetitive jobs. They also allow a user to set up her or his own queue
on top of the SGE queue to limit the number of jobs submitted at one
time (this requires running a perl script on a node that can qalter jobs).
All scripts are released under the GNU General Public License, version 2.
ezsub: submit one job
at a time: "ezsub program datafile"
ezloop: submit
multiple repetitive jobs: "ezloop number_of_reps program datafile". Note
that any instance of "REP" [all caps] in the datafile will be replaced
by the replicate number: "log file=runREP.log" is changed to "log
file=run3.log" on the third replicate.
ezdel: delete all
queued and held jobs
nicesub, niceloop: same as
ezsub and ezloop, but they start jobs with user holds on them. Nicestart
then removes the user holds.
nicestart: converts
held jobs to running jobs: "nicestart minnum maxnum freenodes". Minnum
(maxnum) specifies the minimum (maximum) number of jobs to be actively
queued or run regardless of cluster activity. Freenodes specifies the
number of nodes to remain free: jobs will be submitted until there are
no more than freenodes still free (currently written for an 88 node
cluster) or until maxnum is hit.
nicewatch: a script
launched by nicestart
niceqstat: greps
qstat output to strip user-held jobs
nicecount: uses
qstat to count the number of running jobs of various types
ezsub
#!/bin/bash
# ezsub: automatically create a simple shell script and submit it to SGE
# using qsub. To use this, just type ezsub followed by the name and options
# for your job. I.e.,
#
#   ezsub paup -n batchfile.nex
#   ezsub ./configure --disable-shared
#   ezsub mb adh.nex
#   ezsub tar -xvf gsl-1.8.tar
#
# The one constraint is that you can't have pipes or redirects ( >, <, |),
# as they would use the output of the ezsub command (which is just your job
# number) instead of the output of whatever program you're calling.
#
# Exit status: 1 if no command was given or the temporary job script could
# not be created; otherwise the exit status of qsub.
#
# Brian O'Meara 17 Nov 2006
# http://www.brianomeara.info
# Released under GPL v2

# Refuse to submit an empty job.
if [ "$#" -eq 0 ]; then
    echo "usage: ezsub command [options]" >&2
    exit 1
fi

# Build the job script in a uniquely named file in the current directory so
# that simultaneous ezsub runs (or leftovers from an interrupted run) cannot
# mix their contents. The trap removes it however this script ends.
jobscript=$(mktemp tempqrun.XXXXXX) || exit 1
trap 'rm -f "$jobscript"' EXIT

{
    echo '#!/bin/bash'
    echo '#$ -cwd'                  # use current directory as working directory

    ########### Change email settings #################
    echo '#$ -M me@mycollege.edu'   # use your own email address. Please.
    echo '#$ -m as'                 # send email about the job.
                                    #   "b"=when job begins
                                    #   "e"=when job ends
                                    #   "a"=when job aborts
                                    #   "s"=when job suspended (someone kicks you off)
                                    #   "n" alone means don't send mail
    ###################################################

    ##################### Job name ####################
    echo '#$ -N EZsub'              # job name, currently EZsub
    ###################################################

    ########### Don't modify this bit #################
    echo '#$ -r y'                  # makes job rerunnable
    echo '#$ -S /bin/bash'
    echo ''
    # Write every argument (including empty ones) separated by single spaces
    # on one line; embedded newlines are flattened to spaces.
    printf '%s ' "$@" | tr '\n' ' '
    echo ''
    ###################################################

    ############ Cleanup ##############################
    # Delete the "#" at the beginning of the following line if you want the
    # job to remove its output files.
    #echo "rm EZsub.*"
    ###################################################
} >"$jobscript"

###### Don't modify this bit ###########
chmod 755 "$jobscript"              # executable, but writable only by the owner
qsub "$jobscript"
status=$?
sleep 1                             # brief pause kept from the original before cleanup
exit "$status"
########################################
ezloop
#!/usr/bin/perl -w
# ezloop: automatically start multiple repetitive jobs.
# Examples would be doing 100 bootstrap replicates by sending five jobs of
# 20 reps each or doing 4 mrbayes runs at once.
# Basically, the script expects there to be a batch file that you're using;
# it will replace any instance of REP (all caps) in that batch file with the
# replicate number. To use this script, you type ezloop followed by the
# number of loops, then the program you want to run, any options, and the
# name of the batch file. For example,
#
#   ezloop 5 paup -n bootbatch.nex
#   ezloop 4 mb adh.nex
#
# A sample batch file for something like paup might be:
#
#   #nexus
#   begin paup;
#   log start file=bootREP.log;
#   execute primates.nex;
#   bootstrap nreps=20 treefile=bootREP.tre brlen=yes / start=nj;
#   quit;
#   end;
#
# Then, if submitted using the command "ezloop 5 paup -n bootbatch.nex",
# the output would be boot1.log, boot1.tre, boot2.log, boot2.tre,...
# boot5.log, boot5.tre. Each tree file would have trees from twenty
# bootstrap replicates; you could then load them all into paup, making sure
# to store tree weights, and get a majority rule consensus tree using tree
# weights to compute the bootstrap tree from 100 bootstrap replicates.
#
# The one constraint is that you can't have pipes or redirects ( >, <, |),
# as they would use the output of the ezloop command (which is just your job
# number) instead of the output of whatever program you're calling.
#
# Brian O'Meara 17 Nov 2006
# http://www.brianomeara.info
# Released under GPL v2
use diagnostics;
use strict;

# Need a replicate count, a command and a batch file; the count must consist
# of digits only (so "3abc" is rejected rather than silently read as 3).
if (@ARGV < 3 || $ARGV[0] !~ m/\A\d+\z/) {
    print STDERR "usage: ezloop #reps command [options] filename\n";
    exit 1;
}

my ($reps, @command) = @ARGV;
my $filein = pop @command;    # the batch file is always the last argument

# Split the batch file into directory prefix and file name so the replicate
# number is prepended to the file name, not to the directory
# ("runs/boot.nex" -> "runs/1.boot.nex"). A plain file name gets an empty
# prefix, giving "1.boot.nex".
my ($dir, $base) = $filein =~ m{\A(.*/)?([^/]*)\z}s;
$dir = '' unless defined $dir;

# Read the template once; it is the same for every replicate.
open(my $in, '<', $filein) or die("where is $filein ?");
my @template = <$in>;
close $in;

for my $rep (1 .. $reps) {
    my $repfile = "$dir$rep.$base";

    open(my $out, '>', $repfile) or die("cannot write $repfile: $!");
    for my $line (@template) {
        my $inline = $line;
        chomp $inline;
        $inline =~ s/REP/$rep/g;    # convert REP to $rep. Case sensitive
        print {$out} "$inline\n";
    }
    close($out) or die("cannot finish writing $repfile: $!");

    sleep(5);    # just to give the cluster a break, give us time to abort
                 # if something's wrong, etc.

    # List form: the arguments reach ezsub exactly as they were given to
    # ezloop, without being re-parsed by a shell.
    system('ezsub', @command, $repfile) == 0
        or warn "ezloop: ezsub failed for replicate $rep\n";
}
ezdel
#!/usr/bin/perl -w
# ezdel: a quick script to delete all your queued and held jobs.
# Not often useful, but handy when you do need it (something has gone
# terribly wrong).
#
# Every job in the qstat listing that belongs to the invoking user and whose
# state contains "qw" (queued, or held and queued) is passed to qdel. The
# listing is re-read until no such jobs remain, or until a whole pass fails
# to delete anything.
use diagnostics;
use strict;

# Act on the invoking user's jobs, as nicestart/nicewatch/nicecount do.
my $username = `whoami`;
chomp $username;
die "ezdel: cannot determine user name\n" unless length $username;

# Returns the job IDs of this user's jobs whose qstat state contains "qw".
# The user and state are taken as the two fields immediately before the
# submission date, so text in the job name cannot be mistaken for either.
sub pending_job_ids {
    my @ids;
    for my $line (`qstat`) {
        my ($id, $user, $state) =
            $line =~ m{^\s*(\d+)\s+[\d.]+\s+.*?\s+(\S+)\s+(\S+)\s+\d+/\d+/\d+\s}
            or next;    # header, separator or otherwise unparseable line
        push @ids, $id if $user eq $username && $state =~ m/qw/;
    }
    return @ids;
}

while (my @ids = pending_job_ids()) {    # while you still have queued and/or held jobs
    my $deleted = 0;
    for my $jobtokill (@ids) {
        $deleted++ if system('qdel', $jobtokill) == 0;
        sleep(1);
    }
    # If qdel failed for every job there is no point in trying again;
    # without this check the loop would never end.
    unless ($deleted) {
        warn "ezdel: could not delete the remaining jobs: @ids\n";
        last;
    }
}
nicesub
#!/bin/bash
# nicesub: automatically create a simple shell script and submit it to SGE
# using qsub, with a user hold on the job (see nicestart for releasing held
# jobs). To use this, just type nicesub followed by the name and options for
# your job. I.e.,
#
#   nicesub paup -n batchfile.nex
#   nicesub ./configure --disable-shared
#   nicesub mb adh.nex
#   nicesub tar -xvf gsl-1.8.tar
#
# The one constraint is that you can't have pipes or redirects ( >, <, |),
# as they would use the output of the nicesub command (which is just your
# job number) instead of the output of whatever program you're calling.
#
# Exit status: 1 if no command was given or the temporary job script could
# not be created; otherwise the exit status of qsub.
#
# Brian O'Meara 17 Nov 2006
# http://www.brianomeara.info
# Released under GPL v2

# Refuse to submit an empty job.
if [ "$#" -eq 0 ]; then
    echo "usage: nicesub command [options]" >&2
    exit 1
fi

# Build the job script in a uniquely named file in the current directory so
# that simultaneous nicesub runs (or leftovers from an interrupted run)
# cannot mix their contents. The trap removes it however this script ends.
jobscript=$(mktemp tempqrun.XXXXXX) || exit 1
trap 'rm -f "$jobscript"' EXIT

{
    echo '#!/bin/bash'
    echo '#$ -cwd'                  # use current directory as working directory

    ########### Change email settings #################
    echo '#$ -M me@mycollege.edu'   # use your own email address. Please.
    echo '#$ -m as'                 # send email about the job.
                                    #   "b"=when job begins
                                    #   "e"=when job ends
                                    #   "a"=when job aborts
                                    #   "s"=when job suspended (someone kicks you off)
                                    #   "n" alone means don't send mail
    ###################################################

    ##################### Job name ####################
    echo '#$ -N niceWAIT'           # job name
    ###################################################

    ########### Don't modify this bit #################
    echo '#$ -r y'                  # makes job rerunnable
    echo '#$ -S /bin/bash'
    echo '#$ -h'                    # submit the job with a user hold on it
    echo ''
    # Write every argument (including empty ones) separated by single spaces
    # on one line; embedded newlines are flattened to spaces.
    printf '%s ' "$@" | tr '\n' ' '
    echo ''
    ###################################################

    ############ Cleanup ##############################
    # Delete the "#" at the beginning of the following line if you want the
    # job to remove its output files.
    # NOTE(review): this pattern was carried over from ezsub; jobs submitted
    # here are named niceWAIT, so the pattern probably needs adjusting.
    #echo "rm EZsub.*"
    ###################################################
} >"$jobscript"

###### Don't modify this bit ###########
chmod 755 "$jobscript"              # executable, but writable only by the owner
qsub "$jobscript"
status=$?
sleep 1                             # brief pause kept from the original before cleanup
exit "$status"
########################################
niceloop
#!/usr/bin/perl -w
# niceloop: automatically start multiple repetitive jobs, submitting each
# one through nicesub.
# Examples would be doing 100 bootstrap replicates by sending five jobs of
# 20 reps each or doing 4 mrbayes runs at once.
# Basically, the script expects there to be a batch file that you're using;
# it will replace any instance of REP (all caps) in that batch file with the
# replicate number. To use this script, you type niceloop followed by the
# number of loops, then the program you want to run, any options, and the
# name of the batch file. For example,
#
#   niceloop 5 paup -n bootbatch.nex
#   niceloop 4 mb adh.nex
#
# A sample batch file for something like paup might be:
#
#   #nexus
#   begin paup;
#   log start file=bootREP.log;
#   execute primates.nex;
#   bootstrap nreps=20 treefile=bootREP.tre brlen=yes / start=nj;
#   quit;
#   end;
#
# Then, if submitted using the command "niceloop 5 paup -n bootbatch.nex",
# the output would be boot1.log, boot1.tre, boot2.log, boot2.tre,...
# boot5.log, boot5.tre. Each tree file would have trees from twenty
# bootstrap replicates; you could then load them all into paup, making sure
# to store tree weights, and get a majority rule consensus tree using tree
# weights to compute the bootstrap tree from 100 bootstrap replicates.
#
# The one constraint is that you can't have pipes or redirects ( >, <, |),
# as they would use the output of the niceloop command (which is just your
# job number) instead of the output of whatever program you're calling.
#
# Brian O'Meara 17 Nov 2006
# http://www.brianomeara.info
# Released under GPL v2
use diagnostics;
use strict;

# Need a replicate count, a command and a batch file; the count must consist
# of digits only (so "3abc" is rejected rather than silently read as 3).
if (@ARGV < 3 || $ARGV[0] !~ m/\A\d+\z/) {
    print STDERR "usage: niceloop #reps command [options] filename\n";
    exit 1;
}

my ($reps, @command) = @ARGV;
my $filein = pop @command;    # the batch file is always the last argument

# Split the batch file into directory prefix and file name so the replicate
# number is prepended to the file name, not to the directory
# ("runs/boot.nex" -> "runs/1.boot.nex"). A plain file name gets an empty
# prefix, giving "1.boot.nex".
my ($dir, $base) = $filein =~ m{\A(.*/)?([^/]*)\z}s;
$dir = '' unless defined $dir;

# Read the template once; it is the same for every replicate.
open(my $in, '<', $filein) or die("where is $filein ?");
my @template = <$in>;
close $in;

for my $rep (1 .. $reps) {
    my $repfile = "$dir$rep.$base";

    open(my $out, '>', $repfile) or die("cannot write $repfile: $!");
    for my $line (@template) {
        my $inline = $line;
        chomp $inline;
        $inline =~ s/REP/$rep/g;    # convert REP to $rep. Case sensitive
        print {$out} "$inline\n";
    }
    close($out) or die("cannot finish writing $repfile: $!");

    sleep(rand(4));    # just to give the cluster a break, give us time to
                       # abort if something's wrong, etc.

    # List form: the arguments reach nicesub exactly as they were given to
    # niceloop, without being re-parsed by a shell.
    system('nicesub', @command, $repfile) == 0
        or warn "niceloop: nicesub failed for replicate $rep\n";
}
nicestart
#!/usr/bin/perl -w
# nicestart: start submitting your nicely-submitted jobs to SGE. It takes
# three numbers as arguments.
#   MinNum:  You will have at least MinNum jobs running or in the official
#            queue at a time, even if this leaves no nodes free for other
#            users.
#   MaxNum:  You will have no more than MaxNum jobs running or in the
#            official queue at a time, even if there are oodles of other
#            nodes free.
#   FreeNum: The number of nodes you will leave free for other users.
#
# usage:   nicestart <MinNum> <MaxNum> <FreeNum>
# example: nicestart 2 30 40
#   will keep between 2 and 30 of your jobs running or actively queued,
#   while keeping at least 40 nodes unused, regardless of how many other
#   people are using the cluster.
#
# Niceloop and nicesub effectively put jobs in your own queue. Nicestart
# starts a script that moves jobs from your own queue into the general
# queue. You submit jobs until there are only FreeNum nodes available (but
# you keep at least MinNum jobs in the general queue). The general idea is
# that if the cluster isn't being heavily used, you submit many jobs, but if
# the cluster is getting fuller, you submit jobs at a slower rate to make
# sure to leave nodes available for other users. This is most useful for
# cases where you have many jobs (>50) to submit but don't want to block
# cluster use. It can be especially appropriate if the jobs complete very
# quickly, so your number of active jobs can change very quickly as cluster
# use changes.
# Setting MinNum>0 means that you will always have MinNum jobs submitted or
# running, even if this means that fewer than FreeNum nodes are left
# available.
#
# If you want to change your limits, just run nicestart again with the new
# limits.
#
# Jobs that are held in your own separate queue will have state "hqw" and
# job name "niceWAIT" -- when they are submitted to the general queue, their
# state will be "qw" or "r" and their name changed to "niceRUN".
#
# Brian O'Meara
# http://www.brianomeara.info
# Nov. 16, 2006
# Released under GNU Public License V. 2
use diagnostics;
use strict;

# All three limits are required and must be plain non-negative integers;
# they are interpolated into a shell command line below.
if (@ARGV < 3 || grep { !m/\A\d+\z/ } @ARGV[0 .. 2]) {
    print STDERR "usage: nicestart <MinNum> <MaxNum> <FreeNum>\n";
    exit 1;
}
my ($minnum, $maxnum, $freenum) = @ARGV[0 .. 2];

my $username = `whoami`;
chomp $username;

# Stop any nicewatch this user already has running, so that the new limits
# replace the old ones instead of two watchers competing. ps is asked for
# just the PID and command name of this user's processes, which (unlike
# top's screen-oriented listing) is meant to be parsed.
for my $psline (`ps -u $username -o pid= -o comm=`) {
    my ($pid, $command) = $psline =~ m/\A\s*(\d+)\s+(\S.*?)\s*\z/
        or next;
    next unless $command =~ m{(?:\A|/)nicewatch\z};
    kill 'TERM', $pid;
}

# Launch the watcher detached from this terminal.
system("nohup nicewatch $minnum $maxnum $freenum > /dev/null &");
nicewatch
#!/usr/bin/perl -w
# nicewatch: releases this user's held ("hqw") jobs into the general queue
# a few at a time, and exits once none are left.
#
# Takes settings minnum_active_queued_or_running,
# maxnum_active_queued_or_running (both for individual user), and the number
# of slots to leave free. An optional fourth argument gives the total number
# of slots on the cluster; it defaults to 88.
#
# usage: nicewatch minnum maxnum freeslots [totalslots]
#
# Brian O'Meara
# http://www.brianomeara.info
# Nov. 27, 2006
# Released under GNU Public License V. 2
use diagnostics;
use strict;

# The limits are used in arithmetic below, so insist on plain integers.
if (@ARGV < 3
    || grep { !m/\A\d+\z/ } @ARGV[0 .. 2]
    || (defined $ARGV[3] && $ARGV[3] !~ m/\A\d+\z/)) {
    print STDERR "usage: nicewatch minnum maxnum freeslots [totalslots]\n";
    exit 1;
}

sleep(3);

my $username = `whoami`;
chomp $username;

my ($minnumqueuedtorun, $maxnumqueuedtorun, $freeslots) = @ARGV[0 .. 2];
my $totalslots = defined $ARGV[3] ? $ARGV[3] : 88;    # 88 was the original fixed cluster size

# Returns the job IDs, in qstat order, of this user's jobs whose state
# contains "hqw". The user and state are taken as the two fields immediately
# before the submission date, so text in the job name cannot be mistaken for
# either.
sub held_job_ids {
    my @ids;
    for my $line (`qstat`) {
        my ($id, $user, $state) =
            $line =~ m{^\s*(\d+)\s+[\d.]+\s+.*?\s+(\S+)\s+(\S+)\s+\d+/\d+/\d+\s}
            or next;    # header, separator or otherwise unparseable line
        push @ids, $id if $user eq $username && $state =~ m/hqw/;
    }
    return @ids;
}

while (my @held = held_job_ids()) {    # while there are still queued, held jobs
    # nicecount prints eleven whitespace-separated counters:
    #   0 all   1 R(all)  2 QW(all)  3 H(all)  4 S(all)  5 E(all)
    #           6 R(me)   7 QW(me)   8 H(me)   9 S(me)  10 E(me)
    my @ezcountarray = split ' ', `nicecount`;

    if (@ezcountarray >= 11) {
        my ($countRall, $countQWall) = @ezcountarray[1, 2];
        my ($countRme,  $countQWme)  = @ezcountarray[6, 7];

        my $mineactive = $countRme + $countQWme;
        my $unused     = $totalslots - ($countQWall + $countRall);

        # Release one job if we are below our guaranteed minimum, or if the
        # cluster has more than $freeslots to spare and we are below our
        # maximum.
        my $reactivate = $mineactive < $minnumqueuedtorun
            || ($unused > $freeslots && $mineactive < $maxnumqueuedtorun);

        if ($reactivate) {
            # Remove the user hold from the first held job and rename it.
            system('qalter', '-h', 'U', '-N', 'niceRUN', $held[0]);
        }
    }
    else {
        warn "nicewatch: unexpected output from nicecount; skipping this round\n";
    }

    # Don't remove this. This tells the script how long to wait before
    # reactivating. If you get rid of this line, the script will keep
    # rerunning, taking up far too much time on the head node. The wait is
    # always at least one second.
    sleep(1 + int(rand(10)));
}
niceqstat
#!/bin/bash
# niceqstat: this just returns qstat's normal output, omitting jobs held in
# individual user queues (every line containing "hqw" is dropped). Any
# arguments are handed on to qstat unchanged.
#
# Brian O'Meara
# http://www.brianomeara.info
# Nov. 16, 2006
# Released under GNU Public License v 2
qstat "$@" | grep -v "hqw"
nicecount
#!/usr/bin/perl -w
# nicecount: uses qstat to count the number of jobs of various types.
#
# Prints eleven numbers on one line, separated by single spaces and with no
# trailing newline:
#
#   all  R(all) QW(all) H(all) S(all) E(all)  R(me) QW(me) H(me) S(me) E(me)
#
# "all" is the number of job lines recognized in the qstat listing. For each
# job line the state field is tested, case-insensitively, for the letter "r",
# for being exactly "qw", and for the letters "h", "s" and "e"; a job is
# counted in every category its state matches. The "(me)" counters only
# include jobs whose user field equals the output of whoami.
use diagnostics;
use strict;

my $username = `whoami`;
chomp $username;

# Category name and the test applied to the state field, in output order.
my @categories = (
    [ 'running',   qr/r/i     ],
    [ 'queued',    qr/^qw$/i  ],
    [ 'held',      qr/h/i     ],
    [ 'suspended', qr/s/i     ],
    [ 'error',     qr/e/i     ],
);
my @names = map { $_->[0] } @categories;

my $countall = 0;
my %countall_by = map { $_ => 0 } @names;    # jobs of any user
my %countme_by  = map { $_ => 0 } @names;    # jobs of the invoking user

foreach my $qstatline (`qstat`) {
    # Fields: job-ID, priority, name, user, state, date, time, queue, slots.
    # The job ID may or may not be preceded by padding, so leading
    # whitespace is optional.
    my ($user, $state) = $qstatline =~
        m/^\s*\d+\s+[\d\.]+\s+\S+\s+(\S+)\s+(\S+)\s+[\d\/]+\s+[\d\:]+\s+\S*\s+\d+/
        or next;    # header, separator or otherwise unparseable line

    $countall++;
    foreach my $category (@categories) {
        my ($name, $test) = @{$category};
        next unless $state =~ $test;
        $countall_by{$name}++;
        $countme_by{$name}++ if $user eq $username;
    }
}

print join(' ', $countall, @countall_by{@names}, @countme_by{@names});