Distributed GATK analysis scripts
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5049 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a0731eaa81
commit
8ece2b9230
|
|
@ -0,0 +1,22 @@
|
||||||
|
# todo -- add replicate number to system
|
||||||
|
# tood -- add scatter gather comparison
|
||||||
|
d <- read.table("results.dat", header=T)
|
||||||
|
require("lattice")
|
||||||
|
|
||||||
|
subd = data.frame(parallel.type=d$parallel.type, nWaysParallel=d$nWaysParallel, end.to.end.time=d$end.to.end.time,per.1M.sites = d$per.1M.sites, job.run.time = d$job.run.time)
|
||||||
|
|
||||||
|
nways = unique(subd$nWaysParallel)
|
||||||
|
m = max(subd$end.to.end.time)
|
||||||
|
nNW = subset(subd, end.to.end.time == m)$nWaysParallel[1]
|
||||||
|
timeAt1 = m * nNW
|
||||||
|
my.runtime = subset(subd, end.to.end.time == m)$job.run.time[1] * nNW
|
||||||
|
my.pms = subset(subd, end.to.end.time == m)$per.1M.sites[1]
|
||||||
|
|
||||||
|
theo = data.frame(parallel.type="theoretic", end.to.end.time=timeAt1/nways, nWaysParallel=nways, per.1M.sites = my.pms, job.run.time = my.runtime / nways)
|
||||||
|
|
||||||
|
subd = rbind(subd, theo)
|
||||||
|
|
||||||
|
print(summary(subd))
|
||||||
|
|
||||||
|
print(xyplot(end.to.end.time + per.1M.sites + job.run.time ~ nWaysParallel, data=subd[order(subd$nWaysParallel),], group=parallel.type, type="b", outer=T, scale=list(relation="free"), auto.key=T, lwd=c(2,2,1)))
|
||||||
|
|
||||||
|
|
@ -0,0 +1,119 @@
|
||||||
|
import sys
|
||||||
|
from optparse import OptionParser
|
||||||
|
from itertools import *
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
# a simple script that does:
|
||||||
|
# 1 -- generates a master set of variants following the neutral expectation from a single big population
|
||||||
|
# 2 -- randomly generates M individuals with variants and genotypes sampled as expected from the big population of variants
|
||||||
|
# 3 -- writes out the genotypes of these individuals, and their allele frequency
|
||||||
|
def main():
|
||||||
|
global OPTIONS
|
||||||
|
usage = "usage: %prog [options] outputFile"
|
||||||
|
parser = OptionParser(usage=usage)
|
||||||
|
|
||||||
|
(OPTIONS, args) = parser.parse_args()
|
||||||
|
if len(args) == 0:
|
||||||
|
parser.error("Requires at least one argument")
|
||||||
|
|
||||||
|
print 'file dataset parallel.type nWaysParallel start.time end.time end.to.end.time per.1M.sites job.run.time'
|
||||||
|
typere = '.*/(.*).ptype_(\w+).nways_(\d+).*'
|
||||||
|
for file in args:
|
||||||
|
startTime, endTime, perMSites, runtime = None, None, None, None
|
||||||
|
for line in open(file):
|
||||||
|
match = re.match(typere, line)
|
||||||
|
if match != None: dataset, parallelType, nWays = match.groups()
|
||||||
|
startTime = captureStartTime(line, startTime)
|
||||||
|
perMSites = capturePerMSites(line, perMSites)
|
||||||
|
endTime = captureEndTime(line, endTime)
|
||||||
|
runtime = captureRuntime(line, runtime)
|
||||||
|
print file, dataset, parallelType, nWays, formatTime(startTime), formatTime(endTime), endToEnd(endTime, startTime), perMSites, runtime
|
||||||
|
|
||||||
|
def endToEnd(endTime, startTime):
|
||||||
|
#print 'endToEnd', endTime, startTime
|
||||||
|
return total_minutes(endTime - startTime)
|
||||||
|
|
||||||
|
def formatTime(t):
|
||||||
|
return datetime.datetime.strftime(t, formatString)
|
||||||
|
|
||||||
|
def total_minutes(td):
|
||||||
|
return td.days * 24 * 60 + td.seconds / 60.0
|
||||||
|
|
||||||
|
def captureLine(line, regex, func, prevValue):
|
||||||
|
match = regex.match(line)
|
||||||
|
if match != None:
|
||||||
|
if func != None:
|
||||||
|
val = func(line)
|
||||||
|
else:
|
||||||
|
val = match.group(1)
|
||||||
|
else:
|
||||||
|
val = None
|
||||||
|
#print 'Matching', line, regex, match, prevValue, val
|
||||||
|
|
||||||
|
return val
|
||||||
|
|
||||||
|
formatString = "%H:%M:%S"
|
||||||
|
|
||||||
|
def captureStartTime(line, prev):
|
||||||
|
# todo - needs to find the earliest time
|
||||||
|
#INFO 11:03:50,202 HelpFormatter - The Genome Analysis Toolkit (GATK) v<unknown>, Compiled <unknown>
|
||||||
|
regex = re.compile("INFO\W*(\d+:\d+:\d+).*The Genome Analysis Toolkit.*")
|
||||||
|
return selectTime(captureLine(line, regex, None, prev), prev, earlier = True)
|
||||||
|
|
||||||
|
def selectTime(newTimeString, oldTime, earlier = False):
|
||||||
|
def select():
|
||||||
|
if newTimeString == None:
|
||||||
|
return oldTime
|
||||||
|
else:
|
||||||
|
newTime = datetime.datetime.strptime(newTimeString, formatString)
|
||||||
|
if oldTime == None:
|
||||||
|
return newTime
|
||||||
|
elif earlier:
|
||||||
|
if newTime < oldTime:
|
||||||
|
return newTime
|
||||||
|
else:
|
||||||
|
return oldTime
|
||||||
|
else:
|
||||||
|
if newTime > oldTime:
|
||||||
|
return newTime
|
||||||
|
else:
|
||||||
|
return oldTime
|
||||||
|
r = select()
|
||||||
|
#if not earlier: print 'selectTime', oldTime, newTimeString, r
|
||||||
|
return r
|
||||||
|
|
||||||
|
|
||||||
|
def captureEndTime(line, prev):
|
||||||
|
# todo - needs to find the latest time
|
||||||
|
regex = re.compile("INFO\W*(\d+:\d+:\d+).*GATKRunReport - Aggregating data for run report.*")
|
||||||
|
return selectTime(captureLine(line, regex, None, prev), prev, earlier=False)
|
||||||
|
|
||||||
|
unitsToMinutes = {
|
||||||
|
'm' : 1.0,
|
||||||
|
'h' : 60,
|
||||||
|
's' : 1.0/60,
|
||||||
|
'd' : 60 * 60
|
||||||
|
}
|
||||||
|
|
||||||
|
def capturePerMSites(line, prev):
|
||||||
|
return captureDoneLine(line, prev, 8, 10)
|
||||||
|
|
||||||
|
def captureRuntime(line, prev):
|
||||||
|
return captureDoneLine(line, prev, 6, 8)
|
||||||
|
|
||||||
|
def captureDoneLine(line, prev, s, e):
|
||||||
|
# INFO 11:04:11,541 TraversalEngine - chr1:3769010 1.32e+05 20.0 s 2.5 m 1.5% 21.9 m 21.5 m
|
||||||
|
regex = re.compile("INFO .*TraversalEngine -.*done*")
|
||||||
|
val = captureLine(line, regex, lambda x: x.split()[s:e], None)
|
||||||
|
if val == None:
|
||||||
|
return prev
|
||||||
|
else:
|
||||||
|
x, u = val
|
||||||
|
return float(x) * unitsToMinutes[u]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
grep -l -e "ptype_sg" -e "part1\." short/Q-*.out long/Q-*.out > toTime.txt
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
#!/bin/tcsh
|
||||||
|
|
||||||
|
setenv CMD "java -Djava.io.tmpdir=/broad/shptmp/depristo/tmp -jar /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/Queue.jar -statusTo depristo -S /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/scala/qscript/oneoffs/depristo/distributedGATKPerformance.scala -bsub --gatkjarfile /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar -dataset HiSeq $argv[2-$#argv]"
|
||||||
|
|
||||||
|
if ( $1 == 1 ) then
|
||||||
|
pushd short; $CMD -jobQueue hour -run &
|
||||||
|
pushd long; $CMD -jobQueue gsa -long -run &
|
||||||
|
else
|
||||||
|
$CMD
|
||||||
|
endif
|
||||||
Loading…
Reference in New Issue