Checking in scripts used for testing the linear index MAX_FEATURES_PER_BIN.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4887 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kshakir 2010-12-20 21:25:36 +00:00
parent fc33901810
commit 758d14a261
6 changed files with 176 additions and 0 deletions

View File

@ -0,0 +1,68 @@
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.utils.text._
import org.broadinstitute.sting.queue.extensions.gatk._
import collection.JavaConversions._
class LinearIndexBinTests extends QScript {
qscript =>
@Input(doc="The path to the GenomeAnalysisTK.jar file.", shortName="gatk")
var gatkJar: File = null
@Argument(doc="Rod list to test. The first line in the list is the reference.", shortName="BL")
var rodLists: List[File] = Nil
@Argument(doc="Number of times to run the test.", shortName="numRuns", required=false)
var numRuns = 1
@Input(doc="memory limits", shortName="mem", required=true)
var memoryLimits: List[Int] = Nil
@Input(doc="max bin size", shortName="maxBin", required=false)
var maxBinSize = 512
def script = {
var maxFeaturesPerBin = List.empty[String]
var currMaxFeatures = maxBinSize
while (currMaxFeatures > 1) {
maxFeaturesPerBin +:= currMaxFeatures.toString
currMaxFeatures /= 2
}
maxFeaturesPerBin :::= List("0.001", "0.01", "0.1", "1")
for (run <- 1 to numRuns) {
for (rodList <- rodLists) {
val rodListName = rodList.getName
val lines = new XReadLines(rodList).iterator
val reference = new File(lines.next)
val rodFiles = lines.map(path => new File(path)).toList
for (memoryLimit <- memoryLimits) {
for (maxFeatures <- maxFeaturesPerBin) {
val dir = "%s_%smfpb_%02dg_run%02d".format(rodListName, "00000".take(5-maxFeatures.length) + maxFeatures, memoryLimit, run)
val countRod = new CountRod {
override def javaOpts = super.javaOpts + " -DMAX_FEATURES_PER_BIN=" + maxFeatures
}
countRod.jobOutputFile = new File(dir, "CountRod.out")
countRod.out = new File(dir, "CountRod.txt")
countRod.jarFile = qscript.gatkJar
countRod.reference_sequence = reference
countRod.memoryLimit = Some(memoryLimit)
// Some of the BED files don't have a chrM, which makes the GATK angry. Run unsafe.
countRod.U = Some(org.broadinstitute.sting.gatk.arguments.ValidationExclusion.TYPE.ALL)
for ((rodFile, index) <- rodFiles.zipWithIndex) {
val rodType = rodFile.getName.split("\\.").last
countRod.rodBind :+= new RodBind(rodType + (index+1), rodType.toUpperCase, rodFile.getAbsolutePath)
}
add(countRod)
}
}
}
}
}
}

View File

@ -0,0 +1,15 @@
#!/bin/sh
STING_HOME=/humgen/gsa-hpprojects/dev/kshakir/src/Sting_CountRod
TMP_DIR=/broad/shptmp/kshakir/CountRod
JOB_QUEUE=
STATUS_TO=
#JOB_QUEUE="-jobQueue week"
#STATUS_TO="-statusTo kshakir"
if [ "$1" == "debug" ]; then
JAVA_DEBUG="-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8555"
shift
fi
java $JAVA_DEBUG -Djava.io.tmpdir=${TMP_DIR} -jar ${STING_HOME}/dist/Queue.jar -jobPrefix QCountRodTest -S LinearIndexBinTests.scala -gatk ${STING_HOME}/dist/GenomeAnalysisTK.jar -jobQueue ${JOB_QUEUE} ${STATUS_TO} -bsub $@

View File

@ -0,0 +1,54 @@
DESCRIPTION
-----------
This folder contains a set of test scripts for evaluating the MAX_FEATURES_PER_BIN setting in tribble/src/org/broad/tribble/index/linear/LinearIndex.java
For the tests to work you must patch the tribble code to enable the MAX_FEATURES_PER_BIN to be set via a system property, for example:
java -jar GenomeAnalysisTK.jar -DMAX_FEATURES_PER_BIN=1 ...
SCRIPTS
-------
*** LinearIndexBinTests.sh ***
Runs the scala script LinearIndexBinTests.scala. Requires that you pass rods via "rod lists" (see below) and specify how much memory to run each set of tests with.
Example dry run:
./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs
Example run:
./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs -run
Example run on the hour queue:
./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs -jobQueue hour -run
Example run on the hour queue with each job run three times:
./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs -jobQueue hour -numRuns 3 -run
*** grep_results.sh ***
Greps the CPU and Max Memory statistics from the LSF output files into a file mfpb.txt.
Example:
./grep_results.sh
[outputs: ./mfpb.txt]
*** plot_results.R ***
Creates a plot from a subset of the data in mfpb.txt. Can be run multiple times to produces plots for the different memory limits passed to LinearIndexBinTests.sh
Example:
./plot_results.R mfpb.txt 2
./plot_results.R mfpb.txt 4
./plot_results.R mfpb.txt 6
[outputs: ./max_features_per_bin_Xmx2g.pdf ./max_features_per_bin_Xmx4g.pdf ./max_features_per_bin_Xmx6g.pdf]
ROD LISTS
---------
A rod list is a file that contains the FASTA reference on the first line, and then 1..N ROD files in the rest of the file. The RODs must all end with an extension that corresponds to the rod type, for example: .vcf, .bed, etc.
Example:
[See test_vcfs]

View File

@ -0,0 +1,13 @@
#!/bin/sh
DIRECTORY=.
find $DIRECTORY -name CountRod.out -exec grep -H "Max Memory" {} \; | sort | sed -e 's!'$DIRECTORY'/\(.*\)_\([.0-9]*\)mfpb_\(.*\)g_run\(.*\)/CountRod.out: * Max Memory *: *\([.0-9]*\) MB!\1\t\2\t\3\t\4\t\5!' > $DIRECTORY/memory.txt
find $DIRECTORY -name CountRod.out -exec grep -H "CPU" {} \; | sort | sed -e 's!'$DIRECTORY'/\(.*\)_\([.0-9]*\)mfpb_\(.*\)g_run\(.*\)/CountRod.out: * CPU time *: *\([.0-9]*\) sec.!\1\t\2\t\3\t\4\t\5!' > $DIRECTORY/cpu.txt
find $DIRECTORY -name \*.done -o -name \*.fail | sort | sed -e 's!'$DIRECTORY'/\(.*\)_\([.0-9]*\)mfpb_\(.*\)g_run\(.*\)/\.CountRod.txt\.*\(.*\)!\1\t\2\t\3\t\4\t\5!' > $DIRECTORY/success.txt
TAB=" "
echo "set${TAB}max_features_per_bin${TAB}memory_limit_gb${TAB}run_number${TAB}max_memory_mb${TAB}cpu_s${TAB}job_success" > $DIRECTORY/mfpb.txt
paste $DIRECTORY/memory.txt $DIRECTORY/cpu.txt $DIRECTORY/success.txt | cut -f 1-5,10,15 >> $DIRECTORY/mfpb.txt

View File

@ -0,0 +1,23 @@
#!/bin/env Rscript
require(lattice)
require(sqldf)
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
memory = args[2]
mfpb_data <- read.table(input, head=T)
mfpb_data <- mfpb_data[order(mfpb_data$set, mfpb_data$max_features_per_bin, mfpb_data$memory_limit_gb, mfpb_data$run_number) , ]
mfpb_data <- sqldf("select \"set\", max_features_per_bin, memory_limit_gb, avg(cpu_s) as cpu_s, avg(max_memory_mb) as max_memory_mb from mfpb_data where job_success = 'done' group by \"set\", max_features_per_bin, memory_limit_gb")
outfile = paste("max_features_per_bin_Xmx", memory, "g.pdf", sep="")
pdf(outfile, height=7, width=14)
par(cex=1.3)
xyplot(max_memory_mb + cpu_s ~ log10(max_features_per_bin), groups = set, data = subset(mfpb_data, memory_limit_gb == memory), type="b", scales=list(relation="free"), auto.key=T)
dev.off()

View File

@ -0,0 +1,3 @@
/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
/humgen/gsa-firehose2/pipeline/projects/Barcoded_1000G_WEx_Plate_1/v5/IndelCalls/IntermediateFiles/Barcoded_1000G_WEx_HG01359/HG01359.cleaned.indels.vcf
/humgen/gsa-firehose2/pipeline/projects/Barcoded_1000G_WEx_Plate_1/v5/IndelCalls/IntermediateFiles/Barcoded_1000G_WEx_NA19670/NA19670.cleaned.indels.vcf