Checking in scripts used for testing the linear index MAX_FEATURES_PER_BIN.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4887 348d0f76-0448-11de-a6fe-93d51630548a
2010-12-20 21:25:36 +00:00 · 2010-12-20 21:25:36 +00:00 · 758d14a261
parent fc33901810
commit 758d14a261
6 changed files with 176 additions and 0 deletions
--- a/scala/qscript/kshakir/linearindexbintests/LinearIndexBinTests.scala
+++ b/scala/qscript/kshakir/linearindexbintests/LinearIndexBinTests.scala
@ -0,0 +1,68 @@
+import org.broadinstitute.sting.queue.QScript
+import org.broadinstitute.sting.utils.text._
+import org.broadinstitute.sting.queue.extensions.gatk._
+import collection.JavaConversions._
+
+class LinearIndexBinTests extends QScript {
+  qscript =>
+
+  @Input(doc="The path to the GenomeAnalysisTK.jar file.", shortName="gatk")
+  var gatkJar: File = null
+
+  @Argument(doc="Rod list to test.  The first line in the list is the reference.", shortName="BL")
+  var rodLists: List[File] = Nil
+
+  @Argument(doc="Number of times to run the test.", shortName="numRuns", required=false)
+  var numRuns = 1
+
+  @Input(doc="memory limits", shortName="mem", required=true)
+  var memoryLimits: List[Int] = Nil
+
+  @Input(doc="max bin size", shortName="maxBin", required=false)
+  var maxBinSize = 512
+
+  def script = {
+    var maxFeaturesPerBin = List.empty[String]
+    var currMaxFeatures = maxBinSize
+    while (currMaxFeatures > 1) {
+      maxFeaturesPerBin +:= currMaxFeatures.toString
+      currMaxFeatures /= 2
+    }
+    maxFeaturesPerBin :::= List("0.001", "0.01", "0.1", "1")
+
+    for (run <- 1 to numRuns) {
+      for (rodList <- rodLists) {
+        val rodListName = rodList.getName
+        val lines = new XReadLines(rodList).iterator
+        val reference = new File(lines.next)
+        val rodFiles = lines.map(path => new File(path)).toList
+
+        for (memoryLimit <- memoryLimits) {
+          for (maxFeatures <- maxFeaturesPerBin) {
+            val dir = "%s_%smfpb_%02dg_run%02d".format(rodListName, "00000".take(5-maxFeatures.length) + maxFeatures, memoryLimit, run)
+
+            val countRod = new CountRod {
+              override def javaOpts = super.javaOpts + " -DMAX_FEATURES_PER_BIN=" + maxFeatures
+            }
+            countRod.jobOutputFile = new File(dir, "CountRod.out")
+            countRod.out = new File(dir, "CountRod.txt")
+
+            countRod.jarFile = qscript.gatkJar
+            countRod.reference_sequence = reference
+            countRod.memoryLimit = Some(memoryLimit)
+
+            // Some of the BED files don't have a chrM, which makes the GATK angry.  Run unsafe.
+            countRod.U = Some(org.broadinstitute.sting.gatk.arguments.ValidationExclusion.TYPE.ALL)
+
+            for ((rodFile, index) <- rodFiles.zipWithIndex) {
+               val rodType = rodFile.getName.split("\\.").last
+               countRod.rodBind :+= new RodBind(rodType + (index+1), rodType.toUpperCase, rodFile.getAbsolutePath)
+            }
+
+            add(countRod)
+          }
+        }
+      }
+    }
+  }
+}
--- a/scala/qscript/kshakir/linearindexbintests/LinearIndexBinTests.sh
+++ b/scala/qscript/kshakir/linearindexbintests/LinearIndexBinTests.sh
@ -0,0 +1,15 @@
+#!/bin/sh
+
+STING_HOME=/humgen/gsa-hpprojects/dev/kshakir/src/Sting_CountRod
+TMP_DIR=/broad/shptmp/kshakir/CountRod
+JOB_QUEUE=
+STATUS_TO=
+#JOB_QUEUE="-jobQueue week"
+#STATUS_TO="-statusTo kshakir"
+
+if [ "$1" == "debug" ]; then
+    JAVA_DEBUG="-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8555"
+    shift
+fi
+
+java $JAVA_DEBUG -Djava.io.tmpdir=${TMP_DIR} -jar ${STING_HOME}/dist/Queue.jar -jobPrefix QCountRodTest -S LinearIndexBinTests.scala -gatk ${STING_HOME}/dist/GenomeAnalysisTK.jar -jobQueue ${JOB_QUEUE} ${STATUS_TO} -bsub $@
--- a/scala/qscript/kshakir/linearindexbintests/README.txt
+++ b/scala/qscript/kshakir/linearindexbintests/README.txt
@ -0,0 +1,54 @@
+DESCRIPTION
+-----------
+
+This folder contains a set of test scripts for evaluating the MAX_FEATURES_PER_BIN setting in tribble/src/org/broad/tribble/index/linear/LinearIndex.java
+
+For the tests to work you must patch the tribble code to enable the MAX_FEATURES_PER_BIN to be set via a system property, for example:
+  java -jar GenomeAnalysisTK.jar -DMAX_FEATURES_PER_BIN=1 ...
+
+
+SCRIPTS
+-------
+
+*** LinearIndexBinTests.sh ***
+
+Runs the scala script LinearIndexBinTests.scala.  Requires that you pass rods via "rod lists" (see below) and specify how much memory to run each set of tests with.
+
+Example dry run:
+  ./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs
+
+Example run:
+  ./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs -run
+
+Example run on the hour queue:
+  ./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs -jobQueue hour -run
+
+Example run on the hour queue with each job run three times:
+  ./LinearIndexBinTests.sh -mem 2 -mem 4 -mem 6 -BL test_vcfs -jobQueue hour -numRuns 3 -run
+
+*** grep_results.sh ***
+
+Greps the CPU and Max Memory statistics from the LSF output files into a file mfpb.txt.
+
+Example:
+  ./grep_results.sh
+  [outputs: ./mfpb.txt]
+
+*** plot_results.R ***
+
+Creates a plot from a subset of the data in mfpb.txt.  Can be run multiple times to produces plots for the different memory limits passed to LinearIndexBinTests.sh
+
+Example:
+  ./plot_results.R mfpb.txt 2
+  ./plot_results.R mfpb.txt 4
+  ./plot_results.R mfpb.txt 6
+  [outputs: ./max_features_per_bin_Xmx2g.pdf ./max_features_per_bin_Xmx4g.pdf ./max_features_per_bin_Xmx6g.pdf]
+
+
+ROD LISTS
+---------
+
+A rod list is a file that contains the FASTA reference on the first line, and then 1..N ROD files in the rest of the file.  The RODs must all end with an extension that corresponds to the rod type, for example: .vcf, .bed, etc.
+
+Example:
+  [See test_vcfs]
--- a/scala/qscript/kshakir/linearindexbintests/grep_results.sh
+++ b/scala/qscript/kshakir/linearindexbintests/grep_results.sh
@ -0,0 +1,13 @@
+#!/bin/sh
+
+DIRECTORY=.
+
+find $DIRECTORY -name CountRod.out -exec grep -H "Max Memory" {} \; | sort | sed -e 's!'$DIRECTORY'/\(.*\)_\([.0-9]*\)mfpb_\(.*\)g_run\(.*\)/CountRod.out: * Max Memory *: *\([.0-9]*\) MB!\1\t\2\t\3\t\4\t\5!' > $DIRECTORY/memory.txt
+
+find $DIRECTORY -name CountRod.out -exec grep -H "CPU" {} \; | sort | sed -e 's!'$DIRECTORY'/\(.*\)_\([.0-9]*\)mfpb_\(.*\)g_run\(.*\)/CountRod.out: * CPU time *: *\([.0-9]*\) sec.!\1\t\2\t\3\t\4\t\5!' > $DIRECTORY/cpu.txt
+
+find $DIRECTORY -name \*.done -o -name \*.fail | sort | sed -e 's!'$DIRECTORY'/\(.*\)_\([.0-9]*\)mfpb_\(.*\)g_run\(.*\)/\.CountRod.txt\.*\(.*\)!\1\t\2\t\3\t\4\t\5!' > $DIRECTORY/success.txt
+
+TAB="	"
+echo "set${TAB}max_features_per_bin${TAB}memory_limit_gb${TAB}run_number${TAB}max_memory_mb${TAB}cpu_s${TAB}job_success" > $DIRECTORY/mfpb.txt
+paste $DIRECTORY/memory.txt $DIRECTORY/cpu.txt $DIRECTORY/success.txt | cut -f 1-5,10,15 >> $DIRECTORY/mfpb.txt
--- a/scala/qscript/kshakir/linearindexbintests/plot_results.R
+++ b/scala/qscript/kshakir/linearindexbintests/plot_results.R
@ -0,0 +1,23 @@
+#!/bin/env Rscript
+
+require(lattice)
+require(sqldf)
+
+args <- commandArgs(TRUE)
+verbose = TRUE
+
+input = args[1]
+memory = args[2]
+
+mfpb_data <- read.table(input, head=T)
+mfpb_data <- mfpb_data[order(mfpb_data$set, mfpb_data$max_features_per_bin, mfpb_data$memory_limit_gb, mfpb_data$run_number) , ]
+mfpb_data <- sqldf("select \"set\", max_features_per_bin, memory_limit_gb, avg(cpu_s) as cpu_s, avg(max_memory_mb) as max_memory_mb from mfpb_data where job_success = 'done' group by \"set\", max_features_per_bin, memory_limit_gb")
+
+outfile = paste("max_features_per_bin_Xmx", memory, "g.pdf", sep="")
+pdf(outfile, height=7, width=14)
+
+par(cex=1.3)
+
+xyplot(max_memory_mb + cpu_s ~ log10(max_features_per_bin), groups = set, data = subset(mfpb_data, memory_limit_gb == memory), type="b", scales=list(relation="free"), auto.key=T)
+
+dev.off()
--- a/scala/qscript/kshakir/linearindexbintests/test_vcfs
+++ b/scala/qscript/kshakir/linearindexbintests/test_vcfs
@ -0,0 +1,3 @@
+/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
+/humgen/gsa-firehose2/pipeline/projects/Barcoded_1000G_WEx_Plate_1/v5/IndelCalls/IntermediateFiles/Barcoded_1000G_WEx_HG01359/HG01359.cleaned.indels.vcf
+/humgen/gsa-firehose2/pipeline/projects/Barcoded_1000G_WEx_Plate_1/v5/IndelCalls/IntermediateFiles/Barcoded_1000G_WEx_NA19670/NA19670.cleaned.indels.vcf