From 798fb6a7a23cd110d3a82fe0d227fbde074c16fe Mon Sep 17 00:00:00 2001
From: hanna
Date: Mon, 4 Apr 2011 15:35:14 +0000
Subject: [PATCH] First draft of a script to measure performance of read walkers when merging dynamically.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5570 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../hanna/PrintReadsAcrossManySamples.q       | 93 +++++++++++++++++++
 scala/qscript/oneoffs/hanna/runprintreads.sh  |  7 ++
 2 files changed, 100 insertions(+)
 create mode 100644 scala/qscript/oneoffs/hanna/PrintReadsAcrossManySamples.q
 create mode 100644 scala/qscript/oneoffs/hanna/runprintreads.sh

diff --git a/scala/qscript/oneoffs/hanna/PrintReadsAcrossManySamples.q b/scala/qscript/oneoffs/hanna/PrintReadsAcrossManySamples.q
new file mode 100644
index 000000000..98769e5df
--- /dev/null
+++ b/scala/qscript/oneoffs/hanna/PrintReadsAcrossManySamples.q
@@ -0,0 +1,93 @@
+import java.io.PrintWriter
+import org.broadinstitute.sting.queue.QScript
+import org.broadinstitute.sting.queue.extensions.gatk._
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException
+
+/**
+ * A Queue pipeline that measures how PrintReads scales with the number of BAM files merged on the fly.
+ * For each step it writes the first N entries of the master BAM list to <test_case>/NNN_bams/bams.list
+ * and schedules one PrintReads job over that list, discarding the reads by writing them to /dev/null.
+ * NOTE: This code is an unsupported example for soliciting feedback on how to improve Queue.
+ * Future syntax will simplify running the GATK so please expect the syntax below to change significantly.
+ */
+class PrintReadsAcrossManySamples extends QScript {
+  // The full packaged jar should be used.
+  // You can build this jar via 'ant package' and then find it under
+  // 'Sting/dist/packages/GenomeAnalysisTK-*/GenomeAnalysisTK.jar'
+  @Input(doc="The path to the packaged GenomeAnalysisTK.jar file.", shortName="gatk")
+  var gatkJar: File = null
+
+  @Input(doc="The reference file for the bam files.", shortName="R")
+  var referenceFile: File = null
+
+  // NOTE: Do not initialize List, Set, or Option to null
+  // as you won't be able to update the collection.
+  // By default set:
+  //   List[T] = Nil
+  //   Set[T] = Set.empty[T]
+  //   Option[T] = None
+  // Exactly one file is expected here: a text file listing one BAM path per line.
+  @Input(doc="One or more bam files.", shortName="I")
+  var bamFiles: List[File] = Nil
+
+  @Input(doc="Name of the test case", fullName="test_case",required=false)
+  var testCaseName: String = "."
+
+  @Input(doc="Max number of bam files to process", fullName="max_bams",required=false)
+  var maxBams = 1
+
+  @Input(doc="Step size",fullName="step_size",required=false)
+  var stepSize = 1
+
+  // This trait allows us set the variables below in one place,
+  // and then reuse this trait on each CommandLineGATK function below.
+  trait PrintReadsAcrossManySamplesArguments extends CommandLineGATK {
+    this.jarFile = PrintReadsAcrossManySamples.this.gatkJar
+    this.reference_sequence = PrintReadsAcrossManySamples.this.referenceFile
+    this.memoryLimit = 8
+  }
+
+  /**
+   * Schedules one PrintReads job for 1, 1 + step_size, 1 + 2 * step_size, ... BAM files,
+   * never exceeding max_bams or the number of entries in the BAM list.
+   */
+  def script = {
+    if(bamFiles.size != 1)
+      throw new ReviewedStingException("-I argument must consist of exactly one file containing a list of BAM files.");
+    if(stepSize < 1)
+      throw new ReviewedStingException("--step_size must be at least 1.")
+
+    // Read the whole list up front, closing the file handle even if reading fails.
+    val source = scala.io.Source.fromFile(bamFiles(0))
+    val lines: List[String] =
+      try { source.getLines.toList }
+      finally { source.close() }
+
+    for(numBams <- 1 to math.min(maxBams,lines.size) by stepSize) {
+      val dir = new File(testCaseName + "/%03d_bams".format(numBams))
+      // mkdirs also creates a missing test case directory; fail early if the directory is unusable.
+      if(!dir.isDirectory && !dir.mkdirs())
+        throw new ReviewedStingException("Unable to create directory " + dir)
+
+      // Write the first numBams entries of the master list as this job's BAM list.
+      val file = new File(dir,"bams.list")
+      val writer = new PrintWriter(file)
+      try {
+        lines.take(numBams).foreach(bam => writer.println(bam))
+      }
+      finally {
+        writer.close()
+      }
+
+      // Create the function that we can run.
+      // The reference and GATK jar are already supplied by PrintReadsAcrossManySamplesArguments.
+      val printreads = new PrintReads with PrintReadsAcrossManySamplesArguments
+
+      printreads.jobOutputFile = new File(dir, "PrintReads.out")
+      printreads.input_file = List(file)
+      printreads.out = new File("/dev/null")
+
+      add(printreads)
+    }
+  }
+}
diff --git a/scala/qscript/oneoffs/hanna/runprintreads.sh b/scala/qscript/oneoffs/hanna/runprintreads.sh
new file mode 100644
index 000000000..daf395ed6
--- /dev/null
+++ b/scala/qscript/oneoffs/hanna/runprintreads.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+# Any extra arguments are forwarded to Queue unchanged.
+java -Djava.io.tmpdir=/broad/shptmp/hanna -jar ~/src/Sting/dist/Queue.jar \
+    --script PrintReadsAcrossManySamples.q \
+    -gatk ~/src/Sting/dist/GenomeAnalysisTK.jar \
+    -R /humgen/1kg/reference/human_g1k_v37.fasta \
+    -I ~/tests/1600samples/1kg_t2d.list --max_bams 2000 --step_size 10 -bsub -jobQueue week "$@"