From 3c5b8730d5b13e2d307c2e872b3f01c1fe23bbf6 Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 12 Sep 2010 14:04:10 +0000 Subject: [PATCH] More Queue scripts for analysis git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4260 348d0f76-0448-11de-a6fe-93d51630548a --- scala/qscript/depristo/1kg_table1.scala | 24 ++++++---- .../depristo/manySampleUGPerformance.scala | 19 ++++++++ .../depristo/resequencingSamples1KG.scala | 44 +++++++++++++++++++ 3 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 scala/qscript/depristo/resequencingSamples1KG.scala diff --git a/scala/qscript/depristo/1kg_table1.scala b/scala/qscript/depristo/1kg_table1.scala index e307390ad..2c6968539 100755 --- a/scala/qscript/depristo/1kg_table1.scala +++ b/scala/qscript/depristo/1kg_table1.scala @@ -38,6 +38,8 @@ class Target(project: String, snpVCF: String, indelVCF: String, calledGenome: Do def getIndelEval = getEval("indels") } +val RELEASE = "/humgen/1kg/DCC/ftp/release/2010_07/" + var targets: List[Target] = List() val p1Targets = List(("CEU", 2.43e9), ("YRI", 2.39e9), ("CHBJPT", 2.41e9)) @@ -48,12 +50,12 @@ for ( (pop: String,called) <- p1Targets ) // pilot 2 val p2Targets = List(("CEU", 2.264e9), ("YRI", 2.214e9)) for ( (pop: String, called) <- p2Targets ) - targets ::= new Target("SRP000032", "/humgen/gsa-hpprojects/1kg/releases/pilot_paper_calls/trio/snps/" + pop + ".trio.2010_03.genotypes.vcf.gz", "v1/dindel-v2/"+pop+".trio.2010_06.indel.genotypes.vcf", called, 2.85e9, pop, "pilot2") + targets ::= new Target("SRP000032", RELEASE + "trio/snps/" + pop + ".trio.2010_03.genotypes.vcf.gz", "v1/dindel-v2/"+pop+".trio.2010_06.indel.genotypes.vcf", called, 2.85e9, pop, "pilot2") // pilot 3 for (pop <- List("CEU", "CHB", "CHD", "JPT", "LWK", "TSI", "YRI")) { - val indels = if ( pop != "LWK" ) "/humgen/gsa-hpprojects/1kg/releases/pilot_paper_calls/exon/indel/"+pop+".exon.2010_06.genotypes.vcf.gz" else null - targets ::= new Target("SRP000033", "/humgen/gsa-hpprojects/1kg/releases/pilot_paper_calls/exon/snps/" + pop + ".exon.2010_03.genotypes.vcf.gz", indels, 1.43e6, 1.43e6, pop, "pilot3", "/humgen/gsa-hpprojects/1kg/1kg_pilot3/useTheseBamsForAnalysis/pilot3.%s.cleaned.bam".format(pop)) + val indels = if ( pop != "LWK" ) "exon/indel/"+pop+".exon.2010_06.genotypes.vcf.gz" else null + targets ::= new Target("SRP000033", "exon/snps/" + pop + ".exon.2010_03.genotypes.vcf.gz", indels, 1.43e6, 1.43e6, pop, "pilot3", "/humgen/gsa-hpprojects/1kg/1kg_pilot3/useTheseBamsForAnalysis/pilot3.%s.cleaned.bam".format(pop)) } // merged files @@ -72,9 +74,9 @@ def script = stage match { case "ALL" => // initial pilot1 merge -- autosomes + x for ( (pop: String,called) <- p1Targets ) { - val auto = "/humgen/gsa-hpprojects/1kg/releases/pilot_paper_calls/low_coverage/snps/"+ pop +".low_coverage.2010_07.genotypes.vcf.gz" + val auto = RELEASE + "low_coverage/snps/"+ pop +".low_coverage.2010_07.genotypes.vcf.gz" // todo -- remove fixed when Laura gives us the official calls - val x = "/humgen/gsa-hpprojects/1kg/releases/pilot_paper_calls/low_coverage/snps/"+ pop +".low_coverage.2010_07.xchr.fixed.genotypes.vcf" + val x = RELEASE + "low_coverage/snps/"+ pop +".low_coverage.2010_07.xchr.fixed.genotypes.vcf" val combineSNPs = new Combine(List(auto, x), pop + ".pilot1.vcf") add(combineSNPs) } @@ -98,12 +100,14 @@ def script = stage match { //add(new Combine(pilots.map(_ + ".snps.merged.vcf"), snps)) add(new Combine(pilots.map(_ + ".indels.merged.vcf"), indels)) + + case "EVAL" => // VariantEval of the SNPs - //for (target <- targets) { - // add(new VariantEval(target.getSNPVCF, target.getSNPEval)) - // add(new StatPop(target)) - //} + for (target <- targets) { + add(new VariantEval(target.getSNPVCF, target.getSNPEval)) + //add(new StatPop(target)) + } case "DOC" => for (target <- targets) { @@ -124,7 +128,9 @@ class VariantEval(vcfIn: String, evalOut: String, vcfType: String = "VCF") exten this.out = new File(evalOut) this.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod") this.reportType = Some(VE2TemplateType.Grep) + this.noStandard = true; this.evalModule :+= "CompOverlap" + this.memoryLimit = Some(3) override def dotString = "VariantEval: " + vcfFile.getName } diff --git a/scala/qscript/depristo/manySampleUGPerformance.scala b/scala/qscript/depristo/manySampleUGPerformance.scala index a985024b2..e265b6961 100755 --- a/scala/qscript/depristo/manySampleUGPerformance.scala +++ b/scala/qscript/depristo/manySampleUGPerformance.scala @@ -32,8 +32,19 @@ def script = { add(sublist) add(mergeSublist) add(new Index(mergeSublist.o) ) + + // SNP calling add(new Call(sublist.list, nSamples, "dynamic_merge")) add(new Call(mergeSublist.o, nSamples, "pre_merge")) + + // SNP calling -- no annotations + add(new Call(sublist.list, nSamples, "dynamic_merge_no_annotations") { this.G :+= "None"; }) + add(new Call(mergeSublist.o, nSamples, "pre_merge_no_annotations") { this.G :+= "None"; }) + + // CountLoci + add(new MyCountLoci(sublist.list, nSamples, "dynamic_merge")) + add(new MyCountLoci(mergeSublist.o, nSamples, "pre_merge")) + } } @@ -57,6 +68,14 @@ class Call(@Input(doc="foo") bamList: File, n: Int, name: String) extends Unifie this.o = outVCF } + class MyCountLoci(@Input(doc="foo") bamList: File, n: Int, name: String) extends CountLoci with UNIVERSAL_GATK_ARGS { + @Output(doc="foo") var outFile: File = new File("%s.%d.%s.txt".format(bamList.getName, n, name)) + this.memoryLimit = Some(4) + this.input_file :+= bamList + this.jobQueue = "gsa" + this.o = outFile + } + class SliceList(n: Int) extends CommandLineFunction { @Output(doc="foo") var list: File = new File("bams.%d.list".format(n)) this.jobQueue = "gsa" diff --git a/scala/qscript/depristo/resequencingSamples1KG.scala b/scala/qscript/depristo/resequencingSamples1KG.scala new file mode 100644 index 000000000..61a3838a7 --- /dev/null +++ b/scala/qscript/depristo/resequencingSamples1KG.scala @@ -0,0 +1,44 @@ +import java.io.File +import org.broadinstitute.sting.commandline.Argument +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.extensions.samtools._ + +class ManySampleUGPerformanceTesting extends QScript { + @Argument(doc="gatkJarFile", required=false) + var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar") + + @Argument(shortName = "R", doc="ref", required=false) + var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + + val TARGET_INTERVAL = "my.intervals" + val TEST_BAM_LIST = new File("ten.bam.list") + val FULL_BAM_LIST = new File("/humgen/1kg/processing/allPopulations_chr20_june_release/allPopulations.june.bam.list") + val BAM_LIST = FULL_BAM_LIST + val HM3 = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr.hg19_fwd.vcf") + + trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { + this.logging_level = "INFO"; + this.jarFile = gatkJarFile; + this.intervals = new File(TARGET_INTERVAL); + this.reference_sequence = referenceFile; + this.jobQueue = "gsa"; + this.et = Option(org.broadinstitute.sting.gatk.phonehome.GATKRunReport.PhoneHomeOption.STANDARD); + this.dcov = Option(50); + } + + def script = { + // SNP calling + add(new MyQSample(BAM_LIST)); + } + + class MyQSample(@Input(doc="foo") bamList: File) extends QSample with UNIVERSAL_GATK_ARGS { + this.memoryLimit = Some(4) + this.input_file :+= bamList + //this.BTI = "genotypes" + this.nt = Option(10) + this.rodBind :+= RodBind("genotypes", "VCF", HM3) + this.o = new File("%s.qsample".format(bamList.getName)) + } +} +