java changes:

VariantEvalWalker's logger is made public, so that variant eval modules can access it through the parent object. DesignFileGenerator comment lists how best to bind things to it, and the feature accessor is better refined to grab the genome loc. (old change) scala changes: convenience addAll( List[CommandLineFunction] ) added to QScript class (and thus removed from the fCPV2) useful command line functions added to a new library package for command line functions (these are fast simple VCF command lines) bug fixed in ProjectManagement for the class where there's only one batch to be batch-merged (not really part of the use-case, but an edge-condition that came up during pipeline testing) first draft of a private mutations pipeline which will be elaborated in future git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4823 348d0f76-0448-11de-a6fe-93d51630548a
2010-12-12 05:10:45 +00:00 · 2010-12-12 05:10:45 +00:00 · cf75caf653
parent abd6ce1c77
commit cf75caf653
8 changed files with 122 additions and 7 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@ -914,5 +914,5 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
        return eval.hasGenotypes() ? eval.getNSamples() : nSamples;
    }

-    protected Logger getLogger() { return logger; }
+    public Logger getLogger() { return logger; }
 }
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java
@ -19,7 +19,7 @@ import java.io.PrintStream;
 * Takes an interval list and annotates intervals with genes and exons falling within that interval
 * Was written in order to annotate the Whole Exome Agilent designs at the Broad institute
 * Bind the refseq rod as -B refseq,refseq,/path/to/refGene.txt
- * Bind the interval list as -B interval_list,intervals,/path/to/intervals.interval_list
+ * Bind the interval list as -B interval_list,bed,/path/to/intervals.interval_list
 * Bind the additional files file as -B gene*,bed,/path/to/other/file.bed
 * @Author chartl                                                                                                                                                                                       
 * @Date Apr 26, 2010                                                                                                                                                                                   
@ -39,7 +39,7 @@ public class DesignFileGeneratorWalker extends RodWalker<Long,Long> {
            return null;
        }

-        List<Object> intervalsList= tracker.getReferenceMetaData("interval_list");
+        List<GATKFeature> intervalsList= tracker.getGATKFeatureMetaData("interval_list",false);
        List<Object> refseqList = tracker.getReferenceMetaData("refseq");
        List<GATKFeature> bedList = tracker.getGATKFeatureMetaData("gene",false);

--- a/scala/qscript/chartl/fullCallingPipelineV2.q
+++ b/scala/qscript/chartl/fullCallingPipelineV2.q
@ -61,6 +61,4 @@ class fullCallingPipelineV2 extends QScript {

    addAll(lib.StandardCallingPipeline(bamFiles,indel_vcf,recal_vcf,handfilt_vcf,fcp.pipelineArgs.target_titv,fcp.pipelineArgs.refseqTable))
  }
-
-  def addAll(clfs: List[CommandLineFunction]) = { clfs.foreach(c => add(c)) }
 }
--- a/scala/qscript/chartl/private_mutations.q
+++ b/scala/qscript/chartl/private_mutations.q
@ -0,0 +1,57 @@
+import collection.JavaConversions._
+import java.io.FileNotFoundException
+import org.broadinstitute.sting.datasources.pipeline._
+import org.broadinstitute.sting.queue.extensions.gatk.{VariantFiltration, UnifiedGenotyper}
+import org.broadinstitute.sting.queue.library.clf.vcf._
+import org.broadinstitute.sting.queue.pipeline._
+import org.broadinstitute.sting.queue.QScript
+import org.broadinstitute.sting.utils.yaml.YamlUtils
+
+class private_mutations extends QScript {
+  @Argument(shortName="yaml",fullName="eomiYaml",doc="Project YAML file",required=true) var eomiYaml: File = _
+  @Argument(shortName="sting",fullName="stingDir",doc="path to the Sting directory",required=true) var sting: String = _
+  @Argument(shortName="out",fullName="finalVCF",doc="the merged vcf to write to", required=true) var finalMergedVCF : File = _
+
+  var gatkjar : File = _
+  def script = {
+    gatkjar = new File(sting+"/dist/GenomeAnalysisTK.jar")
+    var input_pipeline : Pipeline = YamlUtils.load(classOf[Pipeline],eomiYaml)
+    var eomi_pipeline : Pipeline = new Pipeline
+    // use only QC-positive samples
+    eomi_pipeline.setProject( input_pipeline.getProject )
+    eomi_pipeline.setSamples( input_pipeline.getSamples.filter( p => p.getTags.get("QCStatus").equals("PASS")) )
+    var vcLib : VariantCalling = new VariantCalling(eomi_pipeline,gatkjar)
+    var pmLib : ProjectManagement = new ProjectManagement(sting)
+
+    /*eomi_pipeline.getSamples.foreach( p =>
+      if ( ! p.getBamFiles.get("recalibrated").exists) throw new FileNotFoundException(
+        p.getBamFiles.get("recalibrated").getAbsolutePath+" does not exist" ))*/
+
+    var batches : List[List[PipelineSample]] = eomi_pipeline.getSamples.toList.grouped(100).toList
+    var genotypers : List[UnifiedGenotyper] = batches.map( pl => pl.map( p => p.getBamFiles.get("recalibrated") ) ).zipWithIndex.map(
+    b => vcLib.StandardUnifiedGenotyper(b._1,new File(eomi_pipeline.getProject.getName+"_batch%d.raw.vcf".format(1+b._2))))
+    addAll(genotypers)
+
+    var handFilters : List[VariantFiltration] = genotypers.map( g => vcLib.StandardHandfilter(g.out,swapExt(g.out,".raw.vcf",".handfiltered.vcf")))
+
+    addAll(handFilters)
+
+    addAll(pmLib.MergeBatches(handFilters.map( _.out), batches.flatten.map( p => p.getBamFiles.get("recalibrated")),
+      finalMergedVCF,eomi_pipeline.getProject.getReferenceFile,20))
+
+    var afr_sams : List[PipelineSample] = eomi_pipeline.getSamples.toList.filter( p => p.getTags.get("Population").equals("AFRAMR"))
+    var eur_sams : List[PipelineSample] = eomi_pipeline.getSamples.toList.filter( p => p.getTags.get("Population").equals("EURAMR") ||
+      p.getTags.get("Population").equals("UNK"))
+
+    var variant_loci : VCFExtractIntervals = new VCFExtractIntervals(finalMergedVCF,swapExt(finalMergedVCF,".vcf",".intervals.list"),false)
+
+    add(variant_loci)
+
+    var extract_afr : VCFExtractSamples = new VCFExtractSamples(finalMergedVCF,swapExt(finalMergedVCF,".vcf",".afr.vcf"),afr_sams.map(_.getId))
+    var extract_eur : VCFExtractSamples = new VCFExtractSamples(finalMergedVCF,swapExt(finalMergedVCF,".vcf",".eur+unk.vcf"),eur_sams.map(_.getId))
+
+    add(extract_afr)
+    add(extract_eur)
+  }
+  
+}
--- a/scala/src/org/broadinstitute/sting/queue/QScript.scala
+++ b/scala/src/org/broadinstitute/sting/queue/QScript.scala
@ -69,6 +69,10 @@ trait QScript extends Logging {
    functions.foreach(function => function.addOrder = QScript.nextAddOrder)
    this.functions ++= functions
  }
+
+  def addAll(functions: List[QFunction] ) = {
+    functions.foreach( f => add(f) )
+  }
 }

 object QScript {
--- a/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala
+++ b/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractIntervals.scala
@ -0,0 +1,21 @@
+package org.broadinstitute.sting.queue.library.clf.vcf
+
+import java.io.File
+import org.broadinstitute.sting.commandline.{Argument, Output, Input}
+import org.broadinstitute.sting.queue.function.CommandLineFunction
+
+class VCFExtractIntervals(inVCF: File, outList: File, passOnly: Boolean ) extends CommandLineFunction {
+  def this(vcf: File, list: File ) = this(vcf,list,false)
+  def this(vcf: File) = this(vcf, new File(vcf.getAbsolutePath.replace("vcf","intervals.list")),false)
+  def this(vcf: File, removeFilters: Boolean) = this(vcf, new File(vcf.getAbsolutePath.replace("vcf","intervals.list")), removeFilters)
+
+  @Input(doc="The VCF from which to extract an interval list") var inputVCF : File = inVCF
+  @Output(doc="The file to write the interval list to") var outputList : File = outList
+  @Argument(doc="Whether to use all sites, or only the unfiltered sites") var usePFOnly: Boolean = passOnly
+
+  def commandLine = {
+    if ( usePFOnly ) "grep PASS %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
+    else "grep -v \\\\# %s | awk '{print $1\":\"$2}' > %s".format(inputVCF.getAbsolutePath,outputList.getAbsolutePath)
+  }
+
+}
--- a/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala
+++ b/scala/src/org/broadinstitute/sting/queue/library/clf/vcf/VCFExtractSamples.scala
@ -0,0 +1,33 @@
+package org.broadinstitute.sting.queue.library.clf.vcf
+
+import java.io.File
+import collection.JavaConversions._
+import org.broadinstitute.sting.commandline._
+import org.broadinstitute.sting.queue.function.CommandLineFunction
+import org.broadinstitute.sting.utils.text.XReadLines
+
+class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends CommandLineFunction {
+  @Input(doc="input VCF from which to extract samples") var inputVCF : File = inVCF
+  @Output(doc="output VCF to write extracted samples to") var outputVCF : File = outVCF
+  @Argument(doc="List of samples to extract from the VCF") var sampleList : List[String] = samples
+
+  var sampleGrep : String = _
+
+  def this(in: File, out: File, samples: File) = this(in,out, (new XReadLines(samples)).readLines.toList)
+
+  override def freezeFieldValues = {
+    this.logger.warn("Note: Using VCFExtractSamples invalidates AC/AF/AN annotations. This is an explicit warning.")
+    sampleGrep = "\"" + sampleList.foldLeft[String](sampleList.get(0))( (a,b) => a + "|" + b) + "\""
+    super.freezeFieldValues
+  }
+
+  def commandLine = {
+
+    var first : String = "head -n 500 %s | grep \\\\#\\\\# > %s".format(inputVCF.getAbsolutePath,outputVCF.getAbsolutePath)
+    var second : String = "head -n 500 %s | grep \\\\#CHR | tr '\\t' '\\n' | awk '{print NF\"\\t\"$1}' ".format(inputVCF)
+    second += "| egrep %s | awk '{print $1}' | tr '\\n' ',' | xargs -i cut -f1-9,\\{\\} %s | grep -v \\\\#\\\\# >> %s".format(sampleGrep,inputVCF.getAbsolutePath,outputVCF.getAbsolutePath)
+
+    first+" ; "+second
+  }
+
+}
--- a/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala
+++ b/scala/src/org/broadinstitute/sting/queue/pipeline/ProjectManagement.scala
@ -16,6 +16,8 @@ class ProjectManagement(stingPath: String) {

  var stingDirPath : String = stingPath

+  def this(f : File) = this(f.getAbsolutePath)
+
  class PassFilterAlleles(vcf_files: List[File], out_list: File) extends CommandLineFunction {
    @Input(doc="List of VCFs to extract PF sites from") var vcfs = vcf_files
    @Output(doc="The file to write the site list to") var out_vcf = out_list
@ -24,7 +26,7 @@ class ProjectManagement(stingPath: String) {

    def commandLine = {
      "egrep \"FORMAT|format\" %s | cut -f1-8 > %s ; grep PASS %s | tr ':' '\\t' | awk '{print $2\"\\t\"$3\"\\t\"$4\"\\t\"$5\"\\t\"$6\"\\t.\\t.\\t.\"}' | sort -n -k2,2 | uniq | perl %s - %s.fai >> %s".format(
-        vcf_files(1).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath
+        vcf_files(0).getAbsolutePath, out_vcf.getAbsolutePath, vcf_files.foldLeft[String]("")( (b,a) => b + " " + a.getAbsolutePath), sortByRef, ref.getAbsolutePath, out_vcf.getAbsolutePath
        )
    }
  }
@ -50,7 +52,7 @@ class ProjectManagement(stingPath: String) {

    cmds :+= ints
    
-    var calcs: List[UGCalcLikelihoods] = allBams.grouped(size).toList.zipWithIndex.map(u => LikelihoodCalc(u._1,ref,ints.intervals,pfSites.out_vcf, new File("batch%d.likelihoods.vcf".format(u._2))))
+    var calcs: List[UGCalcLikelihoods] = allBams.grouped(size).toList.zipWithIndex.map(u => LikelihoodCalc(u._1,ref,ints.intervals,pfSites.out_vcf, new File("MBatch%d.likelihoods.vcf".format(u._2))))

    cmds ++= calcs