Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/ebanks/Sting_rodrefactor into rodrewrite

2011-09-07 10:50:00 -04:00 · 2011-09-07 10:50:00 -04:00 · a02636a1ac
parent d471617c65 9559115ad5
commit a02636a1ac
8 changed files with 62 additions and 21 deletions
--- a/public/R/queueJobReport.R
+++ b/public/R/queueJobReport.R
@ -12,7 +12,9 @@ if ( onCMDLine ) {
  inputFileName = args[1]
  outputPDF = args[2]
 } else {
-  inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
+  #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
+  inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
+  #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
  outputPDF = NA
 }

@ -113,11 +115,22 @@ plotGroup <- function(groupTable) {
  textplot(as.data.frame(sum), show.rownames=F)
  title(paste("Job summary for", name, "itemizing each iteration"), cex=3)

+  # histogram of job times by groupAnnotations
+  if ( length(groupAnnotations) == 1 && dim(sub)[1] > 1 ) {
+    # todo -- how do we group by annotations?
+    p <- ggplot(data=sub, aes(x=runtime)) + geom_histogram()
+    p <- p + xlab("runtime in seconds") + ylab("No. of jobs")
+    p <- p + opts(title=paste("Job runtime histogram for", name))
+    print(p)
+  }
+  
  # as above, but averaging over all iterations
  groupAnnotationsNoIteration = setdiff(groupAnnotations, "iteration")
-  sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
-  textplot(as.data.frame(sum), show.rownames=F)
-  title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
+  if ( dim(sub)[1] > 1 ) {
+    sum = cast(melt(sub, id.vars=groupAnnotationsNoIteration, measure.vars=c("runtime")), ... ~ ., fun.aggregate=c(mean, sd))
+    textplot(as.data.frame(sum), show.rownames=F)
+    title(paste("Job summary for", name, "averaging over all iterations"), cex=3)
+  }
 }
    
 # print out some useful basic information
@ -146,7 +159,7 @@ plotJobsGantt(gatkReportData, T)
 plotJobsGantt(gatkReportData, F)
 plotProgressByTime(gatkReportData)
 for ( group in gatkReportData ) {
-  plotGroup(group)
+ plotGroup(group)
 }
  
 if ( ! is.na(outputPDF) ) {
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@ -52,6 +52,11 @@ public class UnifiedArgumentCollection {
    @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
    public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;

+    /**
+     * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
+     * distinguish between PCR errors vs. sequencing errors.  The practical implication for this value is that it
+     * effectively acts as a cap on the base qualities.
+     */
    @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false)
    public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE;

--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@ -214,7 +214,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
    @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
    public Set<String> sampleExpressions ;

-    @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
+    @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
    public Set<File> sampleFiles;

    /**
@ -226,7 +226,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
    /**
     * Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
     */
-    @Argument(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false)
+    @Input(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false)
    public Set<File> XLsampleFiles = new HashSet<File>(0);

    /**
--- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java
+++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java
@ -252,7 +252,8 @@ public class ClippingOp {
        if (start == 0 && stop == read.getReadLength() -1)
            return new SAMRecord(read.getHeader());

-        CigarShift cigarShift = hardClipCigar(read.getCigar(), start, stop);
+        // If the read is unmapped there is no Cigar string and neither should we create a new cigar string
+        CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop);

        // the cigar may force a shift left or right (or both) in case we are left with insertions
        // starting or ending the read after applying the hard clip on start/stop.
--- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java
+++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java
@ -82,7 +82,7 @@ public class PileupElement {
    // --------------------------------------------------------------------------

    private Integer getReducedReadQualityTagValue() {
-        return (Integer)getRead().getAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG);
+        return getRead().getIntegerAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG);
    }

    public boolean isReducedRead() {
--- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java
@ -41,6 +41,10 @@ public class GATKSAMRecord extends SAMRecord {
    // because some values can be null, we don't want to duplicate effort
    private boolean retrievedReadGroup = false;

+    /** A private cache for the reduced read quality.  Null indicates the value hasn't be fetched yet or isn't available */
+    private boolean lookedUpReducedReadQuality = false;
+    private Integer reducedReadQuality;
+
    // These temporary attributes were added here to make life easier for
    // certain algorithms by providing a way to label or attach arbitrary data to
    // individual GATKSAMRecords.
@ -338,7 +342,17 @@ public class GATKSAMRecord extends SAMRecord {

    public Object getAttribute(final String tag) { return mRecord.getAttribute(tag); }

-    public Integer getIntegerAttribute(final String tag) { return mRecord.getIntegerAttribute(tag); }
+    public Integer getIntegerAttribute(final String tag) {
+        if ( tag == ReadUtils.REDUCED_READ_QUALITY_TAG ) {
+            if ( ! lookedUpReducedReadQuality ) {
+                lookedUpReducedReadQuality = true;
+                reducedReadQuality = mRecord.getIntegerAttribute(tag);
+            }
+            return reducedReadQuality;
+        } else {
+            return mRecord.getIntegerAttribute(tag);
+        }
+    }

    public Short getShortAttribute(final String tag) { return mRecord.getShortAttribute(tag); }

--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
@ -11,7 +11,7 @@ import net.sf.samtools.SAMFileReader
 import net.sf.samtools.SAMFileHeader.SortOrder

 import org.broadinstitute.sting.queue.util.QScriptUtils
-import org.broadinstitute.sting.queue.function.{CommandLineFunction, ListWriterFunction}
+import org.broadinstitute.sting.queue.function.ListWriterFunction

 class DataProcessingPipeline extends QScript {
  qscript =>
@ -31,7 +31,7 @@ class DataProcessingPipeline extends QScript {
  var reference: File = _

  @Input(doc="dbsnp ROD to use (must be in VCF format)", fullName="dbsnp", shortName="D", required=true)
-  var dbSNP: File = _
+  var dbSNP: List[File] = List()

  /****************************************************************************
  * Optional Parameters
@ -43,7 +43,7 @@ class DataProcessingPipeline extends QScript {
 //

  @Input(doc="extra VCF files to use as reference indels for Indel Realignment", fullName="extra_indels", shortName="indels", required=false)
-  var indels: File = _
+  var indels: List[File] = List()

  @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false)
  var bwaPath: File = _
@ -159,7 +159,7 @@ class DataProcessingPipeline extends QScript {
    for (rg <- readGroups) {
      val intermediateInBam: File = if (index == readGroups.length) { inBam } else { swapExt(outBam, ".bam", index+1 + "-rg.bam") }
      val intermediateOutBam: File = if (index > 1) {swapExt(outBam, ".bam", index + "-rg.bam") } else { outBam}
-      val readGroup = new ReadGroup(rg.getReadGroupId, rg.getPlatform, rg.getLibrary, rg.getPlatformUnit, rg.getSample, rg.getSequencingCenter, rg.getDescription)
+      val readGroup = new ReadGroup(rg.getReadGroupId, rg.getLibrary, rg.getPlatform, rg.getPlatformUnit, rg.getSample, rg.getSequencingCenter, rg.getDescription)
      add(addReadGroup(intermediateInBam, intermediateOutBam, readGroup))
      index = index - 1
    }
@ -321,9 +321,9 @@ class DataProcessingPipeline extends QScript {
      this.input_file = inBams
    this.out = outIntervals
    this.mismatchFraction = 0.0
-    this.known :+= qscript.dbSNP
+    this.known ++= qscript.dbSNP
    if (indels != null)
-      this.known :+= qscript.indels
+      this.known ++= qscript.indels
    this.scatterCount = nContigs
    this.analysisName = queueLogDir + outIntervals + ".target"
    this.jobName = queueLogDir + outIntervals + ".target"
@ -333,9 +333,9 @@ class DataProcessingPipeline extends QScript {
    this.input_file = inBams
    this.targetIntervals = tIntervals
    this.out = outBam
-    this.known :+= qscript.dbSNP
+    this.known ++= qscript.dbSNP
    if (qscript.indels != null)
-      this.known :+= qscript.indels
+      this.known ++= qscript.indels
    this.consensusDeterminationModel = cleanModelEnum
    this.compress = 0
    this.scatterCount = nContigs
@ -344,7 +344,7 @@ class DataProcessingPipeline extends QScript {
  }

  case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
-    this.knownSites :+= qscript.dbSNP
+    this.knownSites ++= qscript.dbSNP
    this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
    this.input_file :+= inBam
    this.recal_file = outRecalFile
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
@ -1,6 +1,5 @@
 package org.broadinstitute.sting.queue.qscripts

-import org.broadinstitute.sting.commandline.Hidden
 import org.broadinstitute.sting.queue.extensions.gatk._
 import org.broadinstitute.sting.queue.QScript
 import org.broadinstitute.sting.gatk.phonehome.GATKRunReport
@ -70,7 +69,8 @@ class MethodsDevelopmentCallingPipeline extends QScript {
    val goldStandardClusterFile = new File(goldStandardName + ".clusters")
  }

-  val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")  
+  val b37_decoy = new File("/humgen/1kg/reference/human_g1k_v37_decoy.fasta")
+  val hg19 = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")
  val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
  val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta")
  val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
@ -124,6 +124,14 @@ class MethodsDevelopmentCallingPipeline extends QScript {
              new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.bwa.cleaned.recal.bam"),
              new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"),                  // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
              "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3),
+    "WExTrioDecoy" -> new Target("CEUTrio.HiSeq.WEx.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
+              new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WEx.b37_decoy.list"),
+              new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"),                  // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
+              "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 3.3, 98.0, !lowPass, exome, 3),
+    "WGSTrioDecoy" -> new Target("CEUTrio.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
+              new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.list"),
+              new File("/humgen/gsa-hpprojects/dev/carneiro/trio/analysis/snps/CEUTrio.WEx.filtered.vcf"),                  // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
+              "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3),
    "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
              new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"),
              new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"),         // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **