From 596c1723aeec43ebf2feb7ede387b1954910736c Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 25 Oct 2012 10:35:43 -0400 Subject: [PATCH 02/49] Hidden, unsupported ability of VariantEval to run AlleleCount stratification on sites-only VCFs. I'll expose it/add tests on it if people think this is generally useful. User needs to specify total # of samples as command line argument since genotypes are not available. Also, fixes to large-scale validation script: lower -minIndelFrac threshold or else we'll kill most indels since default 0.25 is too high for pools, fix also VE stratifications and add one VE run where eval=1KG, comp=pool data and AC stratification based on 1KG annotation --- .../sting/gatk/walkers/varianteval/VariantEval.java | 12 ++++++++++++ .../varianteval/stratifications/AlleleCount.java | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index a73e125ad..201028d99 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -183,6 +183,10 @@ public class VariantEval extends RodWalker implements TreeRedu @Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false) private boolean keepSitesWithAC0 = false; + @Hidden + @Argument(fullName="numSamples", shortName="numSamples", doc="Total number of samples in the eval VCF; used by the AlleleCount stratification when the eval is a sites-only VCF without genotypes", required=false) + private int numSamplesFromArgument = 0; + /** * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying * variant set, and evaluate the union of 
the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc. @@ -589,6 +593,14 @@ public class VariantEval extends RodWalker implements TreeRedu public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; } public Set getSampleNamesForEvaluation() { return sampleNamesForEvaluation; } + public int getNumberOfSamplesForEvaluation() { + if (sampleNamesForEvaluation!= null && !sampleNamesForEvaluation.isEmpty()) + return sampleNamesForEvaluation.size(); + else { + return numSamplesFromArgument; + } + + } public Set getSampleNamesForStratification() { return sampleNamesForStratification; } public List> getComps() { return comps; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index e6efd4482..7197fc14c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -29,7 +29,7 @@ public class AlleleCount extends VariantStratifier { // There are ploidy x n sample chromosomes // TODO -- generalize to handle multiple ploidy - nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy(); + nchrom = getVariantEvalWalker().getNumberOfSamplesForEvaluation() * getVariantEvalWalker().getSamplePloidy(); if ( nchrom < 2 ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample"); From cde4f037d3a79802569d06134358d9c7ed14a8ce Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Thu, 25 Oct 2012 16:18:25 -0400 Subject: [PATCH 03/49] Begin moving XHMM scripts to public --- .../queue/qscripts/CNV/xhmmCNVpipeline.scala | 499 ++++++++++++++++++ .../sting/queue/util/DoC/package.scala | 123 +++++ 2 files changed, 622 
insertions(+) create mode 100644 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala create mode 100644 public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala new file mode 100644 index 000000000..362337c84 --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -0,0 +1,499 @@ +package org.broadinstitute.sting.queue.qscripts.CNV + +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.util.VCF_BAM_utilities +import org.broadinstitute.sting.queue.util.DoC._ +import org.broadinstitute.sting.commandline.Hidden +import java.io.{PrintStream, PrintWriter} +import org.broadinstitute.sting.utils.text.XReadLines +import collection.JavaConversions._ + +class xhmmCNVpipeline extends QScript { + qscript => + + @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) + var bams: File = _ + + @Argument(doc = "gatk jar file", shortName = "J", required = true) + var gatkJarFile: File = _ + + @Argument(doc = "xhmm executable file", shortName = "xhmmExec", required = true) + var xhmmExec: File = _ + + @Argument(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) + var pseqExec: File = _ + + @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true) + var pseqSeqDB: String = _ + + @Argument(shortName = "R", doc = "ref", required = true) + var referenceFile: File = _ + + @Argument(shortName = "L", doc = "Intervals", required = false) + var intervals: File = _ + + @Input(doc = "level of parallelism for BAM DoC. 
By default is set to 0 [no scattering].", shortName = "scatter", required = false) + var scatterCountInput = 0 + + @Input(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) + var samplesPerJob = 1 + + @Output(doc = "Base name for files to output", shortName = "o", required = true) + var outputBase: File = _ + + @Input(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) + var MAX_DEPTH = 20000 + + @Hidden + @Input(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) + var NUM_BINS = 200 + + @Hidden + @Input(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) + var START_BIN = 1 + + @Input(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) + var minMappingQuality = 0 + + @Input(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) + var wholeMatrixMemory = -1 + + @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false) + var minTargGC : Double = 0.1 + + @Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false) + var maxTargGC : Double = 0.9 + + @Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false) + var minTargRepeats : Double = 0.0 + + @Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases greater than this value", required = false) + var maxTargRepeats : Double = 0.1 + + @Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired sample IDs", required = false) + var sampleIDsMap: String = "" + + @Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false) + 
var sampleIDsMapFromColumn = 1 + + @Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false) + var sampleIDsMapToColumn = 2 + + @Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false) + var targetSampleFiltersString: String = "" + + @Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false) + var PCAnormalizeMethodString: String = "" + + @Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false) + var targetSampleNormalizedFiltersString: String = "" + + @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true) + var xhmmParamsArg: File = _ + + @Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false) + var discoverCommandLineParams: String = "" + + @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false) + var genotypeCommandLineParams: String = "" + + @Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false) + var genotypeSubsegments: Boolean = false + + @Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false) + var maxTargetsInSubsegment = 30 + + @Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false) + var subsegmentGenotypeThreshold = 20.0 + + @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false) + var longJobQueue: String = "" + + + val 
PREPARED_TARGS_SUFFIX: String = ".merged.interval_list" + + val RD_OUTPUT_SUFFIX: String = ".RD.txt" + + val TARGS_GC_SUFFIX = ".locus_GC.txt" + val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt" + + val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt" + val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt" + + val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt" + val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt" + + + trait WholeMatrixMemoryLimit extends CommandLineFunction { + // Since loading ALL of the data can take significant memory: + if (wholeMatrixMemory < 0) { + this.memoryLimit = 24 + } + else { + this.memoryLimit = wholeMatrixMemory + } + } + + trait LongRunTime extends CommandLineFunction { + if (longJobQueue != "") + this.jobQueue = longJobQueue + } + + def script = { + val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile) + add(prepTargets) + + trait CommandLineGATKArgs extends CommandLineGATK { + this.intervals :+= prepTargets.out + this.jarFile = qscript.gatkJarFile + this.reference_sequence = qscript.referenceFile + this.logging_level = "INFO" + } + + val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams)) + val samples: List[String] = sampleToBams.keys.toList + Console.out.printf("Samples are %s%n", samples) + + val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase) + var docs: List[DoC] = List[DoC]() + for (group <- groups) { + Console.out.printf("Group is %s%n", group) + docs ::= new DoC(group.bams, group.DoC_output, MAX_DEPTH, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs + } + addAll(docs) + + val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", 
xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit + add(mergeDepths) + + var excludeTargets : List[File] = List[File]() + if (minTargGC > 0 || maxTargGC < 1) { + val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs + calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX + add(calcGCcontents) + + val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC) + add(excludeTargetsBasedOnGC) + excludeTargets ::= excludeTargetsBasedOnGC.out + } + + class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction { + @Input(doc="") + var intervals: File = prepTargets.out + + @Output(doc="") + var out : File = new File(outFile) + + val regFile : String = outputBase.getPath + ".targets.reg" + val locDB : String = outputBase.getPath + ".targets.LOCDB" + + val removeFiles = "rm -f " + regFile + " " + locDB + val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile + val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load" + val calcRepeatMaskedPercent = pseqExec + " . 
loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats" + val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out + + var command: String = + removeFiles + + " && " + createRegFile + + " && " + createLOCDB + + " && " + calcRepeatMaskedPercent + + " && " + extractRepeatMaskedPercent + + def commandLine = command + + override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command + } + + if (minTargRepeats > 0 || maxTargRepeats < 1) { + val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX) + add(calcRepeatComplexity) + + val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats) + add(excludeTargetsBasedOnRepeats) + excludeTargets ::= excludeTargetsBasedOnRepeats.out + } + + val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets) + add(filterCenterDepths) + + val pca = new PCA(filterCenterDepths.filteredCentered) + add(pca) + + val normalize = new Normalize(pca) + add(normalize) + + val filterZscore = new FilterAndZscoreNormalized(normalize.normalized) + add(filterZscore) + + val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore) + add(filterOriginal) + + val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered) + add(discover) + + val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotype) + + if (genotypeSubsegments) { + val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotypeSegs) + } + } + + class 
ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction { + @Input(doc="") + var locus_value : File = locus_valueIn + + @Output(doc="") + var out : File = new File(outputBase.getPath + outSuffix) + + def run = { + var outWriter = new PrintWriter(new PrintStream(out)) + var elems = asScalaIterator(new XReadLines(locus_value)) + + while (elems.hasNext) { + val line = elems.next + val splitLine = line.split("\\s+") + val locus = splitLine(0) + val locValStr = splitLine(1) + try { + val locVal = locValStr.toDouble + if (locVal < minVal || locVal > maxVal) + outWriter.printf("%s%n", locus) + } + catch { + case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus) + case e: Exception => throw e + } + } + + outWriter.close + } + } + + class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val excludeTargets = excludeTargetsIn + + @Output + val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType target" + + " -o " + filteredCentered + + " --outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + filteredSamples + command += excludeTargets.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + if (targetSampleFiltersString != "") + command += " " + targetSampleFiltersString + + def commandLine = command + + override def description = "Filters samples and targets and then mean-centers the targets: " + command + } + + class PCA(inputParam: File) 
extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + val PCAbase: String = outputBase.getPath + ".RD_PCA" + + @Output + val outPC: File = new File(PCAbase + ".PC.txt") + @Output + val outPC_SD: File = new File(PCAbase + ".PC_SD.txt") + @Output + val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt") + + var command: String = + xhmmExec + " --PCA" + + " -r " + input + + " --PCAfiles " + PCAbase + + def commandLine = command + + override def description = "Runs PCA on mean-centered data: " + command + } + + class Normalize(pca: PCA) extends CommandLineFunction { + @Input(doc = "") + val input = pca.input + + @Input(doc = "") + val inPC = pca.outPC + + @Input(doc = "") + val inPC_SD = pca.outPC_SD + + @Input(doc = "") + val inPC_LOADINGS = pca.outPC_LOADINGS + + @Output + val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt") + + var command: String = + xhmmExec + " --normalize" + + " -r " + input + + " --PCAfiles " + pca.PCAbase + + " --normalizeOutput " + normalized + if (PCAnormalizeMethodString != "") + command += " " + PCAnormalizeMethodString + + def commandLine = command + + override def description = "Normalizes mean-centered data using PCA information: " + command + } + + class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + @Output + val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType sample --zScoreData" + + " -o " + filteredZscored + + " --outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + 
filteredSamples + if (targetSampleNormalizedFiltersString != "") + command += " " + targetSampleNormalizedFiltersString + + def commandLine = command + + override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command + } + + class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u)) + + @Input(doc = "") + val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u)) + + @Output + val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + targFilters.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + + sampFilters.map(u => " --excludeSamples " + u).reduceLeft(_ + "" + _) + + " -o " + sameFiltered + + def commandLine = command + + override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command + } + + class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Output + val xcnv: File = new File(outputBase.getPath + ".xcnv") + + @Output + val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv") + + val posteriorsBase = outputBase.getPath + + @Output + val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt") + + @Output + val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt") + + @Output + val dupPosteriors: File = new File(posteriorsBase + ".posteriors.DUP.txt") + + var command: String = + xhmmExec + " --discover" 
+ + " -p " + xhmmParams + + " -r " + input + + " -R " + origRD + + " -c " + xcnv + + " -a " + aux_xcnv + + " -s " + posteriorsBase + + " " + discoverCommandLineParams + + def commandLine = command + + override def description = "Discovers CNVs in normalized data: " + command + } + + abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Input(doc = "") + val inXcnv = xcnv + + var command: String = + xhmmExec + " --genotype" + + " -p " + xhmmParams + + " -r " + input + + " -g " + inXcnv + + " -F " + referenceFile + + " -R " + origRD + + " " + genotypeCommandLineParams + } + + class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".vcf") + + command += + " -v " + vcf + + def commandLine = command + + override def description = "Genotypes discovered CNVs in all samples: " + command + } + + class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".subsegments.vcf") + + command += + " -v " + vcf + + " --subsegments" + + " --maxTargetsInSubsegment " + maxTargetsInSubsegment + + " --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold + + def commandLine = command + + override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command + } +} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala new file mode 100644 index 000000000..f35db4aa3 --- /dev/null +++ 
b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala @@ -0,0 +1,123 @@ +package org.broadinstitute.sting.queue.util + +import java.io.File +import org.broadinstitute.sting.queue.extensions.gatk.{IntervalScatterFunction, CommandLineGATK} +import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction +import org.broadinstitute.sting.gatk.downsampling.DownsampleType +import org.broadinstitute.sting.commandline.{Input, Gather, Output} +import org.broadinstitute.sting.queue.function.CommandLineFunction + +package object DoC { + class DoC(val bams: List[File], val DoC_output: File, val MAX_DEPTH: Int, val minMappingQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction { + val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary" + + // So that the output files of this DoC run get deleted once they're used further downstream: + this.isIntermediate = true + + this.analysis_type = "DepthOfCoverage" + + this.input_file = bams + + this.downsample_to_coverage = Some(MAX_DEPTH) + this.downsampling_type = DownsampleType.BY_SAMPLE + + this.scatterCount = scatterCountInput + this.scatterClass = classOf[IntervalScatterFunction] + + // HACK for DoC to work properly within Queue: + @Output + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var intervalSampleOut: File = new File(DoC_output.getPath + DOC_OUTPUT_SUFFIX) + + override def commandLine = super.commandLine + + " --omitDepthOutputAtEachBase" + + " --omitLocusTable" + + " --minBaseQuality 0" + + " --minMappingQuality " + minMappingQuality + + " --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS + + (if (!minCoverageCalcs.isEmpty) minCoverageCalcs.map(cov => " --summaryCoverageThreshold " + cov).reduceLeft(_ + "" + _) else "") + + " --includeRefNSites" + + " -o " + DoC_output + + override def 
shortDescription = "DoC: " + DoC_output + } + + class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, MAX_DEPTH: Int, minMappingQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, MAX_DEPTH: Int, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) { + // HACK for DoC to work properly within Queue: + @Output + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var outPrefix = DoC_output + + override def commandLine = super.commandLine.replaceAll(" --omitDepthOutputAtEachBase", "") + } + + def buildDoCgroups(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]], samplesPerJob: Int, outputBase: File): List[Group] = { + + def buildDoCgroupsHelper(samples: List[String], count: Int): List[Group] = (samples splitAt samplesPerJob) match { + case (Nil, y) => + return Nil + case (subsamples, remaining) => + return new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) :: buildDoCgroupsHelper(remaining, count + 1) + } + + return buildDoCgroupsHelper(samples, 0) + } + + // A group has a list of samples and bam files to use for DoC + class Group(val name: String, val outputBase: File, val samples: List[String], val bams: List[File]) { + // getName() just includes the file name WITHOUT the path: + val groupOutputName = name + "." + outputBase.getName + + // Comment this out, so that when jobs are scattered in DoC class below, they do not scatter into outputs at directories that don't exist!!! 
: + //def DoC_output = new File(outputBase.getParentFile(), groupOutputName) + + def DoC_output = new File(groupOutputName) + + override def toString(): String = String.format("[Group %s [%s] with samples %s against bams %s]", name, DoC_output, samples, bams) + } + + class MergeGATKdepths(DoCsToCombine: List[File], outFile: String, columnSuffix: String, xhmmExec: File, sampleIDsMap: String, sampleIDsMapFromColumn: Int, sampleIDsMapToColumn: Int, rdPrecisionArg: Option[Int], outputTargetsBySamples: Boolean) extends CommandLineFunction { + @Input(doc = "") + var inputDoCfiles: List[File] = DoCsToCombine + + @Output + val mergedDoC: File = new File(outFile) + var command: String = + xhmmExec + " --mergeGATKdepths" + + inputDoCfiles.map(input => " --GATKdepths " + input).reduceLeft(_ + "" + _) + + " --columnSuffix " + columnSuffix + + " -o " + mergedDoC + if (sampleIDsMap != "") + command += " --sampleIDmap " + sampleIDsMap + " --fromID " + sampleIDsMapFromColumn + " --toID " + sampleIDsMapToColumn + rdPrecisionArg match { + case Some(rdPrecision) => { + command += " --rdPrecision " + rdPrecision + } + case None => {} + } + if (outputTargetsBySamples) + command += " --outputTargetsBySamples" + + def commandLine = command + + override def description = "Combines DoC outputs for multiple samples (at same loci): " + command + } + + class PrepareTargets(intervalsIn: List[File], outIntervals: String, val xhmmExec: File, val referenceFile: File) extends CommandLineFunction { + @Input(doc = "List of files containing targeted intervals to be prepared and merged") + var inIntervals: List[File] = intervalsIn + + @Output(doc = "The merged intervals file to write to") + var out: File = new File(outIntervals) + + var command: String = + xhmmExec + " --prepareTargets" + + " -F " + referenceFile + + inIntervals.map(intervFile => " --targets " + intervFile).reduceLeft(_ + "" + _) + + " --mergedTargets " + out + + def commandLine = command + + override def description = "Sort all 
target intervals, merge overlapping ones, and print the resulting interval list: " + command + } +} From 9af4b34fd8a45d4bd561f29dc337f5676a57e21b Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Fri, 26 Oct 2012 01:21:05 -0400 Subject: [PATCH 04/49] Changed @Input to @Argument for non-File types --- .../queue/qscripts/CNV/xhmmCNVpipeline.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala index 362337c84..8db089484 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -33,30 +33,30 @@ class xhmmCNVpipeline extends QScript { @Argument(shortName = "L", doc = "Intervals", required = false) var intervals: File = _ - @Input(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) + @Argument(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) var scatterCountInput = 0 - @Input(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) + @Argument(doc = "Samples to run together for DoC. 
By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) var samplesPerJob = 1 @Output(doc = "Base name for files to output", shortName = "o", required = true) var outputBase: File = _ - @Input(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) + @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) var MAX_DEPTH = 20000 @Hidden - @Input(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) + @Argument(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) var NUM_BINS = 200 @Hidden - @Input(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) + @Argument(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) var START_BIN = 1 - @Input(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) + @Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) var minMappingQuality = 0 - @Input(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) + @Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) var wholeMatrixMemory = -1 @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false) @@ -496,4 +496,4 @@ class xhmmCNVpipeline extends QScript { override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command } -} \ No newline at end of file +} From c8e17a7adf410aa540f6e51091160f56bd4231e2 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 30 Oct 2012 13:57:54 -0400 Subject: [PATCH 05/49] totally experimental UG feature, to be removed --- 
.../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 3c4a97ec1..d9b46ad36 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,6 +190,9 @@ public class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); + else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) + results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + } } } From 843384e43539eb9d41f5449cb2408fe3dd52cbb9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Nov 2012 11:47:09 -0500 Subject: [PATCH 06/49] Rename hg19 files in bundle to b37 since that's what they are --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 24ab50451..dc6cae197 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -147,13 +147,13 @@ class GATKResourcesBundle extends QScript { // // example call set for wiki tutorial // - addResource(new 
Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf", + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf", "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) // // Test BAM file, specific to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam", + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.b37.20.bam", "IGNORE", b37, false, false)) // From 89bbe73a43c6e6ebb30ee564f7b04a432139f716 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 14 Nov 2012 14:39:04 -0500 Subject: [PATCH 07/49] Commenting out CMI pipeline test that wasn't meant to be in GATK repository (why was this merged??) --- .../gatk/walkers/annotator/VariantAnnotatorEngine.java | 6 +++++- .../gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index ee4f77752..725097ddc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -277,8 +277,12 @@ public class VariantAnnotatorEngine { if ( expression.fieldName.equals("ID") ) { if ( vc.hasID() ) infoAnnotations.put(expression.fullName, vc.getID()); + } else if (expression.fieldName.equals("ALT")) { + infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString()); + } else if ( vc.hasAttribute(expression.fieldName) ) { - infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + 
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 879a46ab0..22ed95365 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,8 +190,8 @@ public class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); - // else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) - // results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) + results.add(generateEmptyContext(tracker, refContext, null, rawContext)); } } From a68e6810c90711d88d2a829d27818034f721eb12 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 14 Nov 2012 14:45:15 -0500 Subject: [PATCH 08/49] Back off experimental code that escaped last commit, not for general use yet --- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 22ed95365..80bc04845 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,8 +190,9 @@ public 
class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); - else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) - results.add(generateEmptyContext(tracker, refContext, null, rawContext)); +// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set +// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) +// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); } } From b70fd4a2426a21f375776c1de54540a562539423 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 14 Nov 2012 11:08:48 -0500 Subject: [PATCH 09/49] Initial testing of the Active Region Traversal contract - TODO: many more tests and test cases --- .../traversals/TraverseActiveRegions.java | 5 +- .../utils/activeregion/ActivityProfile.java | 5 + .../traversals/TraverseActiveRegionsTest.java | 126 ++++++++++++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index a2c37944a..4fe83f331 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -34,6 +34,9 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); + // package access for unit testing + ActivityProfile profile; + @Override public 
String getTraversalUnits() { return "active regions"; @@ -53,7 +56,7 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index e96eb843d..38cfbb38d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -103,6 +103,11 @@ public class ActivityProfile { isActiveList.add(result); } + // for unit testing + public List getActiveList() { + return isActiveList; + } + public int size() { return isActiveList.size(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java new file mode 100644 index 000000000..8740a8b68 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -0,0 +1,126 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.testng.Assert; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; +import 
org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.executive.WindowMaker; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. 
+ * User: thibault + * Date: 11/13/12 + * Time: 2:47 PM + */ +public class TraverseActiveRegionsTest extends BaseTest { + + private class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + + public DummyActiveRegionWalker() { + this.prob = 1.0; + } + + @Override + public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return new ActivityProfileResult(ref.getLocus(), prob); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + private final TraverseActiveRegions t = new TraverseActiveRegions(); + + private IndexedFastaSequenceFile reference; + private GenomeLocParser genomeLocParser; + private ActiveRegionWalker walker; + + @BeforeClass + private void init() throws FileNotFoundException { + reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); + SAMSequenceDictionary dictionary = reference.getSequenceDictionary(); + genomeLocParser = new GenomeLocParser(dictionary); + } + + @Test + public void testAllIntervalsSeen() throws Exception { + List intervals = new ArrayList(); + GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1); + intervals.add(interval); + + LocusShardDataProvider dataProvider = createDataProvider(intervals); + + t.traverse(walker, dataProvider, 0); + + boolean allGenomeLocsSeen = true; + for (GenomeLoc loc : intervals) { + boolean thisGenomeLocSeen = false; + for (ActivityProfileResult active : t.profile.getActiveList()) { + if (loc.equals(active.getLoc())) { + thisGenomeLocSeen = true; + break; + } + } + if (!thisGenomeLocSeen) { + allGenomeLocsSeen = false; + break; + } + } + + Assert.assertTrue(allGenomeLocsSeen, "Some intervals missing from activity profile"); + } + + private LocusShardDataProvider 
createDataProvider(List intervals) { + walker = new DummyActiveRegionWalker(); + + StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList()); + Shard shard = new MockLocusShard(genomeLocParser, intervals); + WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + //engine.setReferenceDataSource(reference); + engine.setGenomeLocParser(genomeLocParser); + t.initialize(engine); + + return new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList()); + } +} From 855a68ae39c1f31623f9561af4f9e2b4e85a14ab Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 08:06:58 -0500 Subject: [PATCH 10/49] Testing out the new github-hosted repos. Please ignore. --- testfile | 1 + 1 file changed, 1 insertion(+) create mode 100644 testfile diff --git a/testfile b/testfile new file mode 100644 index 000000000..6de7b8c69 --- /dev/null +++ b/testfile @@ -0,0 +1 @@ +This is a test file. From 2a16d6fa55140ff6835c57a737586fbb18d0452b Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 08:07:19 -0500 Subject: [PATCH 11/49] Revert "Testing out the new github-hosted repos. Please ignore." This reverts commit b6bf66cd088754e7fd3d5f105ca8b2551237f183. --- testfile | 1 - 1 file changed, 1 deletion(-) delete mode 100644 testfile diff --git a/testfile b/testfile deleted file mode 100644 index 6de7b8c69..000000000 --- a/testfile +++ /dev/null @@ -1 +0,0 @@ -This is a test file. 
From 78ce822b6f56fc5c6cc43be77f0faa47fbeabba6 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Nov 2012 09:07:04 -0500 Subject: [PATCH 12/49] Protect against NPE when using non-GATK reports for inputs expecting valid GATK reports --- .../broadinstitute/sting/gatk/report/GATKReportVersion.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java index b51fb17f0..1079d9b91 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java @@ -80,6 +80,9 @@ public enum GATKReportVersion { * @return The version as an enum. */ public static GATKReportVersion fromHeader(String header) { + if ( header == null ) + throw new UserException.BadInput("The GATK report has no version specified in the header"); + if (header.startsWith("##:GATKReport.v0.1 ")) return GATKReportVersion.V0_1; From ff180a8e02eaeffbc42226c789c6c6946affae68 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Nov 2012 09:09:57 -0500 Subject: [PATCH 13/49] Significant refactoring of the Haplotype Caller to handle problems with GGA. The main fix is that we now maintain a mapping from 'original' allele to 'Smith-Waterman-based' allele so that we no longer need to do a (buggy) matching throughout the calling process. 
--- .../haplotypecaller/GenotypingEngine.java | 253 ++++++------------ .../haplotypecaller/HaplotypeCaller.java | 6 +- .../LikelihoodCalculationEngine.java | 78 +++--- .../LikelihoodCalculationEngineUnitTest.java | 5 +- .../broadinstitute/sting/utils/Haplotype.java | 16 +- 5 files changed, 144 insertions(+), 214 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index d91df82e2..9fc636efe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -31,7 +31,6 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -52,153 +51,17 @@ public class GenotypingEngine { noCall.add(Allele.NO_CALL); } - // WARN - // This function is the streamlined approach, currently not being used by default - // WARN - // WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code. 
- // WARN - @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser ) { - // Prepare the list of haplotype indices to genotype - final ArrayList allelesToGenotype = new ArrayList(); - - for( final Haplotype h : haplotypes ) { - allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) ); - } - final int numHaplotypes = haplotypes.size(); - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample); - int glIndex = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make()); - } - final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); - if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here - - // Prepare the list of haplotypes that need to be run through Smith-Waterman for 
output to VCF - final ArrayList haplotypesToRemove = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF - haplotypesToRemove.add(h); - } - } - haplotypes.removeAll(haplotypesToRemove); - - if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { - final List>>> returnVCs = new ArrayList>>>(); - // set up the default 1-to-1 haplotype mapping object - final HashMap> haplotypeMapping = new HashMap>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.put(call.getAllele(h.getBases()), list); - } - returnVCs.add( new Pair>>(call,haplotypeMapping) ); - return returnVCs; - } - - final ArrayList>>> returnCalls = new ArrayList>>>(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); - int count = 0; - if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } - for( final Haplotype h : haplotypes ) { - if( DEBUG ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); - } - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - startPosKeySet.addAll(h.getEventMap().keySet()); - } - - // Create the VC merge priority list - final ArrayList priorityList = new ArrayList(); - for( int iii = 0; iii < haplotypes.size(); iii++ ) { - priorityList.add("HC" + iii); - } - - // Walk along each position in the key set and create each event to be outputted - for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { - final ArrayList eventsAtThisLoc = new ArrayList(); - for( final Haplotype h : haplotypes ) { - final 
HashMap eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - } - } - - // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); - - // Merge the event to find a common reference representation - final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); - - final HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper - } - - if( DEBUG ) { - System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - //System.out.println("Event/haplotype allele mapping = " + alleleMapper); - } - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final int myNumHaplotypes = alleleMapper.size(); - final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); - int glIndex = 0; - for( int iii = 0; iii < myNumHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = 
haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - - // using the allele mapping object translate the haplotype allele into the event allele - final Genotype g = new GenotypeBuilder(sample) - .alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes)) - .phased(loc != startPosKeySet.first()) - .PL(genotypeLikelihoods).make(); - myGenotypes.add(g); - } - returnCalls.add( new Pair>>( - new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) ); - } - } - return returnCalls; - } - // BUGBUG: Create a class to hold this complicated return type @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final ArrayList activeAllelesToGenotype ) { + public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { - final ArrayList>>> returnCalls = new ArrayList>>>(); + final ArrayList>>> returnCalls = new ArrayList>>>(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); @@ -261,7 +124,15 @@ public class GenotypingEngine { if( eventsAtThisLoc.isEmpty() ) { continue; } // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + Map> alleleMapper = 
createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + final ArrayList alleleOrdering = new ArrayList(alleleMapper.size()); + alleleOrdering.add(refAllele); + for ( final Allele allele : alleleMapper.keySet() ) { + if ( !refAllele.equals(allele) ) + alleleOrdering.add(allele); + } // Sanity check the priority list for( final VariantContext vc : eventsAtThisLoc ) { @@ -283,12 +154,6 @@ public class GenotypingEngine { final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } - HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper - } - if( DEBUG ) { System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); //System.out.println("Event/haplotype allele mapping = " + alleleMapper); @@ -299,7 +164,7 @@ public class GenotypingEngine { for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples final int numHaplotypes = alleleMapper.size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { @@ -313,23 +178,23 @@ public class GenotypingEngine { if( call.getAlleles().size() != 
mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call); // also, need to update the allele -> haplotype mapping - final HashMap> alleleHashMapTrim = new HashMap>(); + final HashMap> alleleHashMapTrim = new HashMap>(); for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC - alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii))); + alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii))); } call = vcCallTrim; - alleleHashMap = alleleHashMapTrim; + alleleMapper = alleleHashMapTrim; } - returnCalls.add( new Pair>>(call, alleleHashMap) ); + returnCalls.add( new Pair>>(call, alleleMapper) ); } } } return returnCalls; } - protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes ) { + protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final ArrayList haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { for( final VariantContext vc : h.getEventMap().values() ) { @@ -348,7 +213,7 @@ public class GenotypingEngine { haplotypes.removeAll(haplotypesToRemove); } - protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { + protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { final int MAX_SIZE_TO_COMBINE = 15; final double MERGE_EVENTS_R2_THRESHOLD = 0.95; if( startPosKeySet.size() <= 1 ) { return; } @@ -395,7 +260,9 @@ public class GenotypingEngine { final ArrayList haplotypeList = new ArrayList(); haplotypeList.add(h); for( final String sample : haplotypes.get(0).getSampleKeySet() ) { - 
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0]; + final HashSet sampleSet = new HashSet(1); + sampleSet.add(sample); + final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0]; if( thisHapVC == null ) { if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } @@ -489,37 +356,71 @@ public class GenotypingEngine { @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static ArrayList> createAlleleMapper( final int loc, final ArrayList eventsAtThisLoc, final ArrayList haplotypes ) { - final ArrayList> alleleMapper = new ArrayList>(); - final ArrayList refList = new ArrayList(); + protected static Map> createAlleleMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + + final Map> alleleMapper = new HashMap>(eventsAtThisLoc.size()+1); for( final Haplotype h : haplotypes ) { if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - refList.add(h); + if ( !alleleMapper.containsKey(refAllele) ) + alleleMapper.put(refAllele, new ArrayList()); + alleleMapper.get(refAllele).add(h); + } else if ( h.isArtificialHaplotype() ) { + if ( !alleleMapper.containsKey(h.getArtificialAllele()) ) + alleleMapper.put(h.getArtificialAllele(), new ArrayList()); + alleleMapper.get(h.getArtificialAllele()).add(h); } else { - boolean foundInEventList = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - foundInEventList = true; + final Allele altAllele = vcAtThisLoc.getAlternateAllele(0); + if ( 
!alleleMapper.containsKey(altAllele) ) + alleleMapper.put(altAllele, new ArrayList()); + alleleMapper.get(altAllele).add(h); + break; } } - if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype - refList.add(h); - } } } - alleleMapper.add(refList); - for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { - final ArrayList list = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - list.add(h); + + for( final Haplotype h : haplotypes ) { + if ( h.getEventMap().get(loc) == null || h.isArtificialHaplotype() ) + continue; + + Allele matchingAllele = null; + for ( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { + if ( alleleToTest.getKey().equals(refAllele) ) + continue; + + final Haplotype artificialHaplotype = alleleToTest.getValue().get(0); + if ( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap()) ) { + matchingAllele = alleleToTest.getKey(); + break; } } - alleleMapper.add(list); + + if ( matchingAllele == null ) + matchingAllele = refAllele; + alleleMapper.get(matchingAllele).add(h); } + return alleleMapper; } + protected static boolean isSubSetOf(final Map subset, final Map superset) { + + for ( final Map.Entry fromSubset : subset.entrySet() ) { + final VariantContext fromSuperset = superset.get(fromSubset.getKey()); + if ( fromSuperset == null ) + return false; + + if ( !fromSuperset.hasAlternateAllele(fromSubset.getValue().getAlternateAllele(0)) ) + return false; + } + + return true; + } + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) { if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } diff 
--git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index a185ba6af..2b739a321 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -421,10 +421,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes ); - for( final Pair>> callResult : - ( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - ? 
genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() ) - : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { + for( final Pair>> callResult : + genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a0924623b..543987e74 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -148,34 +148,21 @@ public class LikelihoodCalculationEngine { return Math.min(b1.length, b2.length); } - @Requires({"haplotypes.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList haplotypes, final String sample ) { - // set up the default 1-to-1 haplotype mapping object, BUGBUG: target 
for future optimization? - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping ); - } - // This function takes just a single sample and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map> haplotypeMapping, final List alleleOrdering ) { final TreeSet sampleSet = new TreeSet(); sampleSet.add(sample); - return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping); + return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering); } // This function takes a set of samples to pool over and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final Map> haplotypeMapping, final List alleleOrdering ) { - final int numHaplotypes = haplotypeMapping.size(); + final int numHaplotypes = alleleOrdering.size(); final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; for( int iii = 0; iii < numHaplotypes; iii++ ) { Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); @@ -184,9 +171,9 @@ public class LikelihoodCalculationEngine { // compute the diploid haplotype likelihoods // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int 
iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) { - for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { + for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) { double haplotypeLikelihood = 0.0; for( final String sample : samples ) { final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); @@ -200,12 +187,48 @@ public class LikelihoodCalculationEngine { } haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); } - } + } } } // normalize the diploid likelihoods matrix - return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + } + + // This function takes a set of samples to pool over and a haplotypeMapping + @Requires({"haplotypeMapping.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final List haplotypeList ) { + + final int numHaplotypes = haplotypeList.size(); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); + } + + // compute the diploid haplotype likelihoods + // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code + for( int iii = 0; iii < numHaplotypes; iii++ ) { + final Haplotype iii_haplotype = haplotypeList.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Haplotype jjj_haplotype = haplotypeList.get(jjj); + double haplotypeLikelihood = 0.0; + for( final String 
sample : samples ) { + final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample); + final int[] readCounts_iii = iii_haplotype.getReadCounts(sample); + final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample); + for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. + haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); + } + } + haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); + } + } + + // normalize the diploid likelihoods matrix + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); } @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) @@ -296,14 +319,7 @@ public class LikelihoodCalculationEngine { final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - // set up the default 1-to-1 haplotype mapping object - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypeMapping ); // all samples pooled together + final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together int hap1 = 0; int hap2 = 0; @@ -347,7 +363,7 @@ public class LikelihoodCalculationEngine { public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> 
perSampleReadList, final HashMap> perSampleFilteredReadList, - final Pair>> call, + final Pair>> call, final double downsamplingFraction, final PrintStream downsamplingLog ) { final Map returnMap = new HashMap(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java index e82946690..19ced9f42 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -102,7 +101,9 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { haplotypes.add(haplotype); } } - return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample"); + final HashSet sampleSet = new HashSet(1); + sampleSet.add("myTestSample"); + return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index b30d47074..6de15e18b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -49,6 +49,7 @@ public class Haplotype { private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; + private Allele artificialAllele = null; /** * Create a simple consensus sequence with provided bases and a 
uniform quality over all bases of qual @@ -71,6 +72,11 @@ public class Haplotype { this(bases, 0); } + public Haplotype( final byte[] bases, final Allele artificialAllele ) { + this(bases, 0); + this.artificialAllele = artificialAllele; + } + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; @@ -171,6 +177,14 @@ public class Haplotype { this.cigar = cigar; } + public boolean isArtificialHaplotype() { + return artificialAllele != null; + } + + public Allele getArtificialAllele() { + return artificialAllele; + } + @Requires({"refInsertLocation >= 0"}) public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates @@ -182,7 +196,7 @@ public class Haplotype { newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases); + return new Haplotype(newHaplotypeBases, altAllele); } public static class HaplotypeBaseComparator implements Comparator, Serializable { From 9fc63efc307475f6b36ffad7e425b90bcab817cf Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 09:34:15 -0500 Subject: [PATCH 14/49] Second test of new repos. Please ignore. 
--- testfile | 1 + 1 file changed, 1 insertion(+) create mode 100644 testfile diff --git a/testfile b/testfile new file mode 100644 index 000000000..524acfffa --- /dev/null +++ b/testfile @@ -0,0 +1 @@ +Test file From e1a5c3ce7a97e8eb92c64cc144340c81d01c9903 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 09:34:32 -0500 Subject: [PATCH 15/49] Revert "Second test of new repos. Please ignore." This reverts commit 077532d870ddf53ec514b98f14534ca7dbf55331. --- testfile | 1 - 1 file changed, 1 deletion(-) delete mode 100644 testfile diff --git a/testfile b/testfile deleted file mode 100644 index 524acfffa..000000000 --- a/testfile +++ /dev/null @@ -1 +0,0 @@ -Test file From f0b8a0228fef45f23478c1a12be1cd58633c0873 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Nov 2012 09:57:55 -0500 Subject: [PATCH 17/49] Quick fix for HC refactoring: when copying over Haplotype objects, make sure to copy over the artificial allele used to create it too. --- .../gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java | 2 ++ public/java/src/org/broadinstitute/sting/utils/Haplotype.java | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index fd46e4e69..33fa49543 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -369,6 +369,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) ); + if ( haplotype.isArtificialHaplotype() ) + 
h.setArtificialAllele(haplotype.getArtificialAllele()); h.leftBreakPoint = leftBreakPoint; h.rightBreakPoint = rightBreakPoint; if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 6de15e18b..af4e31698 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -185,6 +185,10 @@ public class Haplotype { return artificialAllele; } + public void setArtificialAllele(final Allele artificialAllele) { + this.artificialAllele = artificialAllele; + } + @Requires({"refInsertLocation >= 0"}) public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates From 937ac7290f7caa7f0cae996608f5a68358f10c09 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 20 Nov 2012 16:13:29 -0500 Subject: [PATCH 21/49] Lots more GGA fixes for the HC now that I understand what's going on internally. Integration tests pass except for the GGA test which I believe now produces better results. 
--- .../haplotypecaller/GenotypingEngine.java | 82 ++++++++++++------- .../LikelihoodCalculationEngine.java | 4 +- .../SimpleDeBruijnAssembler.java | 22 +++-- .../HaplotypeCallerIntegrationTest.java | 3 +- .../broadinstitute/sting/utils/Haplotype.java | 17 ++-- .../sting/utils/HaplotypeUnitTest.java | 2 +- 6 files changed, 87 insertions(+), 43 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 9fc636efe..beec8a92e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -62,6 +62,7 @@ public class GenotypingEngine { final List activeAllelesToGenotype ) { final ArrayList>>> returnCalls = new ArrayList>>>(); + final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); @@ -70,7 +71,7 @@ public class GenotypingEngine { for( final Haplotype h : haplotypes ) { // Walk along the alignment and turn any difference from the reference into an event h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); } + if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); } if( DEBUG ) { System.out.println( h.toString() ); System.out.println( "> Cigar = " + h.getCigar() ); @@ -80,10 +81,10 @@ public class GenotypingEngine { } cleanUpSymbolicUnassembledEvents( haplotypes ); - if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to 
create MNP and complex events by looking at LD structure + if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); } - if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode! + if( in_GGA_mode ) { for( final VariantContext compVC : activeAllelesToGenotype ) { startPosKeySet.add( compVC.getStart() ); } @@ -95,7 +96,7 @@ public class GenotypingEngine { final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view - if( activeAllelesToGenotype.isEmpty() ) { + if( !in_GGA_mode ) { for( final Haplotype h : haplotypes ) { final HashMap eventMap = h.getEventMap(); final VariantContext vc = eventMap.get(loc); @@ -129,9 +130,8 @@ public class GenotypingEngine { final Allele refAllele = eventsAtThisLoc.get(0).getReference(); final ArrayList alleleOrdering = new ArrayList(alleleMapper.size()); alleleOrdering.add(refAllele); - for ( final Allele allele : alleleMapper.keySet() ) { - if ( !refAllele.equals(allele) ) - alleleOrdering.add(allele); + for( final VariantContext vc : eventsAtThisLoc ) { + alleleOrdering.add(vc.getAlternateAllele(0)); } // Sanity check the priority list @@ -154,6 +154,16 @@ public class GenotypingEngine { final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } + // let's update the Allele keys in the mapper because they can change after merging when there are complex events + Map> updatedAlleleMapper = new HashMap>(alleleMapper.size()); 
+ for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) { + final Allele oldAllele = alleleOrdering.get(i); + final Allele newAllele = mergedVC.getAlleles().get(i); + updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele)); + alleleOrdering.set(i, newAllele); + } + alleleMapper = updatedAlleleMapper; + if( DEBUG ) { System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); //System.out.println("Event/haplotype allele mapping = " + alleleMapper); @@ -358,48 +368,48 @@ public class GenotypingEngine { @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) protected static Map> createAlleleMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { - final Allele refAllele = eventsAtThisLoc.get(0).getReference(); - final Map> alleleMapper = new HashMap>(eventsAtThisLoc.size()+1); + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + alleleMapper.put(refAllele, new ArrayList()); + for( final VariantContext vc : eventsAtThisLoc ) + alleleMapper.put(vc.getAlternateAllele(0), new ArrayList()); + + final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size()); for( final Haplotype h : haplotypes ) { if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - if ( !alleleMapper.containsKey(refAllele) ) - alleleMapper.put(refAllele, new ArrayList()); alleleMapper.get(refAllele).add(h); - } else if ( h.isArtificialHaplotype() ) { - if ( !alleleMapper.containsKey(h.getArtificialAllele()) ) - alleleMapper.put(h.getArtificialAllele(), new ArrayList()); + } else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { alleleMapper.get(h.getArtificialAllele()).add(h); } else { + boolean haplotypeIsDetermined = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - final Allele altAllele = 
vcAtThisLoc.getAlternateAllele(0); - if ( !alleleMapper.containsKey(altAllele) ) - alleleMapper.put(altAllele, new ArrayList()); - alleleMapper.get(altAllele).add(h); + alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h); + haplotypeIsDetermined = true; break; } } + + if( !haplotypeIsDetermined ) + undeterminedHaplotypes.add(h); } } - for( final Haplotype h : haplotypes ) { - if ( h.getEventMap().get(loc) == null || h.isArtificialHaplotype() ) - continue; - + for( final Haplotype h : undeterminedHaplotypes ) { Allele matchingAllele = null; - for ( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { - if ( alleleToTest.getKey().equals(refAllele) ) + for( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { + // don't test against the reference allele + if( alleleToTest.getKey().equals(refAllele) ) continue; final Haplotype artificialHaplotype = alleleToTest.getValue().get(0); - if ( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap()) ) { + if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) { matchingAllele = alleleToTest.getKey(); break; } } - if ( matchingAllele == null ) + if( matchingAllele == null ) matchingAllele = refAllele; alleleMapper.get(matchingAllele).add(h); } @@ -407,20 +417,36 @@ public class GenotypingEngine { return alleleMapper; } - protected static boolean isSubSetOf(final Map subset, final Map superset) { + protected static boolean isSubSetOf(final Map subset, final Map superset, final boolean resolveSupersetToSubset) { for ( final Map.Entry fromSubset : subset.entrySet() ) { final VariantContext fromSuperset = superset.get(fromSubset.getKey()); if ( fromSuperset == null ) return false; - if ( !fromSuperset.hasAlternateAllele(fromSubset.getValue().getAlternateAllele(0)) ) + List supersetAlleles = fromSuperset.getAlternateAlleles(); + if ( resolveSupersetToSubset ) + supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles); 
+ + if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) ) return false; } return true; } + private static List resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List currentAlleles) { + if ( targetReference.length() <= actualReference.length() ) + return currentAlleles; + + final List newAlleles = new ArrayList(currentAlleles.size()); + final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length()); + for ( final Allele a : currentAlleles ) { + newAlleles.add(Allele.extend(a, extraBases)); + } + return newAlleles; + } + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) { if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 543987e74..304f8d5cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -196,8 +196,8 @@ public class LikelihoodCalculationEngine { } // This function takes a set of samples to pool over and a haplotypeMapping - @Requires({"haplotypeMapping.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) + @Requires({"haplotypeList.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"}) public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final List haplotypeList ) 
{ final int numHaplotypes = haplotypeList.size(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index 33fa49543..4f072d720 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -278,9 +278,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength(); - for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype + // for GGA mode, add the desired allele into the haplotype + for( final VariantContext compVC : activeAllelesToGenotype ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()); + final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { return returnHaplotypes; //throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype); @@ -290,15 +291,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final DefaultDirectedGraph graph : graphs ) { for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, 
NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { - if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present + + // for GGA mode, add the desired allele into the haplotype if it isn't already present + if( !activeAllelesToGenotype.isEmpty() ) { final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); - if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) { + + // This if statement used to additionally have: + // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)" + // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto + // a haplotype that already contains a 1bp insertion (so practically it is reference but + // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). 
+ if( vcOnHaplotype == null ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); + addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); } } } @@ -370,7 +380,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) ); if ( haplotype.isArtificialHaplotype() ) - h.setArtificialAllele(haplotype.getArtificialAllele()); + h.setArtificialAllele(haplotype.getArtificialAllele(), haplotype.getArtificialAllelePosition()); h.leftBreakPoint = leftBreakPoint; h.rightBreakPoint = rightBreakPoint; if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6828dbcb5..a57462d1d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -29,9 +29,10 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f"); } + // TODO -- add more tests for GGA mode, 
especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "39da622b309597d7a0b082c8aa1748c9"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "f2d0309fdf50d5827e9c60ed0dd07e3f"); } private void HCTestComplexVariants(String bam, String args, String md5) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index af4e31698..30fdce75d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -50,7 +50,8 @@ public class Haplotype { public int leftBreakPoint = 0; public int rightBreakPoint = 0; private Allele artificialAllele = null; - + private int artificialAllelePosition = -1; + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * @@ -72,9 +73,10 @@ public class Haplotype { this(bases, 0); } - public Haplotype( final byte[] bases, final Allele artificialAllele ) { + protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) { this(bases, 0); this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; } public Haplotype( final byte[] bases, final GenomeLoc loc ) { @@ -185,12 +187,17 @@ public class Haplotype { return artificialAllele; } - public void setArtificialAllele(final Allele artificialAllele) { + public int getArtificialAllelePosition() { + return artificialAllelePosition; + } + + public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) { 
this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; } @Requires({"refInsertLocation >= 0"}) - public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { + public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype @@ -200,7 +207,7 @@ public class Haplotype { newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases, altAllele); + return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation); } public static class HaplotypeBaseComparator implements Comparator, Serializable { diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java index ddffb6e4c..13db1d39e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java @@ -159,7 +159,7 @@ public class HaplotypeUnitTest extends BaseTest { final VariantContext vc = new 
VariantContextBuilder().alleles(alleles).loc("1", loc, loc + h1refAllele.getBases().length - 1).make(); h.setAlignmentStartHapwrtRef(0); h.setCigar(cigar); - final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc); + final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc, vc.getStart()); final Haplotype h1expected = new Haplotype(newHap.getBytes()); Assert.assertEquals(h1, h1expected); } From cc7680e6010ab184752c9f4c20b96e055178478c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 4 Nov 2012 14:40:17 -0800 Subject: [PATCH 22/49] NA12878 knowledge base backed by MongoDB -- Idea is simply to create a persistent database of all TP/FP sites on chr20 in NA12878. Individual callsets can be imported, and a consensus algorithm is run over all callsets in the database to create a consensus collection, which can be used to assess NA12878 callsets for GATK and methods development -- Framework for representing simple VariantContexts and Genotypes in MongoDB, querying for records, and iterating over them in the GATK -- Not hooked up to Tribble, but could be done reasonably easily now (future TODO) -- Tools to import callsets, create consensus callsets, import and export reviews -- Scripts to reset the knowledge base and repopulate it with the standard data files (Eric will expand) -- Actually scales to all of chr20, includes AssessNA12878 that reads a VCF and itemizes it against the truth data set -- ImportCallset can load OMNI, HM3, CEU best practices, mills/devine sites and genotypes, properly marking sites as poly/mono/unk as well as TP/FP/UNK based on command line parameters -- Added shell scripts that start up a local mongo db, that connect to a local or BI hosted mongo for NA12878.db for debugging, and a setupNA12878db script that can load OMNI, HM3, CEU best practices, Mills/Devine into the db and then update the consensus. 
-- Reviewed sites can be exported to a VCF, and imported again, as a mechanism to safely store the only non-recoverable data from the Mongo DB. -- Created a NA12878DBWalker that manages the outer DB interaction, and that all MongoDB interacting walkers inherit from. Added a NA12878DBArgumentCollection.java consolating all of the common command line arguments (though strictly not necessary as all of this occurs in the root walker) UnitTests -- Can connect to a test knowledge base for development and unit testing -- PolymorphicStatus, TruthStatus, SiteIterator -- NA12878KBUnitTestBase provides simple utilities for connecting to the test mongo db, getting calls, etc -- MongoVariantContext tests creation, matching, and encoding -> writing -> read -> decoding from the mongodb AssessNA12878 -- Generic tool for comparing a NA12878 callset against the knowledge base. See http://gatkforums.broadinstitute.org/discussion/1848/using-the-na12878-knowledge-base for detailed documentation -- Performs trivial filtering on FS, MQ, QD for SNPs and non-SNPs to separate out variants likely to be filtered from those that are honest-to-goodness FPs Misc -- Ability to provide Description for Simplified GATK report --- .../sting/gatk/report/GATKReport.java | 13 ++++++- .../broadinstitute/sting/utils/GenomeLoc.java | 14 ++++++++ .../org/broadinstitute/sting/utils/Utils.java | 4 +++ .../sting/utils/codecs/vcf/VCFUtils.java | 34 +++++++++++++++++++ 4 files changed, 64 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 47bc48f81..6685ee12a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -271,7 +271,18 @@ public class GATKReport { * @return a simplified GATK report */ public static GATKReport newSimpleReport(final String tableName, final String... 
columns) { - GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report", columns.length); + return newSimpleReportWithDescription(tableName, "A simplified GATK table report", columns); + } + + /** + * @see #newSimpleReport(String, String...) but with a customized description + * @param tableName + * @param desc + * @param columns + * @return + */ + public static GATKReport newSimpleReportWithDescription(final String tableName, final String desc, final String... columns) { + GATKReportTable table = new GATKReportTable(tableName, desc, columns.length); for (String column : columns) { table.addColumn(column, ""); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 6df9c9f1d..4d2c26a79 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -315,6 +315,20 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() )); } + /** + * Tests whether this genome loc starts at the same position as that. + * + * i.e., do this and that have the same contig and the same start position + * + * @param that genome loc to compare to + * @return true if this and that have the same contig and the same start position + */ + @Requires("that != null") + public final boolean startsAt( GenomeLoc that ) { + int comparison = this.compareContigs(that); + return comparison == 0 && this.getStart() == that.getStart(); + } + /** * Tests whether any portion of this contig is before that contig. * @param that Other contig to test. 
diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b780d0966..1d12d6f8b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -293,6 +293,10 @@ public class Utils { } } + public static String join(final String separator, final T ... objects) { + return join(separator, Arrays.asList(objects)); + } + public static String dupString(char c, int nCopies) { char[] chars = new char[nCopies]; Arrays.fill(chars, c); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index be87e7306..a8aefb703 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -30,12 +30,17 @@ import net.sf.samtools.SAMSequenceRecord; import org.apache.commons.io.FilenameUtils; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodecHeader; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.*; /** @@ -317,4 +322,33 @@ public class VCFUtils { assembly = "hg19"; return assembly; } + + /** + * Read all of the VCF records from source into memory, returning the header and the VariantContexts + * + * @param source the file to read, must be in VCF4 format + * @return + * @throws IOException + */ + public static Pair> readVCF(final File source) throws 
IOException { + // read in the features + final List vcs = new ArrayList(); + final VCFCodec codec = new VCFCodec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(source)); + pbs.skip(header.getHeaderEnd()); + + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + + while ( ! pbs.isDone() ) { + final VariantContext vc = codec.decode(pbs); + if ( vc != null ) + vcs.add(vc); + } + + return new Pair>(vcfHeader, vcs); + } } \ No newline at end of file From ff87642a91c569982fed062d34cf1bf63caab63b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 20 Nov 2012 22:29:56 -0500 Subject: [PATCH 23/49] Enable cycle covariate unit tests --- .../sting/utils/recalibration/CycleCovariateUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java index c3d93b2cb..deb0931d6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java @@ -24,7 +24,7 @@ public class CycleCovariateUnitTest { covariate.initialize(RAC); } - @Test(enabled = false) + @Test(enabled = true) public void testSimpleCycles() { short readLength = 10; GATKSAMRecord read = ReadUtils.createRandomRead(readLength); From 72e2d569c540e758284d9304caf44bf2b2e6ca35 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 20 Nov 2012 22:41:57 -0500 Subject: [PATCH 24/49] The user can now set the maximum allowable cycle on the command-line with --maximum_cycle_value. This value is (now) enforced in the Cycle covariate and a User Error is thrown if the maximum value is passed (with a helpful error message). 
Added unit tests to cover this new functionality. --- .../bqsr/RecalibrationArgumentCollection.java | 18 ++++++++++--- .../covariates/CycleCovariate.java | 7 +++++- .../recalibration/CycleCovariateUnitTest.java | 25 ++++++++++++++++++- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index e5704a1e2..c64482151 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -102,13 +102,10 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) public boolean DO_NOT_USE_STANDARD_COVARIATES = false; - ///////////////////////////// - // Debugging-only Arguments - ///////////////////////////// /** * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. */ - @Hidden + @Advanced @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. 
Very unsafe and for expert users only.") public boolean RUN_WITHOUT_DBSNP = false; @@ -139,6 +136,13 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false) public int INDELS_CONTEXT_SIZE = 3; + /** + * The cycle covariate will generate an error if it encounters a cycle greater than this value. + * This argument is ignored if the Cycle covariate is not used. + */ + @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false) + public int MAXIMUM_CYCLE_VALUE = 500; + /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off) */ @@ -176,9 +180,15 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") public String BINARY_TAG_NAME = null; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + @Hidden @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; + @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. 
Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java index 5d0d94b69..a9b6c7152 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java @@ -49,7 +49,7 @@ import java.util.EnumSet; public class CycleCovariate implements StandardCovariate { - private static final int MAXIMUM_CYCLE_VALUE = 1000; + private int MAXIMUM_CYCLE_VALUE; private static final int CUSHION_FOR_INDELS = 4; private String default_platform = null; @@ -59,6 +59,8 @@ public class CycleCovariate implements StandardCovariate { // Initialize any member variables using the command-line arguments passed to the walkers @Override public void initialize(final RecalibrationArgumentCollection RAC) { + this.MAXIMUM_CYCLE_VALUE = RAC.MAXIMUM_CYCLE_VALUE; + if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM)) throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform."); @@ -88,6 +90,9 @@ public class CycleCovariate implements StandardCovariate { final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1; for (int i = 0; i < readLength; i++) { + if ( cycle > MAXIMUM_CYCLE_VALUE ) + throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle was detected in read " + read.getReadName() + ". Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)"); + final int substitutionKey = keyFromCycle(cycle); final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? 
-1 : substitutionKey; values.addCovariate(substitutionKey, indelKey, indelKey, i); diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java index deb0931d6..b73b1a311 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -53,9 +54,31 @@ public class CycleCovariateUnitTest { for (short i = 0; i < values.length; i++) { short actual = Short.decode(covariate.formatKey(values[i][0])); int expected = init + (increment * i); - // System.out.println(String.format("%d: %d, %d", i, actual, expected)); Assert.assertEquals(actual, expected); } } + @Test(enabled = true, expectedExceptions={UserException.class}) + public void testMoreThanMaxCycleFails() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } + + @Test(enabled = true) + public void testMaxCyclePasses() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + 
read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } } From c54fc94505e6d1b913081b8854d5063d5ed11593 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 20 Nov 2012 23:19:59 -0500 Subject: [PATCH 25/49] Protect against features that start off the end of the read (otherwise, Arrays.fill fails) --- .../gatk/walkers/bqsr/BaseRecalibrator.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 9506510a9..b415bb1f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -268,16 +268,25 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche } protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) { - final int BUFFER_SIZE = 0; final int readLength = read.getReadBases().length; final boolean[] knownSites = new boolean[readLength]; Arrays.fill(knownSites, false); for( final Feature f : features ) { int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? 
- if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; } + if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureStartOnRead = 0; + } + int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true); - if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; } - Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true); + if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureEndOnRead = readLength; + } + + if( featureStartOnRead > readLength ) { + featureStartOnRead = featureEndOnRead = readLength; + } + + Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true); } return knownSites; } From c8be7c3102d7764b0141665bc64955062da8c00a Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Wed, 21 Nov 2012 15:56:53 -0500 Subject: [PATCH 27/49] Keep SNPs and indels separately for batch merging; Add options to DepthOfCoverage to count fragments (to not double-count overlapping reads of same fragment); DepthOfCoverage should now support ReducedReads; Replace recusrion with loop in DoC/package.scala (for lists longer than 5000 elements) --- .../gatk/walkers/coverage/CoverageUtils.java | 114 +++++++++++++++--- .../walkers/coverage/DepthOfCoverage.java | 6 +- .../pileup/AbstractReadBackedPileup.java | 56 ++++++++- .../sting/utils/pileup/ReadBackedPileup.java | 17 +++ .../queue/qscripts/CNV/xhmmCNVpipeline.scala | 25 +++- .../sting/queue/util/DoC/package.scala | 26 ++-- .../sting/queue/util/VCF_BAM_utilities.scala | 27 ++--- 7 files changed, 215 insertions(+), 56 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index a41e55166..21532823b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -6,11 +6,10 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.pileup.PileupElement; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; +import java.util.*; /** * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl @@ -20,6 +19,21 @@ import java.util.Map; */ public class CoverageUtils { + public enum CountPileupType { + /** + * Count all reads independently (even if from the same fragment). + */ + COUNT_READS, + /** + * Count all fragments (even if the reads that compose the fragment are not consistent at that base). + */ + COUNT_FRAGMENTS, + /** + * Count all fragments (but only if the reads that compose the fragment are consistent at that base). 
+ */ + COUNT_FRAGMENTS_REQUIRE_SAME_BASE + } + /** * Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context * as an array of ints, indexed by the index fields of BaseUtils @@ -64,10 +78,10 @@ public class CoverageUtils { } public static Map> - getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, Collection types) { + getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType, Collection types) { Map> countsByIDByType = new HashMap>(); - Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ); + Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ,countType); for (DoCOutputType.Partition t : types ) { // iterate through the read group counts and build the type associations for ( Map.Entry readGroupCountEntry : countsByRG.entrySet() ) { @@ -95,31 +109,95 @@ public class CoverageUtils { } } - public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) { Map countsByRG = new HashMap(); - for ( PileupElement e : context.getBasePileup() ) { - if ( e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ) ) { - SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); - if ( ! 
countsByRG.keySet().contains(readGroup) ) { - countsByRG.put(readGroup,new int[6]); - updateCounts(countsByRG.get(readGroup),e); - } else { - updateCounts(countsByRG.get(readGroup),e); + + List countPileup = new LinkedList(); + FragmentCollection fpile; + + switch (countType) { + + case COUNT_READS: + for (PileupElement e : context.getBasePileup()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + break; + + case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + // iterate over all elements in fragment: + for (PileupElement e : overlappingPair) { + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) { + countPileup.add(e); // add the first passing element per fragment + break; + } + } } - } + break; + + case COUNT_FRAGMENTS_REQUIRE_SAME_BASE: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + PileupElement firstElem = null; + PileupElement addElem = null; + + // iterate over all elements in fragment: + for (PileupElement e : overlappingPair) { + if (firstElem == null) + firstElem = e; + else if (e.getBase() != firstElem.getBase()) { + addElem = null; + break; + } + + // will add the first passing element per base-consistent fragment: + if (addElem == null && countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + addElem = e; + } + + if (addElem != null) + countPileup.add(addElem); + } + break; + + default: + throw new UserException("Must use valid CountPileupType"); + } + + for (PileupElement e : 
countPileup) { + SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); + if (!countsByRG.keySet().contains(readGroup)) + countsByRG.put(readGroup, new int[6]); + + updateCounts(countsByRG.get(readGroup), e); } return countsByRG; } + private static boolean countElement(PileupElement e, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + return (e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() )); + } + private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.DELETION_INDEX]++; + counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount(); } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { - counts[BaseUtils.NO_CALL_INDEX]++; + counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount(); } else { try { - counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++; + counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); } catch (ArrayIndexOutOfBoundsException exc) { throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 44b0d74ca..fe9942662 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -129,11 +129,15 @@ public class DepthOfCoverage extends LocusWalker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc, 
perSampleElements).getOverlappingFragmentFilteredPileup(); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); filteredTracker.addElements(sample, pileup.pileupElementTracker); } return (RBP) createNewPileup(loc, filteredTracker); @@ -284,11 +297,16 @@ public abstract class AbstractReadBackedPileup sortedElements = new TreeSet(new Comparator() { + @Override + public int compare(PE element1, PE element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + for (PE pile : tracker) + sortedElements.add(pile); + + UnifiedPileupElementTracker sortedTracker = new UnifiedPileupElementTracker(); + for (PE pile : sortedElements) + sortedTracker.add(pile); + + return (RBP) createNewPileup(this.getLocation(), sortedTracker); + } + @Override public FragmentCollection toFragments() { return FragmentUtils.create(this); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index f15468840..be61bad99 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -60,6 +60,16 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getOverlappingFragmentFilteredPileup(); + /** + * Returns a new ReadBackedPileup where only one read from an overlapping read + * pair is retained. If discardDiscordant and the two reads in question disagree to their basecall, + * neither read is retained. 
Otherwise, the read with the higher + * quality (base or mapping, depending on baseQualNotMapQual) observation is retained + * + * @return the newly filtered pileup + */ + public ReadBackedPileup getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual); + /** * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy @@ -261,6 +271,13 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public byte[] getMappingQuals(); + /** + * Returns a new ReadBackedPileup that is sorted by start coordinate of the reads. + * + * @return + */ + public ReadBackedPileup getStartSortedPileup(); + /** * Converts this pileup into a FragmentCollection (see FragmentUtils for documentation) * @return diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala index 8db089484..c556913ab 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -8,6 +8,7 @@ import org.broadinstitute.sting.commandline.Hidden import java.io.{PrintStream, PrintWriter} import org.broadinstitute.sting.utils.text.XReadLines import collection.JavaConversions._ +import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils class xhmmCNVpipeline extends QScript { qscript => @@ -15,22 +16,22 @@ class xhmmCNVpipeline extends QScript { @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) var bams: File = _ - @Argument(doc = "gatk jar file", shortName = "J", required = true) + @Input(doc = "gatk jar file", shortName = "J", required = true) var gatkJarFile: File = _ - @Argument(doc = "xhmm executable file", 
shortName = "xhmmExec", required = true) + @Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true) var xhmmExec: File = _ - @Argument(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) + @Input(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) var pseqExec: File = _ @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true) var pseqSeqDB: String = _ - @Argument(shortName = "R", doc = "ref", required = true) + @Input(shortName = "R", doc = "ref", required = true) var referenceFile: File = _ - @Argument(shortName = "L", doc = "Intervals", required = false) + @Input(shortName = "L", doc = "Intervals", required = false) var intervals: File = _ @Argument(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) @@ -42,6 +43,15 @@ class xhmmCNVpipeline extends QScript { @Output(doc = "Base name for files to output", shortName = "o", required = true) var outputBase: File = _ + @Hidden + @Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false) + // TODO: change this to be the default once reads can be ordered properly for FragmentUtils.create(): + // + // Don't want to double-count (but also don't mind counting base-inconsistencies in overlap): + //var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS + // + var countType = CoverageUtils.CountPileupType.COUNT_READS + @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) var MAX_DEPTH = 20000 @@ -56,6 +66,9 @@ class xhmmCNVpipeline extends QScript { @Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) var minMappingQuality = 0 + @Argument(doc = "Minimum base quality to be counted in depth", shortName = "MBQ", required = false) + var minBaseQuality = 0 + @Argument(doc = 
"Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) var wholeMatrixMemory = -1 @@ -159,7 +172,7 @@ class xhmmCNVpipeline extends QScript { var docs: List[DoC] = List[DoC]() for (group <- groups) { Console.out.printf("Group is %s%n", group) - docs ::= new DoC(group.bams, group.DoC_output, MAX_DEPTH, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs + docs ::= new DoC(group.bams, group.DoC_output, countType, MAX_DEPTH, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs } addAll(docs) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala index f35db4aa3..2b19b0f8e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala @@ -6,9 +6,10 @@ import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFu import org.broadinstitute.sting.gatk.downsampling.DownsampleType import org.broadinstitute.sting.commandline.{Input, Gather, Output} import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils package object DoC { - class DoC(val bams: List[File], val DoC_output: File, val MAX_DEPTH: Int, val minMappingQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction { + class DoC(val bams: List[File], val DoC_output: File, val countType: CoverageUtils.CountPileupType, val MAX_DEPTH: Int, val minMappingQuality: Int, val minBaseQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction { val DOC_OUTPUT_SUFFIX: String 
= ".sample_interval_summary" // So that the output files of this DoC run get deleted once they're used further downstream: @@ -32,8 +33,9 @@ package object DoC { override def commandLine = super.commandLine + " --omitDepthOutputAtEachBase" + " --omitLocusTable" + - " --minBaseQuality 0" + " --minMappingQuality " + minMappingQuality + + " --minBaseQuality " + minBaseQuality + + optional("--countType", countType, spaceSeparated=true, escape=true, format="%s") + " --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS + (if (!minCoverageCalcs.isEmpty) minCoverageCalcs.map(cov => " --summaryCoverageThreshold " + cov).reduceLeft(_ + "" + _) else "") + " --includeRefNSites" + @@ -42,7 +44,7 @@ package object DoC { override def shortDescription = "DoC: " + DoC_output } - class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, MAX_DEPTH: Int, minMappingQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, MAX_DEPTH: Int, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) { + class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality: Int, minBaseQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) { // HACK for DoC to work properly within Queue: @Output @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) @@ -52,15 +54,21 @@ package object DoC { } def buildDoCgroups(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]], samplesPerJob: Int, outputBase: File): List[Group] = { + var l: List[Group] = Nil - def buildDoCgroupsHelper(samples: 
List[String], count: Int): List[Group] = (samples splitAt samplesPerJob) match { - case (Nil, y) => - return Nil - case (subsamples, remaining) => - return new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) :: buildDoCgroupsHelper(remaining, count + 1) + var remaining = samples + var subsamples: List[String] = Nil + var count = 1 + + while (!remaining.isEmpty) { + val splitRes = (remaining splitAt samplesPerJob) + subsamples = splitRes._1 + remaining = splitRes._2 + l ::= new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) + count = count + 1 } - return buildDoCgroupsHelper(samples, 0) + return l } // A group has a list of samples and bam files to use for DoC diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala b/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala index 1f18858e1..3fe867981 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala @@ -26,36 +26,31 @@ object VCF_BAM_utilities { case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list") } - def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match { - case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]] - - case x :: y => - val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBAMsForSample(y) - val bamSamples: List[String] = getSamplesInBAM(x) + def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = { + var m = scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]] + for (bam <- bams) { + val 
bamSamples: List[String] = getSamplesInBAM(bam) for (s <- bamSamples) { if (!m.contains(s)) m += s -> scala.collection.mutable.Set.empty[File] - m(s) = m(s) + x + m(s) += bam } + } return m } def findBAMsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = { + var s = scala.collection.mutable.Set.empty[File] - def findBAMsForSamplesHelper(samples: List[String]): scala.collection.mutable.Set[File] = samples match { - case Nil => scala.collection.mutable.Set.empty[File] - - case x :: y => - var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File] - if (sampleToBams.contains(x)) - bamsForSampleX = sampleToBams(x) - return bamsForSampleX ++ findBAMsForSamplesHelper(y) + for (sample <- samples) { + if (sampleToBams.contains(sample)) + s ++= sampleToBams(sample) } val l: List[File] = Nil - return l ++ findBAMsForSamplesHelper(samples) + return l ++ s } } From ed50814ccba7fe72f296c466e0abf64d7923c51d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 21 Nov 2012 15:57:05 -0500 Subject: [PATCH 28/49] Finally found a case where user errors were being masked behind other errors and could debug. It turns out that the checkForMaskedUserErrors() method needs to run recursively over all levels (calling exception.getCause()) to check for the original cause. 
--- .../sting/gatk/CommandLineGATK.java | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 0daad2c2b..4f9031329 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -118,17 +118,24 @@ public class CommandLineGATK extends CommandLineExecutable { public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; private static void checkForMaskedUserErrors(final Throwable t) { + // masked out of memory error + if ( t instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked user error + if ( t instanceof UserException || t instanceof TribbleException ) + exitSystemWithUserError(new UserException(t.getMessage())); + + // no message means no masked error final String message = t.getMessage(); if ( message == null ) return; - // we know what to do about the common "Too many open files" error + // too many open files error if ( message.contains("Too many open files") ) exitSystemWithUserError(new UserException.TooManyOpenFiles()); // malformed BAM looks like a SAM file - if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || - message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) exitSystemWithSamError(t); // can't close tribble index when writing @@ -138,12 +145,10 @@ public class CommandLineGATK extends CommandLineExecutable { // disk is full if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || 
t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) ) - exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - // masked out of memory error - if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError ) - exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked error wrapped in another one + if ( t.getCause() != null ) + checkForMaskedUserErrors(t.getCause()); } /** From a8c7edca053a5e0fbe360bd9bd3bc09472e9b532 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Wed, 21 Nov 2012 16:01:10 -0500 Subject: [PATCH 29/49] Fixed fragment handling in DepthOfCoverage --- .../sting/queue/qscripts/CNV/xhmmCNVpipeline.scala | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala index c556913ab..28a2534c0 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -45,12 +45,7 @@ class xhmmCNVpipeline extends QScript { @Hidden @Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false) - // TODO: change this to be the default once reads can be ordered properly for FragmentUtils.create(): - // - // Don't want to double-count (but also don't mind counting base-inconsistencies in overlap): - //var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS - // - var countType = CoverageUtils.CountPileupType.COUNT_READS + var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) var MAX_DEPTH = 20000 From 4f2229d399948200576695574ef46111f3ed542d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 21 Nov 2012 16:01:26 -0500 
Subject: [PATCH 30/49] As per the TODO message, I removed a check that was no longer necessary. Now ID is an allowable INFO field key. --- .../sting/utils/variantcontext/VariantContext.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 27a5b0c24..12f9cb20c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -184,9 +184,6 @@ public class VariantContext implements Feature { // to enable tribble integratio protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; - @Deprecated // ID is no longer stored in the attributes map - private final static String ID_KEY = "ID"; - public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); /** The location of this VariantContext */ @@ -287,10 +284,6 @@ public class VariantContext implements Feature { // to enable tribble integratio this.commonInfo = new CommonInfo(source, log10PError, filters, attributes); - // todo -- remove me when this check is no longer necessary - if ( this.commonInfo.hasAttribute(ID_KEY) ) - throw new IllegalArgumentException("Trying to create a VariantContext with a ID key. 
Please use provided constructor argument ID"); - if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } // we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles From c08b78274395c916db872417d4f17ebdacd39906 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 19 Nov 2012 14:39:06 -0500 Subject: [PATCH 31/49] Count isActive calls directly --- .../sting/gatk/traversals/TraverseActiveRegions.java | 5 +---- .../sting/utils/activeregion/ActivityProfile.java | 5 ----- .../gatk/traversals/TraverseActiveRegionsTest.java | 11 ++++++++--- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 4fe83f331..a2c37944a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -34,9 +34,6 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); - // package access for unit testing - ActivityProfile profile; - @Override public String getTraversalUnits() { return "active regions"; @@ -56,7 +53,7 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = new LinkedList(); - profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 38cfbb38d..e96eb843d 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -103,11 +103,6 @@ public class ActivityProfile { isActiveList.add(result); } - // for unit testing - public List getActiveList() { - return isActiveList; - } - public int size() { return isActiveList.size(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index 8740a8b68..edc818aca 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -41,6 +41,7 @@ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; + public List isActiveCalls = new ArrayList(); public DummyActiveRegionWalker() { this.prob = 1.0; @@ -48,6 +49,7 @@ public class TraverseActiveRegionsTest extends BaseTest { @Override public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); return new ActivityProfileResult(ref.getLocus(), prob); } @@ -71,7 +73,7 @@ public class TraverseActiveRegionsTest extends BaseTest { private IndexedFastaSequenceFile reference; private GenomeLocParser genomeLocParser; - private ActiveRegionWalker walker; + private DummyActiveRegionWalker walker; @BeforeClass private void init() throws FileNotFoundException { @@ -83,18 +85,21 @@ public class TraverseActiveRegionsTest extends BaseTest { @Test public void testAllIntervalsSeen() throws Exception { List intervals = new ArrayList(); + List activeIntervals = new ArrayList(); + GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1); intervals.add(interval); LocusShardDataProvider 
dataProvider = createDataProvider(intervals); t.traverse(walker, dataProvider, 0); + activeIntervals.addAll(walker.isActiveCalls); boolean allGenomeLocsSeen = true; for (GenomeLoc loc : intervals) { boolean thisGenomeLocSeen = false; - for (ActivityProfileResult active : t.profile.getActiveList()) { - if (loc.equals(active.getLoc())) { + for (GenomeLoc activeLoc : activeIntervals) { + if (loc.equals(activeLoc)) { thisGenomeLocSeen = true; break; } From e8defcb20dfcc50cec565fbb7fd2e93479162ae5 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 19 Nov 2012 14:44:00 -0500 Subject: [PATCH 32/49] Test multiple bases and intervals --- .../traversals/TraverseActiveRegionsTest.java | 65 +++++++++++++------ 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index edc818aca..d61da5a83 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -83,49 +83,74 @@ public class TraverseActiveRegionsTest extends BaseTest { } @Test - public void testAllIntervalsSeen() throws Exception { + public void testAllBasesSeenSuite() { List intervals = new ArrayList(); List activeIntervals = new ArrayList(); GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1); intervals.add(interval); + testAllBasesSeen(intervals); - LocusShardDataProvider dataProvider = createDataProvider(intervals); + interval = genomeLocParser.createGenomeLoc("1", 10, 20); + intervals.add(interval); + testAllBasesSeen(intervals); + } - t.traverse(walker, dataProvider, 0); - activeIntervals.addAll(walker.isActiveCalls); + public void testAllBasesSeen(List intervals) { + List activeIntervals = new ArrayList(); + for (LocusShardDataProvider dataProvider : 
createDataProviders(intervals)) { + t.traverse(walker, dataProvider, 0); + activeIntervals.addAll(walker.isActiveCalls); + } - boolean allGenomeLocsSeen = true; - for (GenomeLoc loc : intervals) { - boolean thisGenomeLocSeen = false; + boolean allBasesSeen = true; + for (GenomeLoc base : toBases(intervals)) { + boolean thisBaseSeen = false; for (GenomeLoc activeLoc : activeIntervals) { - if (loc.equals(activeLoc)) { - thisGenomeLocSeen = true; + if (base.equals(activeLoc)) { + thisBaseSeen = true; break; } } - if (!thisGenomeLocSeen) { - allGenomeLocsSeen = false; + if (!thisBaseSeen) { + allBasesSeen = false; break; } } - Assert.assertTrue(allGenomeLocsSeen, "Some intervals missing from activity profile"); + Assert.assertTrue(allBasesSeen, "Some intervals missing from activity profile"); } - private LocusShardDataProvider createDataProvider(List intervals) { + private List toBases(List intervals) { + List bases = new ArrayList(); + for (GenomeLoc interval : intervals) { + if (interval.size() == 1) + bases.add(interval); + else { + for (int location = interval.getStart(); location <= interval.getStop(); location++) { + bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); + } + } + } + return bases; + } + + private List createDataProviders(List intervals) { walker = new DummyActiveRegionWalker(); - StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList()); - Shard shard = new MockLocusShard(genomeLocParser, intervals); - WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - //engine.setReferenceDataSource(reference); engine.setGenomeLocParser(genomeLocParser); t.initialize(engine); - return new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList()); + StingSAMIterator iterator = 
ArtificialSAMUtils.createReadIterator(new ArrayList()); + Shard shard = new MockLocusShard(genomeLocParser, intervals); + + List providers = new ArrayList(); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + for (WindowMaker.WindowMakerIterator window : windowMaker) { + providers.add(new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + } + + return providers; } } From 3fa3b00f4abd3159b67a8013505606bf96db1a38 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 19 Nov 2012 14:45:19 -0500 Subject: [PATCH 33/49] Add ActiveRegion tests and refactor --- .../traversals/TraverseActiveRegionsTest.java | 156 +++++++++++++----- 1 file changed, 115 insertions(+), 41 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index d61da5a83..ce4d400b4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -28,20 +28,23 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * Created with IntelliJ IDEA. 
* User: thibault * Date: 11/13/12 * Time: 2:47 PM + * + * Test the Active Region Traversal Contract + * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract */ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; public List isActiveCalls = new ArrayList(); + public List mappedActiveRegions = new ArrayList(); public DummyActiveRegionWalker() { this.prob = 1.0; @@ -55,6 +58,7 @@ public class TraverseActiveRegionsTest extends BaseTest { @Override public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.add(activeRegion); return 0; } @@ -73,7 +77,6 @@ public class TraverseActiveRegionsTest extends BaseTest { private IndexedFastaSequenceFile reference; private GenomeLocParser genomeLocParser; - private DummyActiveRegionWalker walker; @BeforeClass private void init() throws FileNotFoundException { @@ -83,61 +86,133 @@ public class TraverseActiveRegionsTest extends BaseTest { } @Test - public void testAllBasesSeenSuite() { + public void testAllBasesSeen() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); List intervals = new ArrayList(); - List activeIntervals = new ArrayList(); - GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1); - intervals.add(interval); - testAllBasesSeen(intervals); + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 1)); + List activeIntervals = getIsActiveIntervals(walker, intervals); + // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call + verifyEqualIntervals(intervals, activeIntervals); - interval = genomeLocParser.createGenomeLoc("1", 10, 20); - intervals.add(interval); - testAllBasesSeen(intervals); + intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); + activeIntervals = getIsActiveIntervals(walker, intervals); + // Contract: Every genome position in the analysis 
interval(s) is processed by the walker's isActive() call + verifyEqualIntervals(intervals, activeIntervals); + + // TODO: more tests and edge cases } - public void testAllBasesSeen(List intervals) { + private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { List activeIntervals = new ArrayList(); for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) { t.traverse(walker, dataProvider, 0); activeIntervals.addAll(walker.isActiveCalls); } - boolean allBasesSeen = true; - for (GenomeLoc base : toBases(intervals)) { - boolean thisBaseSeen = false; - for (GenomeLoc activeLoc : activeIntervals) { - if (base.equals(activeLoc)) { - thisBaseSeen = true; - break; - } - } - if (!thisBaseSeen) { - allBasesSeen = false; - break; - } - } - - Assert.assertTrue(allBasesSeen, "Some intervals missing from activity profile"); + return activeIntervals; } - private List toBases(List intervals) { - List bases = new ArrayList(); + @Test + public void testActiveRegionCoverage() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + List intervals = new ArrayList(); + + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + + List activeRegions = getActiveRegions(walker, intervals); + verifyActiveRegionCoverage(intervals, activeRegions); + + // TODO: more tests and edge cases + } + + private void verifyActiveRegionCoverage(List intervals, List activeRegions) { + List intervalStarts = new ArrayList(); + List intervalStops = new ArrayList(); + for (GenomeLoc interval : intervals) { - if (interval.size() == 1) - bases.add(interval); - else { - for (int location = interval.getStart(); location <= interval.getStop(); location++) { - bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); - } - } + intervalStarts.add(interval.getStartLocation()); + 
intervalStops.add(interval.getStopLocation()); } + + Map baseRegionMap = new HashMap(); + + for (ActiveRegion activeRegion : activeRegions) { + for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { + // Contract: Regions do not overlap + Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); + baseRegionMap.put(activeLoc, activeRegion); + } + + GenomeLoc start = activeRegion.getLocation().getStartLocation(); + if (intervalStarts.contains(start)) + intervalStarts.remove(start); + + GenomeLoc stop = activeRegion.getLocation().getStopLocation(); + if (intervalStops.contains(stop)) + intervalStops.remove(stop); + } + + for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { + // Contract: Each location in the interval(s) is in exactly one region + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); + baseRegionMap.remove(baseLoc); + } + + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); + + // Contract: All explicit interval boundaries must also be region boundaries + Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); + Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); + } + + private List getActiveRegions(DummyActiveRegionWalker walker, List intervals) { + for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) + t.traverse(walker, dataProvider, 0); + + return walker.mappedActiveRegions; + } + + private Collection toSingleBaseLocs(GenomeLoc interval) { + List bases = new ArrayList(); + if (interval.size() == 1) + 
bases.add(interval); + else { + for (int location = interval.getStart(); location <= interval.getStop(); location++) + bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); + } + return bases; } - private List createDataProviders(List intervals) { - walker = new DummyActiveRegionWalker(); + private Collection toSingleBaseLocs(List intervals) { + Set bases = new TreeSet(); // for sorting and uniqueness + for (GenomeLoc interval : intervals) + bases.addAll(toSingleBaseLocs(interval)); + return bases; + } + + private void verifyEqualIntervals(List aIntervals, List bIntervals) { + Collection aBases = toSingleBaseLocs(aIntervals); + Collection bBases = toSingleBaseLocs(bIntervals); + + Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); + + Iterator aIter = aBases.iterator(); + Iterator bIter = bBases.iterator(); + while (aIter.hasNext() && bIter.hasNext()) { + GenomeLoc aLoc = aIter.next(); + GenomeLoc bLoc = bIter.next(); + Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. 
" + bLoc); + } + } + + private List createDataProviders(List intervals) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); t.initialize(engine); @@ -146,8 +221,7 @@ public class TraverseActiveRegionsTest extends BaseTest { Shard shard = new MockLocusShard(genomeLocParser, intervals); List providers = new ArrayList(); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - for (WindowMaker.WindowMakerIterator window : windowMaker) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, iterator, shard.getGenomeLocs())) { providers.add(new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList())); } From 3ad9128800922ab40aeb5504ff5b0e5a6402e591 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Tue, 20 Nov 2012 13:11:24 -0500 Subject: [PATCH 34/49] Add some reads - Move intervals and reads to init - Update intervals and reads --- .../traversals/TraverseActiveRegionsTest.java | 73 ++++++++++++++----- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index ce4d400b4..2780b7421 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -1,9 +1,10 @@ package org.broadinstitute.sting.gatk.traversals; -import org.testng.Assert; +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import 
net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -22,6 +23,7 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -43,8 +45,8 @@ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; - public List isActiveCalls = new ArrayList(); - public List mappedActiveRegions = new ArrayList(); + protected List isActiveCalls = new ArrayList(); + protected List mappedActiveRegions = new ArrayList(); public DummyActiveRegionWalker() { this.prob = 1.0; @@ -76,30 +78,46 @@ public class TraverseActiveRegionsTest extends BaseTest { private final TraverseActiveRegions t = new TraverseActiveRegions(); private IndexedFastaSequenceFile reference; + private SAMSequenceDictionary dictionary; private GenomeLocParser genomeLocParser; + private List intervals; + private List reads; + @BeforeClass private void init() throws FileNotFoundException { reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); - SAMSequenceDictionary dictionary = reference.getSequenceDictionary(); + dictionary = reference.getSequenceDictionary(); genomeLocParser = new GenomeLocParser(dictionary); + + intervals = new ArrayList(); + intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + 
intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + // TODO: this fails! + //intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 20000)); + intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); + + reads = new ArrayList(); + reads.add(buildSAMRecord("overlap_overlapped_equal", "1", 10, 20)); + reads.add(buildSAMRecord("overlap_overlapped_unequal", "1", 10, 21)); + reads.add(buildSAMRecord("overlap_boundary_equal", "1", 1990, 2009)); + reads.add(buildSAMRecord("overlap_boundary_unequal", "1", 1995, 2050)); + reads.add(buildSAMRecord("extended_only", "1", 3000, 3100)); + reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); + reads.add(buildSAMRecord("simple", "20", 1000100, 1000150)); } @Test public void testAllBasesSeen() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - List intervals = new ArrayList(); - intervals.add(genomeLocParser.createGenomeLoc("1", 1, 1)); List activeIntervals = getIsActiveIntervals(walker, intervals); // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call verifyEqualIntervals(intervals, activeIntervals); - intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); - activeIntervals = getIsActiveIntervals(walker, intervals); - // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call - verifyEqualIntervals(intervals, activeIntervals); - // TODO: more tests and edge cases } @@ -116,11 +134,6 @@ public class TraverseActiveRegionsTest extends BaseTest { @Test public void testActiveRegionCoverage() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - List intervals = new ArrayList(); - - intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); List activeRegions = 
getActiveRegions(walker, intervals); verifyActiveRegionCoverage(intervals, activeRegions); @@ -212,17 +225,37 @@ public class TraverseActiveRegionsTest extends BaseTest { } } + // copied from LocusViewTemplate + protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { + SAMFileHeader header = new SAMFileHeader(); + header.setSequenceDictionary(dictionary); + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(dictionary.getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + + Cigar cigar = new Cigar(); + int len = alignmentEnd - alignmentStart + 1; + cigar.add(new CigarElement(len, CigarOperator.M)); + record.setCigar(cigar); + record.setReadBases(new byte[len]); + record.setBaseQualities(new byte[len]); + + return record; + } + private List createDataProviders(List intervals) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); t.initialize(engine); - StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList()); + StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(reads); Shard shard = new MockLocusShard(genomeLocParser, intervals); List providers = new ArrayList(); for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, iterator, shard.getGenomeLocs())) { - providers.add(new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); } return providers; From c68bc95db6635ccbe90462d18aa5d3a4ae7ace38 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 21 Nov 2012 16:22:57 -0500 Subject: [PATCH 35/49] Initial read mapping tests - Failing tests are commented out --- .../traversals/TraverseActiveRegionsTest.java | 115 
++++++++++++++++-- 1 file changed, 105 insertions(+), 10 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index 2780b7421..e4c7b2db0 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -46,7 +46,7 @@ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; protected List isActiveCalls = new ArrayList(); - protected List mappedActiveRegions = new ArrayList(); + protected Map mappedActiveRegions = new HashMap(); public DummyActiveRegionWalker() { this.prob = 1.0; @@ -60,7 +60,7 @@ public class TraverseActiveRegionsTest extends BaseTest { @Override public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.add(activeRegion); + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); return 0; } @@ -101,13 +101,16 @@ public class TraverseActiveRegionsTest extends BaseTest { intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); reads = new ArrayList(); - reads.add(buildSAMRecord("overlap_overlapped_equal", "1", 10, 20)); - reads.add(buildSAMRecord("overlap_overlapped_unequal", "1", 10, 21)); - reads.add(buildSAMRecord("overlap_boundary_equal", "1", 1990, 2009)); - reads.add(buildSAMRecord("overlap_boundary_unequal", "1", 1995, 2050)); + reads.add(buildSAMRecord("simple", "1", 100, 200)); + reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); + reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); + reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); + reads.add(buildSAMRecord("boundary_unequal", "1", 1995, 2050)); 
reads.add(buildSAMRecord("extended_only", "1", 3000, 3100)); reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); - reads.add(buildSAMRecord("simple", "20", 1000100, 1000150)); + reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + // TODO + //reads.add(buildSAMRecord("simple20", "20", 10100, 10150)); } @Test @@ -135,13 +138,13 @@ public class TraverseActiveRegionsTest extends BaseTest { public void testActiveRegionCoverage() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - List activeRegions = getActiveRegions(walker, intervals); + Collection activeRegions = getActiveRegions(walker, intervals).values(); verifyActiveRegionCoverage(intervals, activeRegions); // TODO: more tests and edge cases } - private void verifyActiveRegionCoverage(List intervals, List activeRegions) { + private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { List intervalStarts = new ArrayList(); List intervalStops = new ArrayList(); @@ -183,7 +186,99 @@ public class TraverseActiveRegionsTest extends BaseTest { Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); } - private List getActiveRegions(DummyActiveRegionWalker walker, List intervals) { + @Test + public void testReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_unequal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // 
extended_only: Extended in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + + // TODO + // simple20: Primary in 20:10000-20000 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + + verifyReadPrimary(region, "simple"); + verifyReadPrimary(region, "overlap_equal"); + verifyReadPrimary(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail verifyReadNonPrimary(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + // TODO: fail verifyReadPrimary(region, "boundary_equal"); + // TODO: fail verifyReadNonPrimary(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail verifyReadPrimary(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + // TODO: fail verifyReadNonPrimary(region, "boundary_equal"); + verifyReadPrimary(region, "boundary_unequal"); + // TODO: fail verifyReadExtended(region, "extended_only"); + // TODO: fail verifyReadExtended(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + + // TODO: more tests and edge cases + } + + private void verifyReadPrimary(ActiveRegion region, String readName) { + SAMRecord read = getRead(region, readName); + 
Assert.assertFalse(read.getNotPrimaryAlignmentFlag(), "Read " + read + " not primary in active region " + region); + } + + private void verifyReadNonPrimary(ActiveRegion region, String readName) { + SAMRecord read = getRead(region, readName); + Assert.assertTrue(read.getNotPrimaryAlignmentFlag(), "Read " + read + " primary in active region " + region); + } + + private void verifyReadExtended(ActiveRegion region, String readName) { + Assert.fail("The Extended read state has not been implemented"); + } + + private void verifyReadNotPlaced(ActiveRegion region, String readName) { + for (SAMRecord read : region.getReads()) { + if (read.getReadName().equals(readName)) + Assert.fail("Read " + readName + " found in active region " + region); + } + } + + private SAMRecord getRead(ActiveRegion region, String readName) { + for (SAMRecord read : region.getReads()) { + if (read.getReadName().equals(readName)) + return read; + } + + Assert.fail("Read " + readName + " not found in active region " + region); + return null; + } + + private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) t.traverse(walker, dataProvider, 0); From 48f271c5bd825dc0f57249dc6164ec4bc3c35541 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 21 Nov 2012 17:23:41 -0500 Subject: [PATCH 38/49] Adding 80% support for multi-allelic variants -- Multi-allelic variants are split into their bi-allelic version, trimmed, and we attempt to provide a meaningful genotype for NA12878 here. 
It's not perfect and needs some discussion on how to handle het/alt variants -- Adding splitInBiallelic function to VariantContextUtils as well as extensive unit tests that also indirectly test reverseTrimAlleles (which worked perfectly FYI) --- .../variantcontext/VariantContextUtils.java | 34 +++++ .../VariantContextTestProvider.java | 2 +- .../VariantContextUtilsUnitTest.java | 119 +++++++++++++++++- 3 files changed, 150 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 81959c998..1f1867f75 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -979,6 +979,40 @@ public class VariantContextUtils { private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For a VC with A/B/C alleles, returns A/B and A/C contexts. + * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @return a list of bi-allelic (or monomorphic) variant contexts + */ + public static List splitVariantContextToBiallelics(final VariantContext vc) { + if ( !
vc.isVariant() || vc.isBiallelic() ) + // non variant or biallelics already satisfy the contract + return Collections.singletonList(vc); + else { + final List biallelics = new LinkedList(); + + for ( final Allele alt : vc.getAlternateAlleles() ) { + VariantContextBuilder builder = new VariantContextBuilder(vc); + final List alleles = Arrays.asList(vc.getReference(), alt); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + calculateChromosomeCounts(builder, true); + biallelics.add(reverseTrimAlleles(builder.make())); + } + + return biallelics; + } + } + /** * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) * diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 6785fa816..c57b2a44d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -782,7 +782,7 @@ public class VariantContextTestProvider { Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); Assert.assertEquals(actual.getID(), expected.getID(), "id"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java 
b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java index 114104d42..f3daa9e4c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; @@ -39,7 +39,7 @@ import java.io.FileNotFoundException; import java.util.*; public class VariantContextUtilsUnitTest extends BaseTest { - Allele Aref, T, C, Cref, ATC, ATCATC; + Allele Aref, T, C, G, Cref, ATC, ATCATC; private GenomeLocParser genomeLocParser; @BeforeSuite @@ -58,6 +58,7 @@ public class VariantContextUtilsUnitTest extends BaseTest { Cref = Allele.create("C", true); T = Allele.create("T"); C = Allele.create("C"); + G = Allele.create("G"); ATC = Allele.create("ATC"); ATCATC = Allele.create("ATCATC"); } @@ -697,10 +698,120 @@ public class VariantContextUtilsUnitTest extends BaseTest { return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); } - @Test(dataProvider = "ReverseClippingPositionTestProvider") public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { int result = VariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false); Assert.assertEquals(result, cfg.expectedClip); } -} + + // -------------------------------------------------------------------------------- + // + // test splitting into bi-allelics + // + // 
-------------------------------------------------------------------------------- + + @DataProvider(name = "SplitBiallelics") + public Object[][] makeSplitBiallelics() throws CloneNotSupportedException { + List tests = new ArrayList(); + + final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C)); + + // biallelic -> biallelic + tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); + + // monos -> monos + root.alleles(Arrays.asList(Aref)); + tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); + + root.alleles(Arrays.asList(Aref, C, T)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make())}); + + root.alleles(Arrays.asList(Aref, C, T, G)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make(), + root.alleles(Arrays.asList(Aref, G)).make())}); + + final Allele C = Allele.create("C"); + final Allele CA = Allele.create("CA"); + final Allele CAA = Allele.create("CAA"); + final Allele CAAAA = Allele.create("CAAAA"); + final Allele CAAAAA = Allele.create("CAAAAA"); + final Allele Cref = Allele.create("C", true); + final Allele CAref = Allele.create("CA", true); + final Allele CAAref = Allele.create("CAA", true); + final Allele CAAAref = Allele.create("CAAA", true); + + root.alleles(Arrays.asList(Cref, CA, CAA)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CA)).make(), + root.alleles(Arrays.asList(Cref, CAA)).make())}); + + root.alleles(Arrays.asList(CAAref, C, CA)).stop(12); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAref, C)).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + 
root.alleles(Arrays.asList(CAAAref, C)).make(), + root.alleles(Arrays.asList(CAAref, C)).stop(12).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(), + root.alleles(Arrays.asList(Cref, CA)).stop(10).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make(), + root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SplitBiallelics") + public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List biallelics = VariantContextUtils.splitVariantContextToBiallelics(vc); + Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext actual = biallelics.get(i); + final VariantContext expected = expectedBiallelics.get(i); + VariantContextTestProvider.assertEquals(actual, expected); + } + } + + @Test(dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") + public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List genotypes = new ArrayList(); + + int sampleI = 0; + for ( final List alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) { + genotypes.add(GenotypeBuilder.create("sample" + sampleI, alleles)); + } + genotypes.add(GenotypeBuilder.createMissing("missing", 2)); + + final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); + + final List biallelics = VariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext actual = biallelics.get(i); + Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples + + for 
( final Genotype inputGenotype : genotypes ) { + final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName()); + Assert.assertNotNull(actualGenotype); + if ( ! vc.isVariant() || vc.isBiallelic() ) + Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName())); + else + Assert.assertTrue(actualGenotype.isNoCall()); + } + } + } +} \ No newline at end of file From 2306518ab6be1c323b46d41b33d481cc1bb65197 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Thu, 22 Nov 2012 01:45:18 -0500 Subject: [PATCH 39/49] Fix to deal with 'proper' options of casting --- .../pileup/AbstractReadBackedPileup.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index d0ae68912..42938d2a6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -1022,7 +1022,7 @@ public abstract class AbstractReadBackedPileup sortedElements = new TreeSet(new Comparator() { @Override @@ -1031,15 +1031,27 @@ public abstract class AbstractReadBackedPileup tracker = (UnifiedPileupElementTracker) pileupElementTracker; - for (PE pile : tracker) - sortedElements.add(pile); + + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + for (PE pile : perSampleElements) + sortedElements.add(pile); + } + } + else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + for (PE pile : tracker) + sortedElements.add(pile); + } UnifiedPileupElementTracker 
sortedTracker = new UnifiedPileupElementTracker(); for (PE pile : sortedElements) sortedTracker.add(pile); - return (RBP) createNewPileup(this.getLocation(), sortedTracker); + return (RBP) createNewPileup(loc, sortedTracker); } @Override From 9719ba7adce0574a48281d6baac7db18ff3208fa Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 22 Nov 2012 21:53:42 -0500 Subject: [PATCH 40/49] Remove -number example from the docs since it's no longer supported. --- .../sting/gatk/walkers/variantutils/SelectVariants.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index d28fe34d6..9253446c8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -151,14 +151,6 @@ import java.util.*; * -mvq 50 \ * -o violations.vcf * - * Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ - * -T SelectVariants \ - * --variant input.vcf \ - * -o output.vcf \ - * -number 1000 - * * Creating a set with 50% of the total number of variants in the variant VCF: * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ From d978cfe8350288bd5dd406f7de224ccb2847838a Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sun, 25 Nov 2012 21:55:29 -0500 Subject: [PATCH 41/49] Soft clipped bases shouldn't be counted in the delocalized BQSR. 
--- .../walkers/bqsr/BQSRIntegrationTest.java | 31 ++++++++++++++----- .../gatk/walkers/bqsr/BaseRecalibrator.java | 2 +- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index b839382dc..de328c825 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -51,16 +51,16 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "4fd3c9ad97e6ac58cba644a76564c9f7")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "2620f734cce20f70ce13afd880e46e5c")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", 
"5eb3b94e767da19a4c037ee132e4b19a")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab261d291b107a3da7897759c0e4fa89")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "292303f649fbb19dc05d4a0197a49eeb")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "8ced9d1094493f17fb1876b818a64541")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "abb838131e403d39820dbd66932d1ed0")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "8f62aa0e75770204c98d8299793cc53c")}, {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")}, {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")}, @@ -90,6 +90,21 @@ public class BQSRIntegrationTest extends WalkerTest { executeTest("testBQSRFailWithoutDBSNP", spec); } + @Test + public void testBQSRCSV() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + " -T 
BaseRecalibrator" + + " -R " + b36KGReference + + " -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" + + " -knownSites " + b36dbSNP129 + + " -L 1:10,000,000-10,200,000" + + " -o /dev/null" + + " --plot_pdf_file /dev/null" + + " --intermediate_csv_file %s", + Arrays.asList("d1c38a3418979400630e2bca1140689c")); + executeTest("testBQSR-CSVfile", spec); + } + @Test public void testBQSRFailWithSolidNoCall() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index b415bb1f5..7ce98cf1d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -227,7 +227,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche */ public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) { - final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead); + final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) ); if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it RecalUtils.parsePlatformForRead(read, RAC); From 4f7fa3009a4adb8617501f35d61730f62b6ca0b1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 26 Nov 2012 11:34:59 -0500 Subject: [PATCH 44/49] I forget why I thought that the VariantAnnotator couldn't run multi-threaded because it works just fine. Now you can specify -nt with VA. 
--- .../walkers/annotator/VariantAnnotator.java | 28 +++++++------------ .../VariantAnnotatorIntegrationTest.java | 8 +++++- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index c4de9ed45..92060b4a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -82,7 +82,7 @@ import java.util.*; @Allows(value={DataSource.READS, DataSource.REFERENCE}) @Reference(window=@Window(start=-50,stop=50)) @By(DataSource.REFERENCE) -public class VariantAnnotator extends RodWalker implements AnnotatorCompatible { +public class VariantAnnotator extends RodWalker implements AnnotatorCompatible, TreeReducible { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -275,14 +275,6 @@ public class VariantAnnotator extends RodWalker implements Ann return true; } - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - /** * We want reads that span deletions * @@ -323,15 +315,15 @@ public class VariantAnnotator extends RodWalker implements Ann return 1; } - /** - * Increment the number of loci processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of loci processed. 
- */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(Integer value, Integer sum) { return value + sum; } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 01dff0089..b097e3d34 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -151,7 +151,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { } @Test - public void testTabixAnnotations() { + public void testTabixAnnotationsAndParallelism() { final String MD5 = "99938d1e197b8f10c408cac490a00a62"; for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { WalkerTestSpec spec = new WalkerTestSpec( @@ -159,6 +159,12 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { Arrays.asList(MD5)); executeTest("Testing lookup vcf tabix vs. vcf tribble", spec); } + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -A HomopolymerRun -nt 2 --variant:vcf " + validationDataLocation + "CEU.exon.2010_03.sites.vcf -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, + Arrays.asList(MD5)); + + executeTest("Testing lookup vcf tabix vs. vcf tribble plus parallelism", spec); } @Test From c3b7dd1374ece9cb8bffedd6518f5d6d9582eec0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 26 Nov 2012 12:19:11 -0500 Subject: [PATCH 45/49] Misc cleanup in the HaplotypeCaller. 
Cleaning up unused arguments after recent changes to HC-GenotypingEngine --- .../haplotypecaller/GenotypingEngine.java | 4 +- .../haplotypecaller/HaplotypeCaller.java | 66 ++----------------- .../LikelihoodCalculationEngine.java | 1 - .../HaplotypeCallerIntegrationTest.java | 17 ++--- .../annotator/MappingQualityRankSumTest.java | 5 +- .../gatk/walkers/annotator/RankSumTest.java | 2 +- .../walkers/annotator/ReadPosRankSumTest.java | 2 +- 7 files changed, 21 insertions(+), 76 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index beec8a92e..4fc2dc8f7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -41,13 +41,11 @@ import java.util.*; public class GenotypingEngine { private final boolean DEBUG; - private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + public GenotypingEngine( final boolean DEBUG ) { this.DEBUG = DEBUG; - this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE; noCall.add(Allele.NO_CALL); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 2b739a321..24b3309f1 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -131,14 +131,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; - @Advanced - @Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false) - protected boolean GENOTYPE_FULL_ACTIVE_REGION = false; - - @Advanced - @Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false) - protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false; - @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) protected int gcpHMM = 10; @@ -248,10 +240,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); - simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, 
UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); @@ -273,15 +266,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_PL_KEY); - // header lines for the experimental HaplotypeCaller-specific annotations - headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant")); - headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region")); - headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region")); - headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region")); - headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles")); - headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL")); - headerInfo.add(new 
VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX")); - headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant")); // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used @@ -298,7 +282,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); + genotypingEngine = new GenotypingEngine( DEBUG ); } //--------------------------------------------------------------------------------------------------------------- @@ -428,43 +412,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - - if( !GENOTYPE_FULL_ACTIVE_REGION ) { - // add some custom annotations to the calls - - // Calculate the number of variants on the haplotype - int maxNumVar = 0; - for( final Allele allele : callResult.getFirst().getAlleles() ) { - if( !allele.isReference() ) { - for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { - final int numVar = haplotype.getEventMap().size(); - if( numVar > maxNumVar ) { maxNumVar = numVar; } - } - } - } - // Calculate the event length - 
int maxLength = 0; - for ( final Allele a : annotatedCall.getAlternateAlleles() ) { - final int length = a.length() - annotatedCall.getReference().length(); - if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } - } - - myAttributes.put("NVH", maxNumVar); - myAttributes.put("NumHapEval", bestHaplotypes.size()); - myAttributes.put("NumHapAssembly", haplotypes.size()); - myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); - myAttributes.put("EVENTLENGTH", maxLength); - myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); - myAttributes.put("extType", annotatedCall.getType().toString() ); - - //if( likelihoodCalculationEngine.haplotypeScore != null ) { - // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); - //} - if( annotatedCall.hasAttribute("QD") ) { - myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); - } - } - vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); } @@ -520,6 +467,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage + // BUGBUG: remove when positinal downsampler is hooked up to ART/HC if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { activeRegion.add(clippedRead); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 304f8d5cb..29622ca17 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -169,7 +169,6 @@ public class LikelihoodCalculationEngine { } // compute the diploid haplotype likelihoods - // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a57462d1d..007df3602 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,18 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "56aa4b84606b6b0b7dc78a383974d1b3"); + HCTest(CEUTRIO_BAM, "", "2b39732ff8e0de5bc2ae949aaf7a6f21"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f"); + HCTest(NA12878_BAM, "", "8b217638ff585effb9cc70e9a9aa544f"); } // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "f2d0309fdf50d5827e9c60ed0dd07e3f"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode 
GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", + "541aa8291f03ba33bd1ad3d731fd5657"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -43,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd7170cbde7df04d4fbe1da7903c31c6"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -54,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "7fbc6b9e27e374f2ffe4be952d88c7c6"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "99456fc7207c1fe9f367a0d0afae87cd"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -65,7 +66,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6c1631785b3f832aecab1a99f0454762"); } @Test @@ -78,7 +79,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("96ab8253d242b851ccfc218759f79784")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("237601bbc39694c7413a332cbb656c8e")); executeTest("HCTestStructuralIndels: 
", spec); } @@ -92,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("425f1a0fb00d7145edf1c55e54346fae")); + Arrays.asList("40bf739fb2b1743642498efe79ea6342")); executeTest("HC calling on a ReducedRead BAM", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 82596a501..2679a169b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn final List refQuals, final List altQuals) { if (pileup != null && likelihoodMap == null) { - // no per-read likelihoods available: + // old UG snp-only path through the annotations for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { @@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn } for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + // BUGBUG: There needs to be a comparable isUsableBase check here if (a.isNoCall()) continue; // read is non-informative if (a.isReference()) refQuals.add((double)el.getKey().getMappingQuality()); else if (allAlleles.contains(a)) altQuals.add((double)el.getKey().getMappingQuality()); - - } } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 0df7aff71..e7c0e6b14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR ReadBackedPileup pileup = null; - if (stratifiedContexts != null) { + if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context != null ) pileup = context.getBasePileup(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index d01233bb2..334b89f01 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio final List refQuals, final List altQuals) { if (alleleLikelihoodMap == null) { - // use fast SNP-based version if we don't have per-read allele likelihoods + // use old UG SNP-based version if we don't have per-read allele likelihoods for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); From 59cef880d195937e2e2eee9344a7bcc0d2ff016b Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 26 Nov 2012 12:20:07 -0500 Subject: [PATCH 46/49] Updating HC integration tests because experimental, HC-specific annotations have been removed. 
--- .../walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 007df3602..f8ba1f4cc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("788176e1717bd28fc7cbc8e3efbb6100")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec437d2d9f3ae07d155983be0155c8ed")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } From 405f3c675d9daa589942e830db0870931741f113 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 01:07:00 -0500 Subject: [PATCH 47/49] Fix for GSA-649: GenomeLocSortedSet.overlaps is crazy slow. Also improved GenomeLocSortedSet.sizeBeforeLoc. 
--- .../traversals/TraverseActiveRegions.java | 1 - .../sting/utils/GenomeLocSortedSet.java | 49 +++++++++++++++---- .../utils/GenomeLocSortedSetUnitTest.java | 36 ++++++++++++++ 3 files changed, 76 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index a2c37944a..3f20db0af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -80,7 +80,6 @@ public class TraverseActiveRegions extends TraversalEngine { // our private storage for the GenomeLoc's private List mArray = new ArrayList(); + // cache this to make overlap checking much more efficient + private int previousOverlapSearchIndex = -1; + /** default constructor */ public GenomeLocSortedSet(GenomeLocParser parser) { this.genomeLocParser = parser; @@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet { * Return the number of bps before loc in the sorted set * * @param loc the location before which we are counting bases - * @return + * @return the number of base pairs over all previous intervals */ public long sizeBeforeLoc(GenomeLoc loc) { long s = 0; @@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( e.isBefore(loc) ) s += e.size(); else if ( e.isPast(loc) ) - ; // don't do anything + break; // we are done else // loc is inside of s s += loc.getStart() - e.getStart(); } @@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet { * Determine if the given loc overlaps any loc in the sorted set * * @param loc the location to test - * @return + * @return trip if the location overlaps any loc */ public boolean overlaps(final GenomeLoc loc) { - for(final GenomeLoc e : mArray) { - if(e.overlapsP(loc)) { - return true; - } + // edge condition + if ( mArray.isEmpty() 
) + return false; + + // use the cached version first + if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) ) + return true; + + // update the cached index + previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); + + // if it matches an interval exactly, we are done + if ( previousOverlapSearchIndex > 0 ) + return true; + + // check whether it overlaps the interval before or after the insertion point + previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2); + return overlapsAtOrImmediatelyAfterCachedIndex(loc, false); + } + + private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) { + // check the cached entry + if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) ) + return true; + + // check the entry after the cached entry since we may have moved to it + boolean returnValue = false; + if ( previousOverlapSearchIndex < mArray.size() - 1 ) { + returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc); + if ( updateCachedIndex ) + previousOverlapSearchIndex++; } - return false; + + return returnValue; } /** @@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet { mArray.add(e); return true; } else { - int loc = Collections.binarySearch(mArray,e); + final int loc = Collections.binarySearch(mArray,e); if (loc >= 0) { throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); } else { diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java index 3d21e654f..6138e7396 100755 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import 
org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeClass; @@ -117,6 +118,41 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(loc.getContigIndex() == 1); } + @Test + public void overlap() { + for ( int i = 1; i < 6; i++ ) { + final int start = i * 10; + mSortedSet.add(genomeLocParser.createGenomeLoc(contigOneName, start, start + 1)); + } + + // test matches in and around interval + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 9, 9))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 10, 10))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 11))); + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 12))); + + // test matches spanning intervals + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 14, 20))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 15))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 30, 40))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53))); + + // test miss + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 19))); + + // test exact match after miss + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 40, 41))); + + // test matches at beginning of intervals + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 5, 6))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 0, 10))); + + // test matches at end of intervals + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53))); + 
assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53))); + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53))); + } + @Test public void mergingOverlappingAbove() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50); From cc72aaefebfed89723c40403ae0dfd3a932ff78d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 01:11:23 -0500 Subject: [PATCH 48/49] Minor efficiency: use >= instead of > in test --- .../src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index ca1d385a2..394220106 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -149,7 +149,7 @@ public class GenomeLocSortedSet extends AbstractSet { previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); // if it matches an interval exactly, we are done - if ( previousOverlapSearchIndex > 0 ) + if ( previousOverlapSearchIndex >= 0 ) return true; // check whether it overlaps the interval before or after the insertion point From b1969a66bdcf755757517e77582b9d4e55caeb54 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 08:24:41 -0500 Subject: [PATCH 49/49] Update docs --- .../gatk/walkers/fasta/FastaAlternateReferenceMaker.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index 2b9744b89..22c6097cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -47,6 +47,12 @@ import java.util.List; *

* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * + * The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). + * * Several important notes: * 1) if there are multiple variants that start at a site, it chooses one of them randomly. * 2) when there are overlapping indels (but with different start positions) only the first will be chosen.