From 596c1723aeec43ebf2feb7ede387b1954910736c Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 25 Oct 2012 10:35:43 -0400 Subject: [PATCH 02/90] Hidden, unsupported ability of VariantEval to run AlleleCount stratification on sites-only VCFs. I'll expose it/add tests on it if people think this is generaly useful. User needs to specify total # of samples as command line argument since genotypes are not available. Also, fixes to large-scale validation script: lower -minIndelFrac threshold or else we'll kill most indels since default 0.25 is too high for pools, fix also VE stratifications and add one VE run where eval=1KG, comp=pool data and AC stratification based on 1KG annotation --- .../sting/gatk/walkers/varianteval/VariantEval.java | 12 ++++++++++++ .../varianteval/stratifications/AlleleCount.java | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index a73e125ad..201028d99 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -183,6 +183,10 @@ public class VariantEval extends RodWalker implements TreeRedu @Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false) private boolean keepSitesWithAC0 = false; + @Hidden + @Argument(fullName="numSamples", shortName="numSamples", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false) + private int numSamplesFromArgument = 0; + /** * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc. @@ -589,6 +593,14 @@ public class VariantEval extends RodWalker implements TreeRedu public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; } public Set getSampleNamesForEvaluation() { return sampleNamesForEvaluation; } + public int getNumberOfSamplesForEvaluation() { + if (sampleNamesForEvaluation!= null && !sampleNamesForEvaluation.isEmpty()) + return sampleNamesForEvaluation.size(); + else { + return numSamplesFromArgument; + } + + } public Set getSampleNamesForStratification() { return sampleNamesForStratification; } public List> getComps() { return comps; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index e6efd4482..7197fc14c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -29,7 +29,7 @@ public class AlleleCount extends VariantStratifier { // There are ploidy x n sample chromosomes // TODO -- generalize to handle multiple ploidy - nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy(); + nchrom = getVariantEvalWalker().getNumberOfSamplesForEvaluation() * getVariantEvalWalker().getSamplePloidy(); if ( nchrom < 2 ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample"); From cde4f037d3a79802569d06134358d9c7ed14a8ce Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Thu, 25 Oct 2012 16:18:25 -0400 Subject: [PATCH 03/90] Begin moving XHMM scripts to public --- .../queue/qscripts/CNV/xhmmCNVpipeline.scala | 499 ++++++++++++++++++ .../sting/queue/util/DoC/package.scala | 123 +++++ 2 files changed, 622 insertions(+) create mode 100644 public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala create mode 100644 public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala new file mode 100644 index 000000000..362337c84 --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -0,0 +1,499 @@ +package org.broadinstitute.sting.queue.qscripts.CNV + +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.util.VCF_BAM_utilities +import org.broadinstitute.sting.queue.util.DoC._ +import org.broadinstitute.sting.commandline.Hidden +import java.io.{PrintStream, PrintWriter} +import org.broadinstitute.sting.utils.text.XReadLines +import collection.JavaConversions._ + +class xhmmCNVpipeline extends QScript { + qscript => + + @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) + var bams: File = _ + + @Argument(doc = "gatk jar file", shortName = "J", required = true) + var gatkJarFile: File = _ + + @Argument(doc = "xhmm executable file", shortName = "xhmmExec", required = true) + var xhmmExec: File = _ + + @Argument(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) + var pseqExec: File = _ + + @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true) + var pseqSeqDB: String = _ + + @Argument(shortName = "R", doc = "ref", required = true) + var referenceFile: File = _ + + @Argument(shortName = "L", doc = "Intervals", required = false) + var intervals: File = _ + + @Input(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) + var scatterCountInput = 0 + + @Input(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) + var samplesPerJob = 1 + + @Output(doc = "Base name for files to output", shortName = "o", required = true) + var outputBase: File = _ + + @Input(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) + var MAX_DEPTH = 20000 + + @Hidden + @Input(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) + var NUM_BINS = 200 + + @Hidden + @Input(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) + var START_BIN = 1 + + @Input(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) + var minMappingQuality = 0 + + @Input(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) + var wholeMatrixMemory = -1 + + @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false) + var minTargGC : Double = 0.1 + + @Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false) + var maxTargGC : Double = 0.9 + + @Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false) + var minTargRepeats : Double = 0.0 + + @Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases greater than this value", required = false) + var maxTargRepeats : Double = 0.1 + + @Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired sample IDs", required = false) + var sampleIDsMap: String = "" + + @Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false) + var sampleIDsMapFromColumn = 1 + + @Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false) + var sampleIDsMapToColumn = 2 + + @Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false) + var targetSampleFiltersString: String = "" + + @Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false) + var PCAnormalizeMethodString: String = "" + + @Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false) + var targetSampleNormalizedFiltersString: String = "" + + @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true) + var xhmmParamsArg: File = _ + + @Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false) + var discoverCommandLineParams: String = "" + + @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false) + var genotypeCommandLineParams: String = "" + + @Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false) + var genotypeSubsegments: Boolean = false + + @Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false) + var maxTargetsInSubsegment = 30 + + @Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false) + var subsegmentGenotypeThreshold = 20.0 + + @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false) + var longJobQueue: String = "" + + + val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list" + + val RD_OUTPUT_SUFFIX: String = ".RD.txt" + + val TARGS_GC_SUFFIX = ".locus_GC.txt" + val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt" + + val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt" + val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt" + + val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt" + val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt" + + + trait WholeMatrixMemoryLimit extends CommandLineFunction { + // Since loading ALL of the data can take significant memory: + if (wholeMatrixMemory < 0) { + this.memoryLimit = 24 + } + else { + this.memoryLimit = wholeMatrixMemory + } + } + + trait LongRunTime extends CommandLineFunction { + if (longJobQueue != "") + this.jobQueue = longJobQueue + } + + def script = { + val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile) + add(prepTargets) + + trait CommandLineGATKArgs extends CommandLineGATK { + this.intervals :+= prepTargets.out + this.jarFile = qscript.gatkJarFile + this.reference_sequence = qscript.referenceFile + this.logging_level = "INFO" + } + + val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams)) + val samples: List[String] = sampleToBams.keys.toList + Console.out.printf("Samples are %s%n", samples) + + val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase) + var docs: List[DoC] = List[DoC]() + for (group <- groups) { + Console.out.printf("Group is %s%n", group) + docs ::= new DoC(group.bams, group.DoC_output, MAX_DEPTH, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs + } + addAll(docs) + + val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit + add(mergeDepths) + + var excludeTargets : List[File] = List[File]() + if (minTargGC > 0 || maxTargGC < 1) { + val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs + calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX + add(calcGCcontents) + + val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC) + add(excludeTargetsBasedOnGC) + excludeTargets ::= excludeTargetsBasedOnGC.out + } + + class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction { + @Input(doc="") + var intervals: File = prepTargets.out + + @Output(doc="") + var out : File = new File(outFile) + + val regFile : String = outputBase.getPath + ".targets.reg" + val locDB : String = outputBase.getPath + ".targets.LOCDB" + + val removeFiles = "rm -f " + regFile + " " + locDB + val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile + val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load" + val calcRepeatMaskedPercent = pseqExec + " . loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats" + val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out + + var command: String = + removeFiles + + " && " + createRegFile + + " && " + createLOCDB + + " && " + calcRepeatMaskedPercent + + " && " + extractRepeatMaskedPercent + + def commandLine = command + + override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command + } + + if (minTargRepeats > 0 || maxTargRepeats < 1) { + val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX) + add(calcRepeatComplexity) + + val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats) + add(excludeTargetsBasedOnRepeats) + excludeTargets ::= excludeTargetsBasedOnRepeats.out + } + + val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets) + add(filterCenterDepths) + + val pca = new PCA(filterCenterDepths.filteredCentered) + add(pca) + + val normalize = new Normalize(pca) + add(normalize) + + val filterZscore = new FilterAndZscoreNormalized(normalize.normalized) + add(filterZscore) + + val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore) + add(filterOriginal) + + val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered) + add(discover) + + val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotype) + + if (genotypeSubsegments) { + val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotypeSegs) + } + } + + class ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction { + @Input(doc="") + var locus_value : File = locus_valueIn + + @Output(doc="") + var out : File = new File(outputBase.getPath + outSuffix) + + def run = { + var outWriter = new PrintWriter(new PrintStream(out)) + var elems = asScalaIterator(new XReadLines(locus_value)) + + while (elems.hasNext) { + val line = elems.next + val splitLine = line.split("\\s+") + val locus = splitLine(0) + val locValStr = splitLine(1) + try { + val locVal = locValStr.toDouble + if (locVal < minVal || locVal > maxVal) + outWriter.printf("%s%n", locus) + } + catch { + case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus) + case e: Exception => throw e + } + } + + outWriter.close + } + } + + class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val excludeTargets = excludeTargetsIn + + @Output + val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType target" + + " -o " + filteredCentered + + " --outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + filteredSamples + command += excludeTargets.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + if (targetSampleFiltersString != "") + command += " " + targetSampleFiltersString + + def commandLine = command + + override def description = "Filters samples and targets and then mean-centers the targets: " + command + } + + class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + val PCAbase: String = outputBase.getPath + ".RD_PCA" + + @Output + val outPC: File = new File(PCAbase + ".PC.txt") + @Output + val outPC_SD: File = new File(PCAbase + ".PC_SD.txt") + @Output + val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt") + + var command: String = + xhmmExec + " --PCA" + + " -r " + input + + " --PCAfiles " + PCAbase + + def commandLine = command + + override def description = "Runs PCA on mean-centered data: " + command + } + + class Normalize(pca: PCA) extends CommandLineFunction { + @Input(doc = "") + val input = pca.input + + @Input(doc = "") + val inPC = pca.outPC + + @Input(doc = "") + val inPC_SD = pca.outPC_SD + + @Input(doc = "") + val inPC_LOADINGS = pca.outPC_LOADINGS + + @Output + val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt") + + var command: String = + xhmmExec + " --normalize" + + " -r " + input + + " --PCAfiles " + pca.PCAbase + + " --normalizeOutput " + normalized + if (PCAnormalizeMethodString != "") + command += " " + PCAnormalizeMethodString + + def commandLine = command + + override def description = "Normalizes mean-centered data using PCA information: " + command + } + + class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + @Output + val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType sample --zScoreData" + + " -o " + filteredZscored + + " --outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + filteredSamples + if (targetSampleNormalizedFiltersString != "") + command += " " + targetSampleNormalizedFiltersString + + def commandLine = command + + override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command + } + + class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u)) + + @Input(doc = "") + val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u)) + + @Output + val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + targFilters.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + + sampFilters.map(u => " --excludeSamples " + u).reduceLeft(_ + "" + _) + + " -o " + sameFiltered + + def commandLine = command + + override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command + } + + class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Output + val xcnv: File = new File(outputBase.getPath + ".xcnv") + + @Output + val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv") + + val posteriorsBase = outputBase.getPath + + @Output + val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt") + + @Output + val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt") + + @Output + val dupPosteriors: File = new File(posteriorsBase + ".posteriors.DUP.txt") + + var command: String = + xhmmExec + " --discover" + + " -p " + xhmmParams + + " -r " + input + + " -R " + origRD + + " -c " + xcnv + + " -a " + aux_xcnv + + " -s " + posteriorsBase + + " " + discoverCommandLineParams + + def commandLine = command + + override def description = "Discovers CNVs in normalized data: " + command + } + + abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Input(doc = "") + val inXcnv = xcnv + + var command: String = + xhmmExec + " --genotype" + + " -p " + xhmmParams + + " -r " + input + + " -g " + inXcnv + + " -F " + referenceFile + + " -R " + origRD + + " " + genotypeCommandLineParams + } + + class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".vcf") + + command += + " -v " + vcf + + def commandLine = command + + override def description = "Genotypes discovered CNVs in all samples: " + command + } + + class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".subsegments.vcf") + + command += + " -v " + vcf + + " --subsegments" + + " --maxTargetsInSubsegment " + maxTargetsInSubsegment + + " --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold + + def commandLine = command + + override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command + } +} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala new file mode 100644 index 000000000..f35db4aa3 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala @@ -0,0 +1,123 @@ +package org.broadinstitute.sting.queue.util + +import java.io.File +import org.broadinstitute.sting.queue.extensions.gatk.{IntervalScatterFunction, CommandLineGATK} +import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction +import org.broadinstitute.sting.gatk.downsampling.DownsampleType +import org.broadinstitute.sting.commandline.{Input, Gather, Output} +import org.broadinstitute.sting.queue.function.CommandLineFunction + +package object DoC { + class DoC(val bams: List[File], val DoC_output: File, val MAX_DEPTH: Int, val minMappingQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction { + val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary" + + // So that the output files of this DoC run get deleted once they're used further downstream: + this.isIntermediate = true + + this.analysis_type = "DepthOfCoverage" + + this.input_file = bams + + this.downsample_to_coverage = Some(MAX_DEPTH) + this.downsampling_type = DownsampleType.BY_SAMPLE + + this.scatterCount = scatterCountInput + this.scatterClass = classOf[IntervalScatterFunction] + + // HACK for DoC to work properly within Queue: + @Output + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var intervalSampleOut: File = new File(DoC_output.getPath + DOC_OUTPUT_SUFFIX) + + override def commandLine = super.commandLine + + " --omitDepthOutputAtEachBase" + + " --omitLocusTable" + + " --minBaseQuality 0" + + " --minMappingQuality " + minMappingQuality + + " --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS + + (if (!minCoverageCalcs.isEmpty) minCoverageCalcs.map(cov => " --summaryCoverageThreshold " + cov).reduceLeft(_ + "" + _) else "") + + " --includeRefNSites" + + " -o " + DoC_output + + override def shortDescription = "DoC: " + DoC_output + } + + class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, MAX_DEPTH: Int, minMappingQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, MAX_DEPTH: Int, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) { + // HACK for DoC to work properly within Queue: + @Output + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var outPrefix = DoC_output + + override def commandLine = super.commandLine.replaceAll(" --omitDepthOutputAtEachBase", "") + } + + def buildDoCgroups(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]], samplesPerJob: Int, outputBase: File): List[Group] = { + + def buildDoCgroupsHelper(samples: List[String], count: Int): List[Group] = (samples splitAt samplesPerJob) match { + case (Nil, y) => + return Nil + case (subsamples, remaining) => + return new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) :: buildDoCgroupsHelper(remaining, count + 1) + } + + return buildDoCgroupsHelper(samples, 0) + } + + // A group has a list of samples and bam files to use for DoC + class Group(val name: String, val outputBase: File, val samples: List[String], val bams: List[File]) { + // getName() just includes the file name WITHOUT the path: + val groupOutputName = name + "." + outputBase.getName + + // Comment this out, so that when jobs are scattered in DoC class below, they do not scatter into outputs at directories that don't exist!!! : + //def DoC_output = new File(outputBase.getParentFile(), groupOutputName) + + def DoC_output = new File(groupOutputName) + + override def toString(): String = String.format("[Group %s [%s] with samples %s against bams %s]", name, DoC_output, samples, bams) + } + + class MergeGATKdepths(DoCsToCombine: List[File], outFile: String, columnSuffix: String, xhmmExec: File, sampleIDsMap: String, sampleIDsMapFromColumn: Int, sampleIDsMapToColumn: Int, rdPrecisionArg: Option[Int], outputTargetsBySamples: Boolean) extends CommandLineFunction { + @Input(doc = "") + var inputDoCfiles: List[File] = DoCsToCombine + + @Output + val mergedDoC: File = new File(outFile) + var command: String = + xhmmExec + " --mergeGATKdepths" + + inputDoCfiles.map(input => " --GATKdepths " + input).reduceLeft(_ + "" + _) + + " --columnSuffix " + columnSuffix + + " -o " + mergedDoC + if (sampleIDsMap != "") + command += " --sampleIDmap " + sampleIDsMap + " --fromID " + sampleIDsMapFromColumn + " --toID " + sampleIDsMapToColumn + rdPrecisionArg match { + case Some(rdPrecision) => { + command += " --rdPrecision " + rdPrecision + } + case None => {} + } + if (outputTargetsBySamples) + command += " --outputTargetsBySamples" + + def commandLine = command + + override def description = "Combines DoC outputs for multiple samples (at same loci): " + command + } + + class PrepareTargets(intervalsIn: List[File], outIntervals: String, val xhmmExec: File, val referenceFile: File) extends CommandLineFunction { + @Input(doc = "List of files containing targeted intervals to be prepared and merged") + var inIntervals: List[File] = intervalsIn + + @Output(doc = "The merged intervals file to write to") + var out: File = new File(outIntervals) + + var command: String = + xhmmExec + " --prepareTargets" + + " -F " + referenceFile + + inIntervals.map(intervFile => " --targets " + intervFile).reduceLeft(_ + "" + _) + + " --mergedTargets " + out + + def commandLine = command + + override def description = "Sort all target intervals, merge overlapping ones, and print the resulting interval list: " + command + } +} From 9af4b34fd8a45d4bd561f29dc337f5676a57e21b Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Fri, 26 Oct 2012 01:21:05 -0400 Subject: [PATCH 04/90] Changed @Input to @Argument for non-File types --- .../queue/qscripts/CNV/xhmmCNVpipeline.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala index 362337c84..8db089484 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -33,30 +33,30 @@ class xhmmCNVpipeline extends QScript { @Argument(shortName = "L", doc = "Intervals", required = false) var intervals: File = _ - @Input(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) + @Argument(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) var scatterCountInput = 0 - @Input(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) + @Argument(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) var samplesPerJob = 1 @Output(doc = "Base name for files to output", shortName = "o", required = true) var outputBase: File = _ - @Input(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) + @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) var MAX_DEPTH = 20000 @Hidden - @Input(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) + @Argument(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) var NUM_BINS = 200 @Hidden - @Input(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) + @Argument(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) var START_BIN = 1 - @Input(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) + @Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) var minMappingQuality = 0 - @Input(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) + @Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) var wholeMatrixMemory = -1 @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false) @@ -496,4 +496,4 @@ class xhmmCNVpipeline extends QScript { override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command } -} \ No newline at end of file +} From c8e17a7adf410aa540f6e51091160f56bd4231e2 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 30 Oct 2012 13:57:54 -0400 Subject: [PATCH 05/90] totally experimental UG feature, to be removed --- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 3c4a97ec1..d9b46ad36 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,6 +190,9 @@ public class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); + else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) + results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + } } } From 96344c6b62df84d4727bbbd8307d03f88da0f0c1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 31 Oct 2012 12:35:45 -0400 Subject: [PATCH 06/90] Add note to realigner docs --- .../sting/gatk/walkers/indels/IndelRealigner.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 998894fbf..8f2528e23 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -236,6 +236,8 @@ public class IndelRealigner extends ReadWalker { * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output * file name (2nd column) for each input file name (1st column). + * + * Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing). */ @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file") protected String N_WAY_OUT = null; From f8af8a2355fd70bd0b07694d62081826f317e08c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 31 Oct 2012 21:28:07 -0400 Subject: [PATCH 07/90] Moving UG integration tests to protected since they use protected-only contamination filtering. Adding a new UGLite integration test to confirm that contamination filtering is ignored in lite. --- .../UnifiedGenotyperIntegrationTest.java | 0 .../UnifiedGenotyperLiteIntegrationTest.java | 35 +++++++++++++++++++ 2 files changed, 35 insertions(+) rename {public => protected}/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java (100%) create mode 100755 public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java similarity index 100% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java new file mode 100755 index 000000000..f9a921a86 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java @@ -0,0 +1,35 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! // +// ********************************************************************************** // + +public class UnifiedGenotyperLiteIntegrationTest extends WalkerTest { + + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing contamination down-sampling gets ignored + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testContaminationDownsampling() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("9addd225a985178339a0c49dc5fdc220")); + executeTest("test contamination_percentage_to_filter gets ignored", spec); + } + +} From 47a0f5859e48a64a5d466f3370ee6d69126dbf88 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 31 Oct 2012 22:56:38 -0400 Subject: [PATCH 08/90] Don't run these tests if not GAKT lite --- .../genotyper/UnifiedGenotyperLiteIntegrationTest.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java index f9a921a86..783a8d7fc 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java @@ -1,14 +1,11 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; +import org.testng.SkipException; import org.testng.annotations.Test; -import java.io.File; import java.util.Arrays; -import java.util.Collections; -import java.util.List; // ********************************************************************************** // // Note that this class also serves as an integration test for the VariantAnnotator! // @@ -26,6 +23,9 @@ public class UnifiedGenotyperLiteIntegrationTest extends WalkerTest { @Test public void testContaminationDownsampling() { + if ( !GATKLiteUtils.isGATKLite() ) + throw new SkipException("Only want to test for GATK lite"); + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, Arrays.asList("9addd225a985178339a0c49dc5fdc220")); From 9cd04c335c9f5662bb02d1581ec2ad970f7abbd7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 29 Oct 2012 19:06:05 -0400 Subject: [PATCH 09/90] Work on GSA-508 / CachingIndexedFastaReader should internally upper case bases loading data -- As one might expect, CachingIndexedFastaSequenceFile now internally upper cases the FASTA reference bases. This is now done by default, unless requested explicitly to preserve the original bases. -- This is really the correct place to do this for a variety of reasons. First, you don't need to work about upper casing bases throughout the code. Second, the cache is only upper cased once, no matter how often the bases are accessed, which walkers cannot optimize themselves. Finally, this uses the fastest function for this -- Picard's toUpperCase(byte[]) which is way better than String.toUpperCase() -- Added unit tests to ensure this functionality works correct. -- Removing unnecessary upper casing of bases in some core GATK tools, now that RefContext guarentees that the reference bases are all upper case. -- Added contracts to ensure this is the case. -- Remove a ton of sh*t from BaseUtils that was so old I had no idea what it was doing any longer, and didn't have any unit tests to ensure it was correct, and wasn't used anywhere in our code --- .../sting/gatk/contexts/ReferenceContext.java | 14 +- .../gatk/walkers/indels/IndelRealigner.java | 8 +- .../broadinstitute/sting/utils/BaseUtils.java | 215 +----------------- .../CachingIndexedFastaSequenceFile.java | 140 +++++++++--- ...chingIndexedFastaSequenceFileUnitTest.java | 65 +++++- 5 files changed, 185 insertions(+), 257 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index af330bba9..c8bf1e3e8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.contexts; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -39,10 +38,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; * @author hanna * @version 0.1 */ - public class ReferenceContext { - final public static boolean UPPERCASE_REFERENCE = true; - /** * Facilitates creation of new GenomeLocs. */ @@ -59,7 +55,8 @@ public class ReferenceContext { final private GenomeLoc window; /** - * The bases in the window around the current locus. If null, then bases haven't been fetched yet + * The bases in the window around the current locus. If null, then bases haven't been fetched yet. + * Bases are always upper cased */ private byte[] basesCache = null; @@ -81,7 +78,7 @@ public class ReferenceContext { * * @return */ - @Ensures("result != null") + @Ensures({"result != null", "BaseUtils.isUpperCase(result)"}) public byte[] getBases(); } @@ -146,7 +143,6 @@ public class ReferenceContext { private void fetchBasesFromProvider() { if ( basesCache == null ) { basesCache = basesProvider.getBases(); - if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache); } } @@ -176,6 +172,7 @@ public class ReferenceContext { * Get the base at the given locus. * @return The base at the given locus from the reference. */ + @Ensures("BaseUtils.isUpperCase(result)") public byte getBase() { return getBases()[(locus.getStart() - window.getStart())]; } @@ -185,7 +182,7 @@ public class ReferenceContext { * @return All bases available. If the window is of size [0,0], the array will * contain only the base at the given locus. */ - @Ensures({"result != null", "result.length > 0"}) + @Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"}) public byte[] getBases() { fetchBasesFromProvider(); return basesCache; @@ -194,6 +191,7 @@ public class ReferenceContext { /** * All the bases in the window from the current base forward to the end of the window. */ + @Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"}) public byte[] getForwardBases() { final byte[] bases = getBases(); final int mid = locus.getStart() - window.getStart(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 8f2528e23..345f79b2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; -import net.sf.picard.reference.IndexedFastaSequenceFile; +import com.google.java.contract.Requires; import net.sf.samtools.*; import net.sf.samtools.util.RuntimeIOException; import net.sf.samtools.util.SequenceUtil; @@ -276,7 +276,7 @@ public class IndelRealigner extends ReadWalker { protected String OUT_SNPS = null; // fasta reference reader to supplement the edges of the reference sequence - private IndexedFastaSequenceFile referenceReader; + private CachingIndexedFastaSequenceFile referenceReader; // the intervals input by the user private Iterator intervals = null; @@ -1603,7 +1603,8 @@ public class IndelRealigner extends ReadWalker { public List getReads() { return reads; } - public byte[] getReference(IndexedFastaSequenceFile referenceReader) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getReference(CachingIndexedFastaSequenceFile referenceReader) { // set up the reference if we haven't done so yet if ( reference == null ) { // first, pad the reference to handle deletions in narrow windows (e.g. those with only 1 read) @@ -1611,7 +1612,6 @@ public class IndelRealigner extends ReadWalker { int padRight = Math.min(loc.getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(loc.getContig()).getSequenceLength()); loc = getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(), padLeft, padRight); reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); - StringUtil.toUpperCase(reference); } return reference; diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 69920ece4..53a49d8b2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -24,33 +24,6 @@ public class BaseUtils { public final static byte[] BASES = {'A', 'C', 'G', 'T'}; public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; - public enum Base { - A('A', 0), - C('C', 1), - G('G', 2), - T('T', 3); - - byte b; - int index; - - private Base(char base, int index) { - this.b = (byte) base; - this.index = index; - } - - public byte getBase() { return b; } - - public char getBaseAsChar() { return (char) b; } - - public int getIndex() { return index; } - - public boolean sameBase(byte o) { return b == o; } - - public boolean sameBase(char o) { return b == (byte) o; } - - public boolean sameBase(int i) { return index == i; } - } - static private final int[] baseIndexMap = new int[256]; static { Arrays.fill(baseIndexMap, -1); @@ -130,6 +103,17 @@ public class BaseUtils { return false; } + public static boolean isUpperCase(final byte[] bases) { + for ( byte base : bases ) + if ( ! isUpperCase(base) ) + return false; + return true; + } + + public static boolean isUpperCase(final byte base) { + return base >= 'A' && base <= 'Z'; + } + /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -271,59 +255,6 @@ public class BaseUtils { } } - /** - * Converts a base index to a base index representing its cross-talk partner - * - * @param baseIndex 0, 1, 2, 3 - * @return 1, 0, 3, 2, or -1 if the index can't be understood - */ - static public int crossTalkPartnerIndex(int baseIndex) { - switch (baseIndex) { - case 0: - return 1; // A -> C - case 1: - return 0; // C -> A - case 2: - return 3; // G -> T - case 3: - return 2; // T -> G - default: - return -1; - } - } - - /** - * Converts a base to the base representing its cross-talk partner - * - * @param base [AaCcGgTt] - * @return C, A, T, G, or '.' if the base can't be understood - */ - @Deprecated - static public char crossTalkPartnerBase(char base) { - return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base))); - } - - /** - * Return the complement of a base index. - * - * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T) - * @return the complementary base index - */ - static public byte complementIndex(int baseIndex) { - switch (baseIndex) { - case 0: - return 3; // a -> t - case 1: - return 2; // c -> g - case 2: - return 1; // g -> c - case 3: - return 0; // t -> a - default: - return -1; // wtf? - } - } - /** * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base). * @@ -350,7 +281,7 @@ public class BaseUtils { } @Deprecated - static public char simpleComplement(char base) { + static private char simpleComplement(char base) { return (char) simpleComplement((byte) base); } @@ -370,22 +301,6 @@ public class BaseUtils { return rcbases; } - /** - * Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form) - * - * @param bases the byte array of bases - * @return the complement of the base byte array - */ - static public byte[] simpleComplement(byte[] bases) { - byte[] rcbases = new byte[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = simpleComplement(bases[i]); - } - - return rcbases; - } - /** * Reverse complement a char array of bases * @@ -403,23 +318,6 @@ public class BaseUtils { return rcbases; } - /** - * Complement a char array of bases - * - * @param bases the char array of bases - * @return the complement of the base char array - */ - @Deprecated - static public char[] simpleComplement(char[] bases) { - char[] rcbases = new char[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = simpleComplement(bases[i]); - } - - return rcbases; - } - /** * Reverse complement a String of bases. Preserves ambiguous bases. * @@ -431,17 +329,6 @@ public class BaseUtils { return new String(simpleReverseComplement(bases.getBytes())); } - /** - * Complement a String of bases. Preserves ambiguous bases. - * - * @param bases the String of bases - * @return the complement of the String - */ - @Deprecated - static public String simpleComplement(String bases) { - return new String(simpleComplement(bases.getBytes())); - } - /** * Returns the uppercased version of the bases * @@ -543,82 +430,4 @@ public class BaseUtils { return randomBaseIndex; } - - /** - * Return a random base (A, C, G, T). - * - * @return a random base (A, C, G, T) - */ - @Deprecated - static public byte getRandomBase() { - return getRandomBase('.'); - } - - /** - * Return a random base, excluding some base. - * - * @param excludeBase the base to exclude - * @return a random base, excluding the one specified (A, C, G, T) - */ - @Deprecated - static public byte getRandomBase(char excludeBase) { - return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase))); - } - - /** - * Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, - * that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p). - * The sequence does not have to contain whole number of periods. For instance, "ACACACAC" has a period - * of 2 (it has a period of 4 as well), and so does - * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is - * the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range - * or if specified minPeriod is greater than the sequence length. - * - * @param seq - * @return - */ - public static int sequencePeriod(byte[] seq, int minPeriod) { - int period = (minPeriod > seq.length ? seq.length : minPeriod); - // we assume that bases [0,period-1] repeat themselves and check this assumption - // until we find correct period - - for (int pos = period; pos < seq.length; pos++) { - - int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' - // if our current hypothesis holds, base[pos] must be the same as base[offset] - - if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) { - - // period we have been trying so far does not work. - // two possibilities: - // A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not; - // in this case only bases from start up to the current one, inclusive, may form a repeat, if at all; - // so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance - // pos will be autoincremented and we will be checking next base - // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one? - // hence we should first check if it matches the first base of the sequence, and to do that - // we set period to pos (thus trying the hypothesis that bases from start up to the current one, - // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base - // on the next loop re-entrance after pos is autoincremented) - if (offset == 0) - period = pos + 1; - else - period = pos--; - - } - } - return period; - } } - -/* code snippet for testing sequencePeriod(): - * - * String str = "CCTTG"; - int p = 0; - System.out.print("Periods of " + str +" are:"); - while ( p < str.length() ) { - p = sequencePeriod(str, p+1); - System.out.print(" "+p); - } - System.out.println(); System.exit(1); -*/ diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index db54851dd..0e8a3ea70 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -29,6 +29,7 @@ import net.sf.picard.reference.FastaSequenceIndex; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.StringUtil; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -40,6 +41,8 @@ import java.util.Arrays; * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * * Thread-safe! Uses a thread-local cache + * + * Automatically upper-cases the bases coming in, unless they the flag preserveCase is explicitly set */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); @@ -54,10 +57,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { public static final long DEFAULT_CACHE_SIZE = 1000000; /** The cache size of this CachingIndexedFastaSequenceFile */ - final long cacheSize; + private final long cacheSize; /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ - final long cacheMissBackup; + private final long cacheMissBackup; + + /** + * If true, we will preserve the case of the original base in the genome, not + */ + private final boolean preserveCase; // information about checking efficiency long cacheHits = 0; @@ -84,37 +92,17 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { /** * Same as general constructor but allows one to override the default cacheSize * - * @param fasta - * @param index - * @param cacheSize + * @param fasta the file we will read our FASTA sequence from. + * @param index the index of the fasta file, used for efficient random access + * @param cacheSize the size in bp of the cache we will use for this reader + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) { super(fasta, index); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * @param fasta The file to open. - * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. - * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. - */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { - this(fasta, index, DEFAULT_CACHE_SIZE); - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * Looks for a index file for fasta on disk - * - * @param fasta The file to open. - */ - public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { - this(fasta, DEFAULT_CACHE_SIZE); + this.preserveCase = preserveCase; } /** @@ -124,12 +112,76 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * Uses provided cacheSize instead of the default * * @param fasta The file to open. + * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ - public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException { super(fasta); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); + this.preserveCase = preserveCase; + } + +// /** +// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. +// * +// * @param fasta The file to open. +// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. +// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. +// */ +// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { +// this(fasta, index, DEFAULT_CACHE_SIZE); +// } + + /** + * Same as general constructor but allows one to override the default cacheSize + * + * By default, this CachingIndexedFastaReader converts all incoming bases to upper case + * + * @param fasta the file we will read our FASTA sequence from. + * @param index the index of the fasta file, used for efficient random access + * @param cacheSize the size in bp of the cache we will use for this reader + */ + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { + this(fasta, index, cacheSize, false); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk. + * This CachingIndexedFastaReader will convert all FASTA bases to upper cases under the hood + * + * @param fasta The file to open. + */ + public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { + this(fasta, false); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk + * + * @param fasta The file to open. + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + */ + public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException { + this(fasta, DEFAULT_CACHE_SIZE, preserveCase); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk + * Uses provided cacheSize instead of the default + * + * @param fasta The file to open. + * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 + */ + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { + this(fasta, cacheSize, false); } /** @@ -168,6 +220,25 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { return cacheSize; } + /** + * Is this CachingIndexedFastaReader keeping the original case of bases in the fasta, or is + * everything being made upper case? + * + * @return true if the bases coming from this reader are in the original case in the fasta, false if they are all upper cased + */ + public boolean isPreservingCase() { + return preserveCase; + } + + /** + * Is uppercasing bases? + * + * @return true if bases coming from this CachingIndexedFastaSequenceFile are all upper cased, false if this reader are in the original case in the fasta + */ + public boolean isUppercasingBases() { + return ! isPreservingCase(); + } + /** * Gets the subsequence of the contig in the range [start,stop] * @@ -177,8 +248,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param contig Contig whose subsequence to retrieve. * @param start inclusive, 1-based start of region. * @param stop inclusive, 1-based stop of region. - * @return The partial reference sequence associated with this range. + * @return The partial reference sequence associated with this range. If preserveCase is false, then + * all of the bases in the ReferenceSequence returned by this method will be upper cased. */ + @Override public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) { final ReferenceSequence result; final Cache myCache = cache.get(); @@ -186,6 +259,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { if ( (stop - start) >= cacheSize ) { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); + if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); } else { // todo -- potential optimization is to check if contig.name == contig, as this in generally will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); @@ -198,7 +272,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { myCache.start = Math.max(start - cacheMissBackup, 0); myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength()); myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop); - //System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop); + + // convert all of the bases in the sequence to upper case if we aren't preserving cases + if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); } else { cacheHits++; } @@ -215,8 +291,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { } } + // for debugging -- print out our efficiency if requested if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 ) printEfficiency(Priority.INFO); + return result; } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index 736162300..bcd846184 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -30,6 +30,7 @@ import java.util.concurrent.Executors; public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta"); private static final int STEP_SIZE = 1; + private final static boolean DEBUG = false; //private static final List QUERY_SIZES = Arrays.asList(1); private static final List QUERY_SIZES = Arrays.asList(1, 10, 100); @@ -53,9 +54,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { return cacheSizeRequested == -1 ? CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE : cacheSizeRequested; } - @Test(dataProvider = "fastas", enabled = true) + @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0); logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", @@ -64,6 +65,8 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException { + Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers"); + final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -92,10 +95,10 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } // Tests grabbing sequences around a middle cached value. - @Test(dataProvider = "fastas", enabled = true) + @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -123,11 +126,6 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @DataProvider(name = "ParallelFastaTest") public Object[][] createParallelFastaTest() { List params = new ArrayList(); -// for ( int nt : Arrays.asList(1, 2, 3) ) { -// for ( int cacheSize : CACHE_SIZES ) { -// params.add(new Object[]{simpleFasta, cacheSize, 10, nt}); -// } -// } for ( File fasta : Arrays.asList(simpleFasta) ) { for ( int cacheSize : CACHE_SIZES ) { @@ -143,9 +141,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } - @Test(dataProvider = "ParallelFastaTest", enabled = true, timeOut = 60000) + @Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000) public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt)); for ( int iterations = 0; iterations < 1; iterations++ ) { @@ -163,4 +161,49 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { executor.shutdownNow(); } } + + // make sure some bases are lower case and some are upper case + @Test(enabled = true) + public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException { + final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA)); + final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(new File(exampleFASTA), true); + final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(new File(exampleFASTA)); + + int nMixedCase = 0; + for ( SAMSequenceRecord contig : original.getSequenceDictionary().getSequences() ) { + nMixedCase += testCases(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1); + + final int step = 100; + for ( int lastPos = step; lastPos < contig.getSequenceLength(); lastPos += step ) { + testCases(original, casePreserving, allUpper, contig.getSequenceName(), lastPos - step, lastPos); + } + } + + Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file. Unexpected test state"); + } + + private int testCases(final IndexedFastaSequenceFile original, + final IndexedFastaSequenceFile casePreserving, + final IndexedFastaSequenceFile allUpper, + final String contig, final int start, final int stop ) { + final String orig = fetchBaseString(original, contig, start, stop); + final String keptCase = fetchBaseString(casePreserving, contig, start, stop); + final String upperCase = fetchBaseString(allUpper, contig, start, stop).toUpperCase(); + + final String origToUpper = orig.toUpperCase(); + if ( ! orig.equals(origToUpper) ) { + Assert.assertEquals(keptCase, orig, "Case preserving operation not equal to the original case for contig " + contig); + Assert.assertEquals(upperCase, origToUpper, "All upper case reader not equal to the uppercase of original case for contig " + contig); + return 1; + } else { + return 0; + } + } + + private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start, final int stop) { + if ( start == -1 ) + return new String(reader.getSequence(contig).getBases()); + else + return new String(reader.getSubsequenceAt(contig, start, stop).getBases()); + } } From 1444cd753bfcdb0084afdadae3e7c45d257d8f91 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 30 Oct 2012 16:58:55 -0400 Subject: [PATCH 10/90] Bugfix for GSA-647 HaplotypeCaller misses good variant because the active region doesn't trigger for an exome -- The logic for determining active regions was a bit broken in the HC when intervals were used in the system -- TraverseActiveRegions now uses the AllLocus view, since we always want to see all reference sites, not just those covered. Simplifies logic of TAR -- Non-overlapping intervals are always treated as separate objects for determing active / inactive state. This means that each exon will stand on its own when deciding if it should be active or inactive -- Misc. cleanup, docs of some TAR infrastructure to make it safer and easier to debug in the future. -- Committing the SingleExomeCalling script that I used to find this problem, and will continue to use in evaluating calling of a single exome with the HC -- Make sure to get all of the reads into the set of potentially active reads, even for genomic locations that themselves don't overlap the engine intervals but may have reads that overlap the regions -- Remove excessively expensive calls to check bases are upper cased in ReferenceContext -- Update md5s after a lot of manual review and discussion with Ryan --- .../haplotypecaller/HaplotypeCaller.java | 16 +- .../HaplotypeCallerIntegrationTest.java | 6 +- .../sting/gatk/contexts/ReferenceContext.java | 10 +- .../traversals/TraverseActiveRegions.java | 199 +++++++++--------- .../targets/FindCoveredIntervals.java | 2 +- .../utils/activeregion/ActiveRegion.java | 20 +- .../utils/activeregion/ActivityProfile.java | 54 ++++- .../activeregion/ActivityProfileResult.java | 52 ++++- .../activeregion/ActivityProfileUnitTest.java | 2 +- 9 files changed, 226 insertions(+), 135 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 5aba23faa..a185ba6af 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -41,7 +40,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; @@ -212,7 +214,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem private VariantAnnotatorEngine annotationEngine; // fasta reference reader to supplement the edges of the reference sequence - private IndexedFastaSequenceFile referenceReader; + private CachingIndexedFastaSequenceFile referenceReader; // reference base padding size private static final int REFERENCE_PADDING = 900; @@ -324,15 +326,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { - return new ActivityProfileResult(1.0); + return new ActivityProfileResult(ref.getLocus(), 1.0); } } if( USE_ALLELES_TRIGGER ) { - return new ActivityProfileResult( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); } - if( context == null ) { return new ActivityProfileResult(0.0); } + if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); } final List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied noCall.add(Allele.NO_CALL); @@ -369,7 +371,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); - return new ActivityProfileResult( isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() ); + return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() ); } //--------------------------------------------------------------------------------------------------------------- diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 86f3748ce..d00f5b61d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "16013a9203367c3d1c4ce1dcdc81ef4a"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "d86fae2d1b504b422b7b0cfbbdecc2c4"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -69,8 +69,8 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c306140ad28515ee06c603c225217939")); + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("f6326adfdf5bc147626b30a89ce06d56")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index c8bf1e3e8..34627b973 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -78,7 +78,7 @@ public class ReferenceContext { * * @return */ - @Ensures({"result != null", "BaseUtils.isUpperCase(result)"}) + @Ensures({"result != null"}) public byte[] getBases(); } @@ -143,6 +143,9 @@ public class ReferenceContext { private void fetchBasesFromProvider() { if ( basesCache == null ) { basesCache = basesProvider.getBases(); + + // must be an assertion that only runs when the bases are fetch to run in a reasonable amount of time + assert BaseUtils.isUpperCase(basesCache); } } @@ -172,7 +175,6 @@ public class ReferenceContext { * Get the base at the given locus. * @return The base at the given locus from the reference. */ - @Ensures("BaseUtils.isUpperCase(result)") public byte getBase() { return getBases()[(locus.getStart() - window.getStart())]; } @@ -182,7 +184,7 @@ public class ReferenceContext { * @return All bases available. If the window is of size [0,0], the array will * contain only the base at the given locus. */ - @Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"}) + @Ensures({"result != null", "result.length > 0"}) public byte[] getBases() { fetchBasesFromProvider(); return basesCache; @@ -191,7 +193,7 @@ public class ReferenceContext { /** * All the bases in the window from the current base forward to the end of the window. */ - @Ensures({"result != null", "result.length > 0", "BaseUtils.isUpperCase(result)"}) + @Ensures({"result != null", "result.length > 0"}) public byte[] getForwardBases() { final byte[] bases = getBases(); final int mid = locus.getStart() - window.getStart(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 5d38df0f5..a2c37944a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; @@ -46,99 +45,127 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - GenomeLoc location = locus.getLocation(); + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); - if(prevLoc != null) { - // fill in the active / inactive labels from the stop of the previous location to the start of this location - // TODO refactor to separate function - for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) { - final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii); - if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) { - profile.add(fakeLoc, new ActivityProfileResult( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 )); - } - } + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + // TODO -- this whole HashSet logic should be changed to a linked list of reads with + // TODO -- subsequent pass over them to find the ones overlapping the active regions + for( final PileupElement p : locus.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); } - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - if( initialIntervals == null || initialIntervals.overlaps( location ) ) { - profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location)); - } - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); - } - - // If this is the last pileup for this shard calculate the minimum alignment start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); - } - - prevLoc = location; - - printProgress(locus.getLocation()); + // If this is the last pileup for this shard calculate the minimum alignment start so that we know + // which active regions in the work queue are now safe to process + minStart = Math.min(minStart, read.getAlignmentStart()); } - updateCumulativeMetrics(dataProvider.getShard()); + // skip this location -- it's not part of our engine intervals + // TODO -- this is dangerously slow with current overlaps implementation : GSA-649 / GenomeLocSortedSet.overlaps is crazy slow + if ( outsideEngineIntervals(location) ) + continue; - // Take the individual isActive calls and integrate them into contiguous active regions and - // add these blocks of work to the work queue - // band-pass filter the list of isActive probabilities and turn into active regions - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); - } - } - workQueue.addAll( activeRegions ); + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); } - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + dataProvider.getShard().getReadMetrics().incrementNumIterations(); - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); } + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + return sum; } + /** + * Is the loc outside of the intervals being requested for processing by the GATK? + * @param loc + * @return + */ + private boolean outsideEngineIntervals(final GenomeLoc loc) { + return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); + } + + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + * + * @param profile + * @param activeRegions + * @param activeRegionExtension + * @param maxRegionSize + * @return + */ + private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, + final List activeRegions, + final int activeRegionExtension, + final int maxRegionSize) { + if ( profile.isEmpty() ) + throw new IllegalStateException("trying to incorporate an empty active profile " + profile); + + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); + return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); + } + // -------------------------------------------------------------------------------- // @@ -150,7 +177,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker, final LocusShardDataProvider dataProvider ) { - final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } - /** * Special function called in LinearMicroScheduler to empty out the work queue. * Ugly for now but will be cleaned up when we push this functionality more into the engine diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java index e17c6cdb7..85b7159e8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -57,7 +57,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker { int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup()); // note the linear probability scale - return new ActivityProfileResult(Math.min(depth / coverageThreshold, 1)); + return new ActivityProfileResult(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index decc54d47..0d12d53cc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -1,11 +1,11 @@ package org.broadinstitute.sting.utils.activeregion; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.util.StringUtil; +import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; @@ -54,27 +54,31 @@ public class ActiveRegion implements HasGenomeLocation { public ArrayList getReads() { return reads; } - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) { return getActiveRegionReference(referenceReader, 0); } - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { return getReference( referenceReader, padding, extendedLoc ); } - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) { return getFullReference(referenceReader, 0); } - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { return getReference( referenceReader, padding, fullExtentReferenceLoc ); } - private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + @Requires("referenceReader.isUppercasingBases()") + private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), Math.max(1, genomeLoc.getStart() - padding), Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); - StringUtil.toUpperCase(reference); return reference; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 73f3cc487..e96eb843d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -24,11 +24,11 @@ package org.broadinstitute.sting.utils.activeregion; +import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Collections; @@ -45,6 +45,7 @@ public class ActivityProfile { final GenomeLocParser parser; final boolean presetRegions; GenomeLoc regionStartLoc = null; + GenomeLoc regionStopLoc = null; final List isActiveList; private static final int FILTER_SIZE = 80; private static final double[] GaussianKernel; @@ -71,19 +72,49 @@ public class ActivityProfile { this.regionStartLoc = regionStartLoc; } - public void add(final GenomeLoc loc, final ActivityProfileResult result) { - if ( loc.size() != 1 ) - throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" ); - isActiveList.add(result); - if( regionStartLoc == null ) { + @Override + public String toString() { + return "ActivityProfile{" + + "start=" + regionStartLoc + + ", stop=" + regionStopLoc + + '}'; + } + + /** + * Add the next ActivityProfileResult to this profile. + * + * Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown + * + * @param result a well-formed ActivityProfileResult result to incorporate into this profile + */ + @Requires("result != null") + public void add(final ActivityProfileResult result) { + final GenomeLoc loc = result.getLoc(); + + if ( regionStartLoc == null ) { regionStartLoc = loc; + regionStopLoc = loc; + } else { + if ( regionStopLoc.getStart() != loc.getStart() - 1 ) + throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc ); + regionStopLoc = loc; } + + isActiveList.add(result); } public int size() { return isActiveList.size(); } + public boolean isEmpty() { + return isActiveList.isEmpty(); + } + + public boolean hasPresetRegions() { + return presetRegions; + } + /** * Band pass this ActivityProfile, producing a new profile that's band pass filtered * @return a new ActivityProfile that's the band-pass filtered version of this profile @@ -104,14 +135,21 @@ public class ActivityProfile { } iii++; } - final double[] filteredProbArray = new double[activeProbArray.length]; + + final double[] filteredProbArray; if( !presetRegions ) { + // if we aren't using preset regions, actually apply the band pass filter for activeProbArray into filteredProbArray + filteredProbArray = new double[activeProbArray.length]; for( iii = 0; iii < activeProbArray.length; iii++ ) { final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); } + } else { + // otherwise we simply use the activeProbArray directly + filteredProbArray = activeProbArray; } + iii = 0; for( final double prob : filteredProbArray ) { final ActivityProfileResult result = isActiveList.get(iii++); @@ -119,6 +157,7 @@ public class ActivityProfile { result.resultState = ActivityProfileResult.ActivityProfileResultState.NONE; result.resultValue = null; } + return new ActivityProfile(parser, presetRegions, isActiveList, regionStartLoc); } @@ -166,6 +205,7 @@ public class ActivityProfile { private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); } + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { if( !isActive || curEnd - curStart < maxRegionSize ) { final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java index 8dc29aa3c..273c2e785 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java @@ -1,12 +1,16 @@ package org.broadinstitute.sting.utils.activeregion; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.GenomeLoc; + /** * Created with IntelliJ IDEA. * User: rpoplin * Date: 7/27/12 */ - public class ActivityProfileResult { + private GenomeLoc loc; public double isActiveProb; public ActivityProfileResultState resultState; public Number resultValue; @@ -16,16 +20,52 @@ public class ActivityProfileResult { HIGH_QUALITY_SOFT_CLIPS } - public ActivityProfileResult( final double isActiveProb ) { - this.isActiveProb = isActiveProb; - this.resultState = ActivityProfileResultState.NONE; - this.resultValue = null; + /** + * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb + * + * @param loc the position of the result profile (for debugging purposes) + * @param isActiveProb the probability of being active (between 0 and 1) + */ + @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) + public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb ) { + this(loc, isActiveProb, ActivityProfileResultState.NONE, null); } - public ActivityProfileResult( final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) { + /** + * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb that maintains some + * information about the result state and value (TODO RYAN -- what do these mean?) + * + * @param loc the position of the result profile (for debugging purposes) + * @param isActiveProb the probability of being active (between 0 and 1) + */ + @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) + public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) { + // make sure the location of that activity profile is 1 + if ( loc.size() != 1 ) + throw new IllegalArgumentException("Location for an ActivityProfileResult must have to size 1 bp but saw " + loc); + + this.loc = loc; this.isActiveProb = isActiveProb; this.resultState = resultState; this.resultValue = resultValue; } + /** + * Get the genome loc associated with the ActivityProfileResult + * @return the location of this result + */ + @Ensures("result != null") + public GenomeLoc getLoc() { + return loc; + } + + @Override + public String toString() { + return "ActivityProfileResult{" + + "loc=" + loc + + ", isActiveProb=" + isActiveProb + + ", resultState=" + resultState + + ", resultValue=" + resultValue + + '}'; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index f7c564c74..57dd19888 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -123,7 +123,7 @@ public class ActivityProfileUnitTest extends BaseTest { for ( int i = 0; i < cfg.probs.size(); i++ ) { double p = cfg.probs.get(i); GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i); - profile.add(loc, new ActivityProfileResult(p)); + profile.add(new ActivityProfileResult(loc, p)); } Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() )); From 872abddfcec844f2217fafb3ac3f4451b9cc844b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 31 Oct 2012 19:52:25 -0400 Subject: [PATCH 12/90] Add custom TestNGTestTransformer that adds a maximum test runtime of 10 minutes to all testng tests -- Closes GSA-494 / Add maximum runtime for integration tests, running them in timeout thread -- Needed to debug locking issues -- Needed to debug excessively long running integrationtests -- Added build.xml maximum runtime for all testng tests of 10 hours. We will ultimately fail the build if it goes on for more than 10 hours --- build.xml | 4 +- .../sting/TestNGTestTransformer.java | 37 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java diff --git a/build.xml b/build.xml index c6b1afc56..7702be7e4 100644 --- a/build.xml +++ b/build.xml @@ -1174,14 +1174,16 @@ + + listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter"> diff --git a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java new file mode 100644 index 000000000..6a1a37de9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java @@ -0,0 +1,37 @@ +package org.broadinstitute.sting; + +import org.apache.log4j.Logger; +import org.testng.IAnnotationTransformer; +import org.testng.annotations.ITestAnnotation; + +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +/** + * Provide default @Test values for GATK testng tests. + * + * Currently only sets the maximum runtime to 10 minutes, if it's not been specified. + * + * See http://beust.com/weblog/2006/10/18/annotation-transformers-in-java/ + * + * @author depristo + * @since 10/31/12 + * @version 0.1 + */ +public class TestNGTestTransformer implements IAnnotationTransformer { + public static final long DEFAULT_TIMEOUT = 1000 * 60 * 10; // 10 minutes max per test + + final static Logger logger = Logger.getLogger(TestNGTestTransformer.class); + + public void transform(ITestAnnotation annotation, + Class testClass, + Constructor testConstructor, + Method testMethod) + { + if ( annotation.getTimeOut() == 0 ) { + logger.warn("test " + testMethod.toString() + " has no specified timeout, adding default timeout " + DEFAULT_TIMEOUT / 1000 / 60 + " minutes"); + annotation.setTimeOut(DEFAULT_TIMEOUT); + } + } +} + From 386b45e94db43e5e68b2b0fec3cac8fd97d1d6fc Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 1 Nov 2012 15:44:41 -0400 Subject: [PATCH 13/90] This VE eval module isn't useful anymore. --- .../evaluators/VariantQualityScore.java | 249 ------------------ 1 file changed, 249 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java deleted file mode 100755 index 347ca56b8..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.ArrayList; -import java.util.HashMap; - -/** - * @author rpoplin - * @since Apr 6, 2010 - */ - -//@Analysis(name = "Variant Quality Score", description = "Shows various stats of sets of variants binned by variant quality score") -@Deprecated -public class VariantQualityScore { - // TODO - this should really be a stratification - -// public class VariantQualityScore extends VariantEvaluator { -// -// // a mapping from quality score histogram bin to Ti/Tv ratio -// @DataPoint(description = "the Ti/Tv ratio broken out by variant quality") -// TiTvStats titvStats = null; -// -// @DataPoint(description = "average variant quality for each allele count") -// AlleleCountStats alleleCountStats = null; -// -// static class TiTvStats extends TableType { -// final static int NUM_BINS = 20; -// final HashMap> qualByIsTransition = new HashMap>(); // A hashMap holds all the qualities until we are able to bin them appropriately -// final long transitionByQuality[] = new long[NUM_BINS]; -// final long transversionByQuality[] = new long[NUM_BINS]; -// final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out -// -// public Object[] getRowKeys() { -// return new String[]{"sample"}; -// } -// -// public Object[] getColumnKeys() { -// final String columnKeys[] = new String[NUM_BINS]; -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// columnKeys[iii] = "titvBin" + iii; -// } -// return columnKeys; -// } -// -// public String getCell(int x, int y) { -// return String.valueOf(titvByQuality[y]); -// } -// -// public String toString() { -// StringBuffer returnString = new StringBuffer(); -// // output the ti/tv array -// returnString.append("titvByQuality: "); -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// returnString.append(titvByQuality[iii]); -// returnString.append(" "); -// } -// return returnString.toString(); -// } -// -// public void incrValue( final double qual, final boolean isTransition ) { -// final Integer qualKey = Math.round((float) qual); -// final long numTransition = (isTransition ? 1L : 0L); -// final long numTransversion = (isTransition ? 0L : 1L); -// if( qualByIsTransition.containsKey(qualKey) ) { -// Pair transitionPair = qualByIsTransition.get(qualKey); -// transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion); -// qualByIsTransition.put(qualKey, transitionPair); -// } else { -// qualByIsTransition.put(qualKey, new Pair(numTransition,numTransversion)); -// } -// } -// -// public void organizeTiTvTables() { -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// transitionByQuality[iii] = 0L; -// transversionByQuality[iii] = 0L; -// titvByQuality[iii] = 0.0; -// } -// -// int maxQual = 0; -// -// // Calculate the maximum quality score in order to normalize and histogram -// for( final Integer qual : qualByIsTransition.keySet() ) { -// if( qual > maxQual ) { -// maxQual = qual; -// } -// } -// -// final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); -// -// for( final Integer qual : qualByIsTransition.keySet() ) { -// final int index = (int)Math.floor( ((double) qual) / binSize ); -// if( index >= 0 ) { // BUGBUG: why is there overflow here? -// Pair transitionPair = qualByIsTransition.get(qual); -// transitionByQuality[index] += transitionPair.getFirst(); -// transversionByQuality[index] += transitionPair.getSecond(); -// } -// } -// -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio -// titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]); -// } else { -// titvByQuality[iii] = 0.0; -// } -// } -// -// } -// } -// -// class AlleleCountStats extends TableType { -// final HashMap> qualityListMap = new HashMap>(); -// final HashMap qualityMap = new HashMap(); -// -// public Object[] getRowKeys() { -// final int NUM_BINS = qualityListMap.keySet().size(); -// final String rowKeys[] = new String[NUM_BINS]; -// int iii = 0; -// for( final Integer key : qualityListMap.keySet() ) { -// rowKeys[iii] = "AC" + key; -// iii++; -// } -// return rowKeys; -// -// } -// -// public Object[] getColumnKeys() { -// return new String[]{"alleleCount","avgQual"}; -// } -// -// public String getCell(int x, int y) { -// int iii = 0; -// for( final Integer key : qualityListMap.keySet() ) { -// if(iii == x) { -// if(y == 0) { return String.valueOf(key); } -// else { return String.valueOf(qualityMap.get(key)); } -// } -// iii++; -// } -// return null; -// } -// -// public String toString() { -// String returnString = ""; -// // output the quality map -// returnString += "AlleleCountStats: "; -// //for( int iii = 0; iii < NUM_BINS; iii++ ) { -// // returnString += titvByQuality[iii] + " "; -// //} -// return returnString; -// } -// -// public void incrValue( final double qual, final int alleleCount ) { -// ArrayList list = qualityListMap.get(alleleCount); -// if(list==null) { list = new ArrayList(); } -// list.add(qual); -// qualityListMap.put(alleleCount, list); -// } -// -// public void organizeAlleleCountTables() { -// for( final Integer key : qualityListMap.keySet() ) { -// final ArrayList list = qualityListMap.get(key); -// double meanQual = 0.0; -// final double numQuals = (double)list.size(); -// for( Double qual : list ) { -// meanQual += qual / numQuals; -// } -// qualityMap.put(key, meanQual); -// } -// } -// } -// -// //public VariantQualityScore(VariantEvalWalker parent) { -// //super(parent); -// //} -// -// public String getName() { -// return "VariantQualityScore"; -// } -// -// public int getComparisonOrder() { -// return 1; // we only need to see each eval track -// } -// -// public String toString() { -// return getName(); -// } -// -// public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { -// final String interesting = null; -// -// if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) -// if( titvStats == null ) { titvStats = new TiTvStats(); } -// titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); -// -// if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); } -// int alternateAlleleCount = 0; -// for (final Allele a : eval.getAlternateAlleles()) { -// alternateAlleleCount += eval.getCalledChrCount(a); -// } -// alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount); -// } -// -// return interesting; // This module doesn't capture any interesting sites, so return null -// } -// -// public void finalizeEvaluation() { -// if( titvStats != null ) { -// titvStats.organizeTiTvTables(); -// } -// if( alleleCountStats != null ) { -// alleleCountStats.organizeAlleleCountTables(); -// } -// } -} \ No newline at end of file From 6185e8c43255d7213f4d4c464c71ff3b5e1a09d4 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 1 Nov 2012 17:48:58 -0400 Subject: [PATCH 14/90] Allow large-scale tests 5 hours each to run --- .../walkers/genotyper/UnifiedGenotyperLargeScaleTest.java | 6 +++--- .../gatk/walkers/indels/IndelRealignerLargeScaleTest.java | 4 ++-- .../indels/RealignerTargetCreatorLargeScaleTest.java | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java index 109088875..c5a5dcc21 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java @@ -7,7 +7,7 @@ import java.util.ArrayList; public class UnifiedGenotyperLargeScaleTest extends WalkerTest { - @Test + @Test( timeOut = 18000000 ) public void testUnifiedGenotyperWholeGenome() { WalkerTestSpec spec = new WalkerTestSpec( "-R " + hg18Reference + @@ -22,7 +22,7 @@ public class UnifiedGenotyperLargeScaleTest extends WalkerTest { executeTest("testUnifiedGenotyperWholeGenome", spec); } - @Test + @Test( timeOut = 18000000 ) public void testUnifiedGenotyperWholeExome() { WalkerTestSpec spec = new WalkerTestSpec( "-R " + hg18Reference + @@ -37,7 +37,7 @@ public class UnifiedGenotyperLargeScaleTest extends WalkerTest { executeTest("testUnifiedGenotyperWholeExome", spec); } - @Test + @Test( timeOut = 18000000 ) public void testUnifiedGenotyperWGParallel() { WalkerTestSpec spec = new WalkerTestSpec( "-R " + hg18Reference + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java index 4526fc0d7..2dd5a66fd 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java @@ -6,7 +6,7 @@ import org.testng.annotations.Test; import java.util.ArrayList; public class IndelRealignerLargeScaleTest extends WalkerTest { - @Test + @Test( timeOut = 18000000 ) public void testHighCoverage() { WalkerTestSpec spec = new WalkerTestSpec( @@ -21,7 +21,7 @@ public class IndelRealignerLargeScaleTest extends WalkerTest { executeTest("testIndelRealignerHighCoverage", spec); } - @Test + @Test( timeOut = 18000000 ) public void testRealigner() { WalkerTestSpec spec1 = new WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java index 3203ee100..e32afd06b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java @@ -6,7 +6,7 @@ import org.testng.annotations.Test; import java.util.ArrayList; public class RealignerTargetCreatorLargeScaleTest extends WalkerTest { - @Test + @Test( timeOut = 18000000 ) public void testRealignerTargetCreator() { WalkerTestSpec spec1 = new WalkerTestSpec( From f8a0a947e34cc7f3874e6adf43175cda5b2d7e26 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Nov 2012 09:09:32 -0400 Subject: [PATCH 15/90] Critical bugfix for GSA-652 / Multi-threaded VCF -> BCF writing produces invalid intermediate file that fails on merging -- New tribble library now uses 64 bit sizes. The 26K VCF has so much data that low-level tribble block indices where overflowing their int size values. This includes a to-be-committed tribble jar that fixes this problem -- See https://jira.broadinstitute.org/browse/GSA-652 -- Minor cleanup of error messages that were useful on the way to solving this monster problem --- .../utils/codecs/vcf/AbstractVCFCodec.java | 2 +- .../writer/BCF2FieldWriterManager.java | 1 - .../{tribble-110.jar => tribble-119.jar} | Bin 313966 -> 319935 bytes .../{tribble-110.xml => tribble-119.xml} | 2 +- 4 files changed, 2 insertions(+), 3 deletions(-) rename settings/repository/org.broad/{tribble-110.jar => tribble-119.jar} (78%) rename settings/repository/org.broad/{tribble-110.xml => tribble-119.xml} (79%) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 043e5e185..652f7f96f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -587,7 +587,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); if ( nParts != genotypeParts.length ) - generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo); ArrayList genotypes = new ArrayList(nParts); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java index 7b8224568..9c63a69e7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java @@ -76,7 +76,6 @@ public class BCF2FieldWriterManager { if ( map.containsKey(field) ) throw new ReviewedStingException("BUG: field " + field + " already seen in VCFHeader while building BCF2 field encoders"); map.put(field, writer); - if ( logger.isDebugEnabled() ) logger.debug(writer); } // ----------------------------------------------------------------- diff --git a/settings/repository/org.broad/tribble-110.jar b/settings/repository/org.broad/tribble-119.jar similarity index 78% rename from settings/repository/org.broad/tribble-110.jar rename to settings/repository/org.broad/tribble-119.jar index f8e312ad94d7c96eba2e9093a65a2470204911fb..ab456938aba23540eb8aedbac454841f220c6bba 100644 GIT binary patch delta 34968 zcmcG%2Y6J~);GTPnUkEE$w^2iJ){y6QXvHhgqF}dp%V~EfB>Ne2nep)Q@D1N96nk(W&H@Wv;qIbT|U)-@S+U&~?g_m!EVr8=t!l0yF-VZTYHroMs`<5x1v#cy&8?nSjXntq zTvWAq$=q5&!#pR`Q{>bkBE_>u3--7Z({0I=;t`)kdMe_g(@jdIB__pF0$^TMYtT}Y zuAyZnB~p?}eWyLnA$9 zQLZSHN@xg2xzS9ANyBM`AoAQ35eXy+^9)ML_AI`=lgG0xTrNnlMP=ABGRFyu{x9=@ znx`$8S5rN=N{)1C2{P{?5fI#BrsTe-v;m%V&qd1l*NYCGX(Nk0XOiOFalkbKO1QX( z281s)`0A5v|Uil+-D31bpK z=@d%Y6iXebEn=c92$Y6+{uf8>D8o&HGFv2OH6~^wEeBs3fkJ^GU6a~m><60X#kV_p=lEJn^>3p<~dtxHS%Edh?!Y`6U z1EOsPbup-`LB%8t>PFp>q;vm!;MWtqv#Fu}i{6F(Z+i!@fj~2pO3{Vh0N>}z&bj}e z_D*%qn!;WPm!KWi6wbl$@?b{08I>@Nu3<+gyqu>p!r`UJy-2gLiOChwAac+U)%Iko zT;n7}`zFCbA?RU3i%Ez^E}jH6(yA>kUa-D4jKEuSBiWJl6zSRZ6xCE~AaWCpXok0r z#u^>I8k;7Y(^}TU=-Ll3+5pN>t_SbHEYuNpMICOM*sPMk#!3PeMp_EL;m3(cyq25%NvTy?X-3u&w+gxmvYAVAuICs(2{XX6U3`5UBj z#oQlbh`bpcTSdL_-H&cljb<7Uh?p`OiDGC)WAg{lDAi?-8?9Ep4QMq6AzL#V z!1=jHOap?+c9{aP_C=T909Nyj;@D&whg6Nmw-hF7`Ya6n83X#W=Gk{P#`9oXQ~qa- zXmwz8>VGs#fEb>>T?DfL@x*ufW6uG`bF%VbE!I<}8=i!WHd-BIcd0O`k^9&gpDnIxU zP=4u!`8H z{NgL3Py5X(Qa1YUn~H(f{Of@t>7LM85%T+o^}quoX7u&tmesfWLT%^u@P(>sG+$`W z;+aM;OLOo1F!g+B-pT^PHHm0uB{&Tu1)w#E;I5bmVSm zXmDcox^NWeb~l5%ADDmLav^$nzPkSI?%~waq*fGaP%mcYr3Ur(2?YJ9)TA(SnOLa^ zfsYzAzy!(+;*V5=27AsQ=;!(6hPU;>gEI%qu=%#g^;=q}BQY2)WP~7w%&lNJc=9q2 zrwF7(dNSPEW25k`IG9*Lx>!jLaTA4!n<-YTqG+U9bTW=&6n|%K%kTtJ;uNRXz~^~2 z2ylCW+|21CJv-ewZicMrpSVpyvX*CSAEY)&1^$Nj7gqpCWdKNg9EEvt6@sb;;-}S< zQB|Zz>5dH#P=NMC!YxsH1UU{-5dPoRS!KzNj|{9Q^9dBgifroIT@)hjrbw}lqJ@Xr zi4Cej%+vtCv1^4`t!b)jg%B;;slh9x41{bL%rvY>7D?KH&Sc>ks0bE1c!{D}*zpg7 z22q6vJ|hPs#PSb>R%igzs5C zt>-uoKQ{1VBR@6;k)(S$wAqQr7Jl5vpZ5pR1N0z=wi>j}poa|F?nHy{veU!-uCvo4 zoaIse*kPxg{IQEa9&=Kdn;s9MCulc6o-}BW6SKLOAN!oNpAOh4an%Zi%K`uX7^89T zrWH4cL`(z6CV=Y;rk%An`%beeO# z!1cdq&`a4)$V+Dpdf86roEWWFxY}pz^ePv4oI82mpw|p~-Jmx-3sVygTzgZtEf^7R z8uXSyZzIvO|CR&#JLtQodi5B{NP9d-SI^d;LuH;}_b1BY-6GaA_14p4$I>{H#?vyC zHR)aIXV7~ly-$BLL4n7ZAit2W!OTp^3IhSzq}6n*$+X)iuHV2i`(PSk&^so5Kp*nj z>?c3&AUesKfqIPR%x&HCKcWi|#*0n5NFST@5q*NMT$4WKug|#1c>3I+FHD+8^G#Yn zw|nyITd!Xo5NFbt^p#0p(^m%l-K2leKMnfEq;Kgv&%)d9&KXbNbJ-u1lyA~4+>{>~ z#J}h#gZ^#O&-5P*h)KWDubzE#(gIBSjeeIEH;cBOYu7~EFEQ|s=&~pOn(m$p#nHC$ z^oQrWH3RVe-P3h#s%-()$d5N_k)9L#GY>qlHc^X)x-eK!=*+5GXkLqI%N8uIW(~_z zd1KcDL3jOKxJ!Yv0^*PZIFG?oxQ`LRo4MLR^~O-*t?{rS?@$#UhHvK39M7E5A3qA; zmJ~ zAPV6zyMbjM6`$LbpdkUdXb#FUX$~m|h1%6YlS_K|+P@Sap|f(en2B`IXB!H*L7knaX{LG~X8m?EP2#x2 zlql!~B@KnZn(Wx{AoWIU06tRcsf{a@VcCNlDG$9Ql>al}b4Lw2y<*vw{&^&t`--l8m zkX>T1-g#tpZc6IMtC}fGENlS0B)RWtf2x~f{C&5REo2bZToCet^`LCIr_>qlZtBOP zX8q9q1dC%BmPKW63oT{-;_FW--rRzKta>W3G^|FjU@}c4DBw?E%Q%YR+I*1L&{}+H zbO&9HcO2qN!R{v`oQd}#ki{;*s59hfDfYB-ZS6P&g6NO~-^x0pby~kgixwAaa;W_tAAIGy;yU$@-PI(TU87^M%7-wfX zj$s0iL#{pQx#nz+(tT2XFTW~098@tJ8dRbC0|!hw%bZvj%MH4gMgHrYh`HW@m>cYL zqZ3%af>3PTD{lIkd^3agyU9|7~W`FOo&S&2&FM9&plww3Xww@#7(eyWL>f??I~Lm`Awe zqjuWC@12~y%b>>$dfcEVJjYHa9XS2+co7&_yI@Y$d>J-P#L8a%^-xjN%z%o4mC3tw zr|9Ns{mONogRhKopa!(tAbBA5)m&i!9@gq^&zSQy%}uZJ#CZ|qIo}Dt^VRkzS>NeI z-94qRrB!vJJtpm?8WVzeM_3=Ps=YoVIXly!eI`quCX1P@4V9QwPhJyTRLgd zvjq9@lu6IiX_H={7g18@+(kmQ#mCfYmO?r)9H|Y(QAKzjD z@hY0^xo25;GS>^4@Lfxw^a|43Ppz76(%-mAy{N>q^7WH#{%;JgU0WDlVT=AhwWfFT zl)TZoZGf`9!-SM!S>5MW&Y#(#&#cNtqpKD!shU5ns@tGUS^0w&F3)~q>v|yc&3xhU zM5KF4-&!j&Kj?PZL5cmvn-uM<}aAXT&pIv1jck;U$eZ zvWE4~>JQV;C`by?)DKbw3%Y(l+d%ScDBcSV4T9FWg#yIhY-KmzLXp`n@Z^v!WM;cU5!g&Y%4|1Y z>;?C5WJ8;EDJxsJMTU8-EMrn3#;ha8sR+xo3#`OlG4kErc=b?COSbB4Y+5g0%esP^ zS)y)#fPz)jj5Abo2Uz{qN)YKl(%0F?dc+58SBQ@U>!we2Nt-ZT`~0(TWrJz#v1h$7jM*2T(*HSt-ByD z8gcOu=s10EyuA)$iyh{*AbcYQ17#0Q-a(F7!&Xm;o$ZnK(A;el5*xUmk{|-^pajHZ z`(hv}BE}uWscjIB4+XsN>`;4wRr?`It*5r}p+R*NmId1#S_Bh7nwQd1H@ZHaWRzd#>40--}dbVjLuwg%}MtwgmXFyk=@1)$s>rCJ3-Xd;Mb<|T59KiIf5 zUcj7@X3&Ij*fxX&7K^BUh;oOEuwTiD3}?XxW8+`;K5MOwEiTQM(VeQ3)^gMu+o}0* zl|_`v$Pa_2cpZ4lma4~dLIayHF(4Z+U{0t5f%XxdjY#7twJWw;FGo(Ovmn+U znsbOU#^;0v6~sEMaI6k2@?7_RJ9k7yXEV}_wMPaqqzZj{=iuB(Gcq_bATlu0Sx=cN z>!ajY=AdrbqC(1HrBI7_3=5>###1Q+r5Eyuxn*%1uEi82aZxsz4}+ z2cmJf9#<2`boKE@E%tFB<2Xz-#|=k09S|P?Zt}E7qnXnY3PeNKlDi5)1MOV}r0oZ6 zhk!ye5pn>)SSY6*flEC9`4rfInw-#Rh0nAZEF`;hA$KUhHXx7i{NYCcsQ$K5&((E965t94u7MoQgCrnmwx|4(*p|ZWfngsW({vVv545@%9%O~m z|8YmK6{`PbyG>qTFIdTLOr;+>^#jPF9k1_lhYo~FXYrF$y zoN|6laW7SaB?84* zHSoq?jIRn8D&wtm4T345pkN&uvt-pOPJwa-AmQ`uW1+6Cs!kSj9QEYXD|$ezvBEtQ z;rO0S0foFLS+{g7FF7NDgF#;{7m4_R>+nZ z>qaOPp(Kt9^S6qUWA&y%90&D}K=}>Odk2JzNC*~5U>{kqd36ElFM(VzT=9f)pkU*H z!Q~KBCSnU>61dz|^f0*felV?*U{`0rsNMh*`UI>d8&KWMc`&Ap^ZmJ+0)a}-19)3Z zMiQL{1#XK8=|Z~|_14t{6;XiWGE9V`1V*zi6o73pgL*LIfy|E0um+TE5eX|>FM~=z zX$ve2M8z^ZN<=h6!dCwU2}>CXRY|6?jD#(ihC(?FBjQKq{|<*;H8e0EUO+yBcPRC5 zNt3-tW5nyN6kSEW#(!~&uCCjyJLItp;jqZ^*$h!0Wd$>OiKQ-8>in9G*Yt&y$G;p^ z>f%6QmkDEq>3w&k7-Luk^2)_mzF>T%*l7`Z`C_%`toC%b4*p?SA{JVrXIpiwyf|B= z%B!E$1LcW%!YzCD(jvTw+h}d#dY8`=)2wYzd9g-xvMPvOC6N;1ud5S&mf1x z5sW_;I+-%_qt*%HIcOS3F5A1sqw;&V80vAn8e$Vv+Kh>OzgjqC#Yzz>i&uz! zZ7qWe9MS--J$81L%UVBp+o7KSi3KID6pwVwXTAV&hglFah&2e5n85u!W)gQWH(CrH zb0^(pg4(v{yglUkIigrj+9_hN`-5$1+4m+fwga{lO_=5Gfs2M^Wb2X4`~*BEpFGS@ z5ZcHVCTT*z-OTn3IF;qoW`nkvbRXSs^7f)ZTlr&~3CVW53A#{c&?8*>QG<4H%1#s7 z&totunS|X$2JoZ_Wuw6J(9$H??`9EW%cs3wSXq9s#byIBcnz^zR(lmfzwi$9l)N1+ z--{GxHd`7jW|R!#l>*Y|@{?u5g6V z3h)T9kq9_49akE36j;SDs5~)8D7GB-`#4#18>nCFSUoZ@JCd0h^NLZc0>r97s4|}^ z_GK$jI(`DU1=M<|+)0NhS#dkwhEZBO2XdiFGas{HFyFF5vAUPq?j43rgtQ}+-q_Z5 zO)YeziD*p5an6Ljo4?HD&!Z;d2y|nR2|7X-vD9qf7_5|s;@Jy>Q-T5PgIVnhlSe@{Kwj`FPGd$k0&Bettf#mcrUr4BEwQP!M|>EaKaz1qWg6wf z+7|ptA)}@Ss#&SDk*ZC+Vah9M42Fc?`QwU!l|M6kSTpC0IyQb3-3!h6GG;%HR6-!ZmM)bYs?ss_r=9tI?ZC8z^7 zrvwepJxuw$`S-l6JSmdJL-MARB0C`|)_#}@cq4FLu_G31Y`$Z|7UE5 zzaC>b1oZ4*2(e@2!}JsV8?&DYL(I>ZTOH!aSyh6?&QGArOT0PsAMBO0-`e~1&p7%9 zgZCVwc}-hDezkzeub&l#qK!A}+riv;=GNWH(jWtR=k$E!3`zWKbED{YTy z-9wxUYG+j~>OFVCG^jZp<+{@%KG;Q_mB$b}2ZhQfKM?KY#nYmn9X>{EIP5BOUl6Hw z5I_?otGmy+KrVYhJll#F8Do$O`@Di8pdx1Rkl%^|-7P%Ew-sc`ECB|86-tu4HOxrkp;w~7c@*W$P?r0|v@)`;3k(||j+%H8;~ z9DXdkPHYCj>8ljZ;L6m%5LRLIxIx{LP+Z$^KNB1sR4D-H7%0nL5~=QkWV38IP2EaE zm(sDqGBB-Kss@W}cWdlcH<({T(Y7SOMwgg?CpR{mJ%+j20o>#(Tb$Rd#cYI$<#7_= zR&DU)Cf|)tbCV0Dc}6793}+xY_+b?oI@AnhJ+mnp-I@fp+jK%}IwP(~LANHMu(7p; zpgyd|MWWSdm>3qadE9jaJ|4*irI!d<@~{?RE%nFF_?P;P2ljvUJlwPSmv1fgvU8%1 zyPp&0!o3by4UNA8_{>$?iTQ+1+O(yZel!qe0-1mb;Ig zPBM&Vomfq$*a66L__5IoFky1aOLjWLiD%*RgHsA_u5JvSGuZJbkv)$R+4U!p&O4xj zycPuW=^OCZaiY&iYs+bEt+cne+}loihu#gM_vn3o{Ech)fD3&X1aFrMLEsS|^WzhP zKIKM!#%Z6!0mn{XIx*5;K}G+XJ&^Wt!rz17cXPp@e+Getf5Z8|W%%DYnf7olH059L zwsC@o?6uR+{QeJ*-!B~cl^?&cBNtqFerKZxT0Dd<@#8W-{@_Q0fjukO*vM8rwGsiF zycny+%J#2{r9$^U@~T*&>j8rH%axg08@W!`GUWI-L@Sx_jwqLZds9?6Q5^pR4Pkiu zy(LbF0QwqD_B!7Y|3vtA^i=--u6QqKwy>K}-KH3jN>9FE>i`$S3f{|tUB!eQ8;=<@ zlYO7Yo3II1o7kbh$z&^y$wbnmM_I6g&xawLctZsl_P&Uz2ok0tf=vOtkSRh%n4qZs z{YuA-9o28j(0;=Qj2Sp(^q9U=`t}=V3YRc}r^EV{4xi%BGOl#gpwi)e`k5k}6NCNn zeMXEOK8Artc-9Sz2rxw?jhEj~vqj3hF|gvCIIJ7dCahyPMuEL_Q^5RS2xNuU(=qy*(*dje86T{@*Iz3Si-6h(}x@nm0V;_oQSCQyqimvb#6Gft% zDY}aun?4dzuAZWo3B5Sm6s4lKDf)<{`W{AGB;#Zt+xoFec zd3Sy!KGS5#XwC4fniVBIpNJ)%19Q?mQ$}=^ql&Z?nfWPNdtoU|f90QwHIqjIXW2f2 zgG)UBcoI)M;@BxbgVIloIULP6quxLp^#KGH0Lo2g8^TqIM~$-=Sa7cp19@R?}qo`)pM=)=-m37AnS6g={=q437y;f$>) z%>GJ?(MTrHzzjUuIGuwxr)ipHfo7-{f9a#3SsY69K@&V}R>>~^L(#r}S(N>uxWC&! z7aP#uqAx`DSZ=Uo7rD8y!8bzzosOKX(Qr1f*uan`vpx#$kcY9j0UBulc8e_bY2{jh zTpF)PJm*U97veB#SoozF;N}`eVdi4MAYf1gj4ogfx4@(rfWa6*x2m{X!4_^)aRiSX zHm>u(osC1cQ_X<}INQ|B+ZyY)VkV>h+fgEjv`aL?pf$fk;AZ5A)u4Y0UL4l2iP5Mx z5D8DqvaiI!Fz8fviwE<_w;{W{?`zT8{RnpLA628sqq7tE!u=0`;MRuCoahB0G`90; zSUqO~KF8gIvC~vr(kP(Pp!E=j);FMGHiJ9{ZBRZ}>-9@~G-l&(V7`ic@vwmxL%~y0 ztNR0ZQNhVT-%i0ts8hMCvzLmFvriOUp^5{LSS-*&sN2QZMN~5FX4=Uih_xrEJMUi} zhJ^}xN8)Dc<8HFx)*N_JplaGRERgfSho24KPwwHq1BYe zQeeboid4QN8!yoqgX%i`M(MvHqeY1xK?sZpPhQ-|@MfE1!4VpQkr_HX7ux%(-mb$da^PMR(6bUVj5B=bMd)$$o_X#ZrLoFiGS% zCX&2ZvVgj~&^114b%R>K+*ymjasCy^Z{>OnId}oVo$?a384tq=2OM)P2RX~#0ATA=e_*B zkiyQfD+2aR^4|X=_B2l)nr}n5AxAAjrm}4T$>61{y zRkEAVhKLGztI&pt8S*Wm^%ZkuoTd#3sDZ$;P|nb_wFyfoNL)jqVi~1~H3qa=J|$r;MSYKh$6QJ7@#{dRbEIx_3m141Ed;YOt>TXcO_9>{_0P zm-%+BCtfvnZJr|m6PpMbK3;xd*K*XDw2oN%p*PNqKCXbQmcq8!9plSj^0pw`heW_1s?a4KSXMC#Xxy=6u=CX z_eE*f1ebv+jRfs2gK$1dCPiy62d5!_I<)n6utl_$zeH;tmBsYP`|Dqy(FW>m8_$Vf zAmS7Wmy~_=5VwFCa|%6Yr&CULPcrBQgGFa}Cb9l_)<(v8ta6=RHrQj{NoV;1>3J#( z$5ZLNlf9JSEgu9u0)jIfl{mxO@LG3Z1-=WfcPB6w@ZaY*MCK^^z)2s{M+RN6)5oxl z#w&T*pwFCG$Dca@^b0;a42gL%eZ`+&J3y5G&Y%C_D*wrkZw&g@&a(1U`rZkQhQK_P zehl(CsI#jaCtGMh~bvK&7oMkhy$!-(HcV{;_V`VAt%~Jl3gSlA_Ze4+88jT?~c{lgo(B|3_NK5 z^r~fJme*84nDj!9H?^oY43UnB!DvGeH(0#Pz|*V6Yq_=n0q?e7yd9IYBq7#X5ta(~*6_yam!dOpBBoUKg>Vr)+pv#BI7y*ge~x zicRU(vm3_A5Lr-c43Q02vl##}X2Iy%MexfM)WLHyAp$3;?Q;Jcw#Zn6Q&j~)(`Hpo zn=^XBB2*%10Cy+TBVSc{wajty;tV}*(~sC_9-g9wgh6J5&eMM8wHOeH)gsrkVqcEKir$kF|pV0pfc*4Z{o^pl|j z^(dK?swHL568%jvfK?#a#rbisDF)(G45B(Vd2cr{DQMIX>;mkY(@w5S)jCBE6Q$ht z;iecN$}oBnkBW# zGjO6idES!6waK$8msTZDTQI-2dghV^OBN?jQ<$-M!Mv))?V*b$kFLBX**Ck%rkF0O zJo$ITXP6Kpo)HwDJY&J!xeKnTnw~tj635@+sGE$%nOr%4dNRyZ^OJoiOCqZc)}>}x z!!gqovqZHiW{Wwdm@BaL&ic-L9=)Do0lsogQ6nIj4Qtle>cz?P7t|&%R+z>0Eavkk z$^JFbKG`|GDtU3`HS&fJM4YVsK%^?QjVY*ZiiJX!#@eD}Mw)gcB)I9YJ8lkW)b6|= zr)lGCa#=5^*ef!%f49;z2JzW=V^-zj;Z@6O<#62=EjMIo=|&*VzhDV@O7BiJgLv>N ztLHBsQnlPOvMgV=zAU=Sht6xI^20a5NVfbSIyvnA12eL8uAbtVbf-g(%+$M1fn+)v zbe#=EPR3G%S|M!dGPCi;TjVTl1Z3fWyyfvY2cOn1NG$kxTO|YeO>BIcg!L}Vab~u+ zK3mHYZZ2^xcpsrE4Z52S86prWe=88|cBr-!RHvSR(--36U_clH&0sS0u<6ic=0Z=Y zg;;Y9Wb@?^ZZ<$Kk+8aNhIIHSQg%XB+K<$ykoOtnIEB0~An!}a`x(MtlH0ACSOZSk z7p#NTI~~=;0l*>_aX{i(Qwv1l*3i7JLF#6(>SZdl6J1?~6deGqR8~*FX0HmoHGqd} z1x84Ry&qe6hzZ2Gsui>nqS8&sF`9z^a8o-R7BJLaUT|llH#Xm<;5Qumc|#1ktpVd@ zH|X}pJSYa;1`o~|&)d%W70C|XZbPj+Lp@Lv3%`!6Y`7$1*Mx;i4O*{X#awIk-;xI% zygP>^o+NC-0S3dm#|Mj#A+h-ERIuU?VW$OO&4F!FV`oKYtpn4ix z=x))(w7+QLLkjADZ=vU}TbTA&E!_R57AA3b|Eh`DX5@~`YddIh3P~-U{LueTYe0Z~ z<2A@PUT~((!Dds&UkuoD&6*R`IAB4I0~Q1{lo+r`2h(&P+&#ccRw??*j6AKQ8+un> zIo}GAAFCgt0-h6_nqX@q@D-rTs}Sv8!*=!S5MAC-Ray#v-^MEYHdfi!S0&)9*|MQr zEbw?Udc!yidovrTw?OC87{%oKdD=KNKG2+z{tlwwLy7mXnm&Z(^COi15CY3bun>RT zth+^OZJ+>T*ZYBW?5EI)_igd!dJ@UjfTB>Du{gHks3h-B-FAr=!?3b5d}ZzD>4p4Ec*S|iBLk}#M2^mM0uM&om0^{BwZG+L)vefW-sO!Q_vfo?WRU#bP+l1= zer}x6pP|U^#iViEJ`64I?f(R3@g!z67&QC&mP z2+WRa93J=@FL1PEf~(y6ny9|&$sKnEDHf4xoGi|p2a^ix?oPF-XswgyF&s?HYqL+& zA{dhZ7YFq4+)mRF02(3^PoUst8hM09z_g>#&NT^D`_rsuOr%Hd|29 zW}u`WpDb$|BZ*pjA`Z%WYg#N4o6(2b#+j;Bv%;fNtLD+74*?{82dT0H=d1ZTA?9{% z)Yze;;WYUWU44KexhBz!-Nj*iPa;l(vwJ1pY{tlgI8gyWQ}Lb#GsbkhtMHCT>P6ts z2+9)S&CM9>zS`+q`r~7h_>eRdWf^%RZen6_&ON4Tac%o)&}$Z-`QzzhsG@*RKPAy< zgT~ltEXzjY9BAoyJ5As>dm8b3lEL1kP8!W0lbvkLwo^qAS~=C{SXxCh_!ACB)7Yi7 zimLf@b`ZKZiyw30yBWaFg#2%~6ZC05=UTw9__`~D7CEpi7W1PPhxUNgmcZJ(YPw9x z7SY+QmdtOA=?@`XP}ub9#Y+C3&+@kwXU9RIJ~Fwl$n^9v`pPLb-RgP*cOB|)WG&u( z7D4H%;(MVg7b%@nR^*5r&*Ie9((?#(9c_n}rIb7QevU9b^KQv-x405(DK=_HErH-a zuWB5$LfoHKIv*l-rMfW-&fG&6ESlMYE5b3M%BGYKzBWWRwL#TB*4Hru+d|EhV5p9R z=FhKM)CcorTU@nRUWkSiT@$1wI9RyWYoRF1e5cka%2l|2^c>vS6#;o%wc1yn z)pbNiWn@m0`7=Z~us!X+^}lbkW?(M#`IYl9jn2^vmMof9WzAn?3)qaVta6yJAZ1}Z zl=@MX}vZr&P6HTsaTd!dXYgGWpUkZ1D<;Ynj`RO;)xv z&XLqIhlO31g16YpmQHPP_AF(f2_}(&lO;a)WF4wsfk8K#5TjQZmVYu!&Fr1Ly3r|l zjdC?pPG*MB6WQlbfNM$Cv-8+m+MtwVsE2AS<;Xo+q+Rkw3irZ3cA)?pR7x!ygN|Do zcKSZpA%GL{kVQ5tosoB}7G15YoDouw*JgMM>yxGJTkNZ;4Yvr}=&D*z`?9p4uqHf} zdt>zk&s*L|dEK`nHf3@3wU`-PSuhBr4subosJ7q2C6#kA(*YTSpxntR+i;@sUmw}R z`aX=?*=!@^Oh+IYwD1BicZ z5fcidp7mxgYN%}>jhJ*iv+=bMC78$-u`&l)Nh^2p(YAxo?$k%1`Y@~yyZUh8!wFd= z=pdPv`iH|mY+PpXl)W>;@r-~d7l|68l#jK=k%~YwtuuOgsGmhlqOXvj%T>VTfPELt zLlivJ6*3GAq!qYPJ%t{E#~C<}O9R^s)2(>;E91aSc8(s3jg3g1+{1N9QBeX}B64kq zDVl|%m|{JG1su31AEJ1R`{baiEYMp%4s1euYX$pgJer!0!7HR>sGKSA*GYv2*cOUu znrgfyHCOl=Zz2soix{+4Fre~LLyda6G5UPg(1t{IZ|Qn3_7!8v zMW|M4S5(R#HGBxBKMqJugAdqX>%xVhf7t@QE$pvb&>j_ZXx0LEh;MZM*DV+Z_ipQ6 znn4hd7$cJcj#6T|O+QLW<+lBlJbHAw?*B*`J=zj|cs`+_@y>8zGfYeJHHXKUJ?#0G ztupSahB;$7fKQ&WLy-oqcr}%R?+<`ZKk!O^FLMgDZptmqJEu_l)oul6b=OW4%nNo7bTaiolT%py-o7QV_g5(n`wEH~US4D7Bv%FhZYU?xu z;tKp_Z;&}VwPJ70Ds7^$1FhA;XLWOv>K2TBSOC^}VO$-ld>TM#JJF*iw3waPBQqh< z?ebI(ijozpwQE8-Z9klLd?%$l$-l4G+6?b$(6c6X_~220iqAK9H?h^%mk&AD^W`za zOw1Y{FVayy?FKZ1@b-vLnp!L-d{Ym9y{A;#xJ;IvWz{966<{9zC+@ zIB#E|X%Rr=04}c#>*~4Y$F=4+QX58Yt2O;NC)`B&K=1&DAb-C@`wXqWc&9eP&8@!{ zv}zPqGS`HY4IqNJwhIk?04r%Q7K;N4dT%(O4nv~|CIo*k-{9#{EqR0I2(ZH-Sn~}z zHekkfz#NUi>xgZ}F-Q%7IyM1rpCbWzCDO(LvQeR&Ie7T2qI`9Zm8~4vcuO^h;3t-7 zKtle5_g$}oUO4$TO8bKy7a=uYEiO?mUcXT=ANR95I|YrZ02$`*%dUkSX}|#?hLNa21*qiqYUP!VP#xXO3vaOh62GIE1vze83f* zLq|(t1GTsX`xmhBRkcwB0zDn9(@v7ttkV*)NwR63mL7kLmojk+O|cy$8)|z!;R+Vj zWSnS`@2u0Z-NWIcFdjPRM5u$@bau^X3v}hFv?+GNQ^}4V?Vwfht(Be8p}B~I*Dz3l zk%ZA;Qe(#^!8FSk1;qft79gM14P3HxiDICoHSYl&=xb0`gSf#00Mp%UphtVa<@2lf z*W?e(#MJxbJ=&QdMJ`7>Ztr_5W`%#9nOwG&^S5cOWZ5C?tUmjI_76=A_F}yUXpZ4( zR0*4x&uquVu!$c?#kmVEJ^-Q5}`0*`2zBA~1JMVfH z(2xB6ub|G{41<2+8vxPwfAbrApegJ#Sj0YqDeN=&CjF+|1$)sYgDzw1(__Av=$-k9 zc1jbUdciR=g^L=Xbu`nJz-I3GR`imO?!p{k1%E2VW7=4qZ|ss|9@DNBuX~!_T5@?Bpm8Zpwn<>0A-+ed{0JSaFdvxBlt`sgciR340#4pe?w@du+dr*I9p)~ zT?8mEI=(F$UIo0(YKkC!m?GGx6JVPv1V>FoD}IDBun}s{m0kTJ3=yfE_TaIHy08fu zXHWyY^w@_lUL=?zQ6yoH*A&Tw{jC(Pvkk4oW-FiX_~9|F5L^t`&X`P9_%sWSv@Ag; z$dY(?SPM}uiw}TDing43KgVsOhZO6SD|Y~4OZRF~_7QLnME@sPdxY2R)e582u!9Ik z57gTZ4=lv1ls_OJgu#B|2%NGMBcZeGzEw+=m-lLyH|+Rt zvUgpO{N*XF^^j65-rm4(J}TB1IHZA$_E&EVuXZam7I`x;4A{TM|E0IHq=d!R!Q;Z~sB9H}d`apqAZ%^H~yXeq+9T)fVP* z`KmQ+7(<^`*#^6aYBvf=7B8&>oAFn~dp1@@7#5ZJj(RPj6KA&A`A&#HkEsF{&t@^+ zDqzC!b2Wj95PrU;Dixf6E<(&`_BD>wJ`jACCpd0CLYf86gSc=|Ryj)R6$i=5 z+};9Vq0a>j&&6h7f>d*&S$nf?n?uJ$EJm1FZW{?N;Q@NRZ;lwOG-H-e{n`qNrB_YzpF8Q=Lz&HCzhKNKR z_uaK`(Ph3SjF)}QGg`Hqu|_%Gfj<~v5AYnPNHDf0S~a1uQ4_Fs*t;|mH0v%<6L!{R zf~1Z(^g~lh7%8nSRg5TBsb%ZPb(Fv+-c}(7pGZ<@5sN^)a4WRXaa0-@QCWfma!I?n z8w#7PcD085A5Y05fwEyb=XUY7RWgR|9-xGd`jZq&uoRl>Yb%Re>oL51eFe7=lBQ|R zKHpgYzJw>YunUW^yc9xOp@nH{=HQur>w-OqX||w58&$1E;zN8@2<%lVFf#g>mefQ( zMfzW4)|d@)opSOqt+QAoHyqQF!4?jpXtf{9J76N#!#-z#K*g)h1;XaiNp+5~qkrQ> z^vAN|eNr9xz6Bu*C!$EdOs#+cH(us*t@qi~)gZj%1E#Ag?{Qp5^=R^Dl1HUt9%_siRZA7uhAT`Mvg;|U~btER)U?@@n|=92^SV#A~wd> zKrOWWMsXUt2XlkfJu!nE@|BZXRPRnhvoVSeuoQmduq^LqVy$(C?+(m}SZiB&o45cw zTSYKpc5!vBr($I3hF3jxM<-?avsx#2j~x_`qpz-V>?4QchkFrtNrKYxy^0NcvY8R< zYF!pXcOx+s^D+&zemVwV2IYvEa2lOOqs1KPu5)RIn1`!87tqb3hSrM3R3~a_hgjM; zC^JCVcVYe7;4+d9NyW`enGj`7ste2Iaf}5_qM&+%K=C!~b6V1m6~nH&JWfdzc$v4I zKnHKU#9k)%mEd=YLh$^RVtCY^#3rbPnyhv+op7~&CSX`j+#$ks61{i>y~NGkv(~(| zTX!w?r?IaJ|JNo`=)ztq6M80@`!Q#{0z=dusupU;E~S5t9Jy zJOYPN*zm%Z%ggSiWpYh!{Ms0zwK1NO z)3DItz>d4+o}EZ#{Z2YD7xy|rQ?P~ClkN)wy?7vq?=IxWHe6iIxwbn&aUW(e>k-cN zC_i@aC$6pDMGxTOLeB8G1JktIiN}-b&guvTT*n8lB50q3+$qX1#!*l49RLS|FaRj) z<(NYZ;4s$?;OSxqO52IUsaN^r?n1p!f9;??+$wZaax-!oJS1ht836j@2D5GABET> zbGM0(-sNZDhipKz#h&>?@|m-+)qOHUkC7QKYvH*{@q?WNPQE5)@l(Fo8he}{@wL>S z1C)sy#A?(f)P9w#E#F4{cfN}HQ=Bg|aYXi8F7zE=TaBDJ5@*nlGHjt6j>=M4IT@6Rby$KhDh!eKH2ofFsM*p#{{KMvJ8 zz?&M!($t;z5WM8@SF|zmjcFn>=wnj^^06(uaQKh7zWR!mJj*G93}Hei6(LYWMW`Xd z_`?Mq6xE3c92j9mG+0ELBASL9B1Y+=Ciq()>>lOkcgsU_WnU zuz3n?#4?QY?L=4#17c?@hu_W*T zpe=*IXDyn?EY7-m8-{(542x2o0x@H*(wYCd3U6a!wLZ3ZRx0EUK6a+#oepJmk|LU&eJwzJ&^afgPaMJy zK3Y?Outm~1trBTd5rb2(><$9SYZ~qkF{lc6G83>cjo8algMh{tjkBqb3EoNk(GJy} zLiY1A!My%=ePKQ_3V=pKNE`#J&sa5z+^^)uewoesRp{>*|HYu$psC!kLe<6Q9pjiv zG_|o~EckNAtiHAA*c`yXGRBiT=D&pv_gi9)c-XD$0j!ChfQFT;CgSfiu$~y`Zx2G2 z#nFEY8w0x!K$(RVL#PSH1(&ubcxG{*rP@Fo3Jg$FACG{V`p{uHn15^~Zsxo@8a~!7 zDT^`9+QOU+38ez`vXc8h6?=KpRcn$_O(rIlokcZdZKDu@$EMiVKpu%;(jAw`#Kb24 zhHN^t-dbO`tt+PB!?~YakSQ!Jm8(?(C8oUzxmh(D$4Uy6TXw+(BU+vU(i_7ct91&;)p1Gr-Bsnre_1HVhj}KIB^l;H*kobHo=(z~cZbBAk3T z4>dtt)C{8TgD8ADC+0+Cpx>+r&)H8gn`s7L1!6TX&WjtqeJxc=FyC0^n;|$d`lcwM znALbktUCaTCofhUb!&%9L5huBK3$d!J#$CH*&~!vo`t`6)rMj0&9cNXZlfiWuExCI z0o4unJc_$$fVdkLvUM~;;KnHYfsGmBUikTMQT<>`_yFuf+yJx-1Ql{X1S^NE+|>Cw)Zq;LR$`WTdCl<+0*7t_nG%fzz{+2Vyo?=T{I_vZ)!Lu& z8x>*m0$Hzv4r~p2fp3QSlG;@GceS;Iu*QKGyrbeLEO0AA?TjL;pd2dz0+)NKfqy02 zm1a9i=@9M5Hy>`6fYox?gMeBN32HhZ->QMuZJh&{9%d+738s=)PZwNNE8i;Zko zR;V4|%Wb*Xlw}t}>{I8AE>~sYZrFZwc_4%7pkj^40QI&ekh~HC!fl!CjorEq5zZ!ipUY!v$Gr*q;!-x)aiboFnMrpb=5jsGoH9A@2JQTIu%iIA9l>*L1Cc(c@@c&O zUTK`zSJ z$vZGB=^zzcK?8*sq#joa^JM>-2HpeXD-H^w6CNMl!~;10)AtjGf>ELoWx$c_&~``q za*p!lY#&Z3hp6xXb#!%dbv{Hzhp0b47bk$Y8^@} z!Yp(HOD_RM>6^L6zko3L3`c6&ds7P zwSF3g5Num?lzaN)pu@$lwGFrmV9DQcPQc;a_IIs8`_nb*)-`1(E+gM7-)O#T%Kr9^ zc2k&jpG0MYR%rd%rGxj{s;rP#Xtv-;1uAmR#T#D%*Wh0f-(BX(8<^yYF(*WvjDFp9}cn z>8a?{)BxoUP?KnD?JaC?TV(xtOz*BjTWF58D|K{=Enl4>TyuJI&`l`9zX9^(PFtM3 zSYgZZ_NlO~_Z54u%64ap*UjL+;VJW8#?_nWpVeL7>~pq9GE|lxFFki5F9^=8L6#!~vUo0DXO|tGgU8@E zPum>cwIX_~72n2Y!Bd&nJ*T&l3-WY__kN*2-pcCHkTCs~P%D_w zMsF2j1@rQBcc>b`dpB)ZI0ZAxzij#Ozin+c^|l4dkms>e_-%<^ZLZ>pE4cBW! z^?laGq~6sd^h?6ZuzIBalW!1j9H;-`%dnzcf7!|)%O>fatgN?B(*L8YuH1gp+hut$ z2{{XoWt%U+->((jL&q zSt0iWdX*LW>H&ScLj0z`(<+BOhzmOTq^{iZpdP0!%<$~%7@__~n|w@nSt zr6xU5mOP~Uu5cZ^6)7WfRSN!Co8&Ud46W&}wPCXsOU(bTZsmyfzTBp)e!!Re@rmc> zm4M6gFUy}5@4A%(^R3)bE}6Z}-=5NKsP%j&f32QZLj&cFNL4itH?NxaI?D1dy#=hV zx~LL+3M1qPJ9Sg#Zt-{FTIBwtD{^;l*4MI!Q0wSHh)r**EpbxJ$fH0;{w1|2@17*X zw(G6sg@=3%b5`Fq9nSvj@DDYr>0jN-%VPgPLXv~xW980z7l-*cFO)vcT|%YqU*Jy_oIupi2O4+F~NIld(8 z57b%1nfB=Pr#hl@{Ifh@tVY$X{1Jblf!ninj!Tx;`5`T+10?q%t35c{n5Wuv&9P1G z{Ow_nIRC%8l_!=oX0_uKWVkGO#E)EMj{wx^Wt`qtu6qQ-*y4}ifrj??!@;|5W&R2o zwnOi*{;6=OKkCcf_y^9SNU<(|fx4ArSF7BfyVr)uj$3`%8~-qfi*=Z$e|0N84_Mic zZV8eX{LuUVG5@8T!-@D;*pwS>Dc;N3OI7EaC15wi|1XrjDGfQLFmm1ZL7`_8itx%vU{8@=-VV{ix0DscQ4GID_6>aaQg)e%{;Q;*WV zZc44$si&)^HT_|cE0^iS*BW6Vw_U!y(?7%i+=+(m`K&P+e~cqu5iMD=3n>e~OZM=S5SRAOO}OciZ+nDE zwCP{n$|vpq?7F|9f^rYold6|J!7 z;LyhbyiH0Y@JL1Pth5ggBkh+wD@|Rldhv1H;yGnS~Tg-0gZ*?)D)=vE$(G)k>8lunBTgmRes~IWWA?*3XZbyaTe^qHT|nw zxv{dT&?cNwmxFivhqGcgTE4Q{N=L79tXg6l_OIHYCy_R@DNWt> zYV~txP1Q#(&=LL>HMIak{}HGF!zx<thSmh&J6B%cnk?{?)CV3o^kiQ#Yz6@70r3e)?kX z`3F!X|5{etV?(q2xPi5mT)P)_D_mE|neiZJi2GAdKKZ%&au&!n|5}pltF01zA7YG5 z+=yfB-S+v|SBqN{o0^Rg{a~A_rtuO*YX%xG3S`6 zRP$Th>(L_r_#hcpkJJDDYd)+YZ% zOG&B_`fSQGPoWQtDJ>5nHC0l#@|xa_+2i;mgSETSoW?0kyZ98aylw>NO_fOp{lvDz zK}3J5qO;}FgMPfcgQG7_;N02j>N0r&X{sf`AML#M6ZDjS>}u)%SGTfm2Iuc@VfG}9 zllB1?alPI_K3eYw{0v8@Y_g)23v{Gck5GVH_(=Pq;$64$*ZZ2~XEd;?xZev{8BaE* z1<3F3(E{ZiUJNn!xrG;D3xGc#=0d5BPKAGFqfL?9V!keBl}ZAe{7Cw6^k4Eg^Onm;P;Y#Jicgm3b8L3m`x#%wg*mS+9|HM_ ze=XPL=t5tCnk1VcFLXmG7Wk~LJpBwne&3@hhg06{#or)O{fs+`#7?D6iKa|D?W?az z{l8N7`>=0Q0lPeQRFCnE8bez*xG6DEepCqUmHVg0qFMLhEaU&jC|S4ECdD~htgJq! z2dQyxVM_34#k+3hdz`nzHc)rY@E$M55qRC(lte|f)#Xxn|q733KC;4X}ddD2JH1MF3^`(ax7*K%rqIW4!H)YIgN zZ{hJcVlN8KJ>es{|E&2j_XG;?uTzTwy8AUQ)JF~DsgYgJMWOrrYbknd|G?Ll_-FNY zD(_FL;}6^kF2uiaDGA-uEo}^WLY3KWEF5YoGN|sV39Ol~{AclqJvW zDQrYZR^!=ZIk{3M=^kTcg$c!1+n(nzs0A5T^a>lU_Ww2jSM2|eRc19O>=ZKSU9Ak? zrRz>P;gmmZKBq+wu+paF>7jD#bN&fG%V}H2b6Pt#vshEzfBdEu=}=Vo*K*W)PvWd4 zssnN6g;CWI9r)M2#S9#pCbvBgO9c}EdGUEvF>eOPw3Df)^$07v+i8xT#nEZ<#!=|a z@sW6(IqDlp_kE!OM**3C?OH&un#YAw<(_fK|KVwLMb+~S0NQ5O=)5>$mb`^s3`9;Vw`sv;ZuuHhph2a2gw&$Uuf7hJ?^d&g^oKF%}T3 zeNF&Pc-#jPQJ5~!&B8iezk-Dgr3nZ!O{(ko^bTO|0bwl^F9FZv0_(kh0$7-Uee)Y+ zaP6wXTCOCc0NoX;EWD{_o91MVpoyavT w(;KG(6Xl1K(1EGx|4sr!aVkVye)^rMEOKH2-mGjOUClsv3>eXmH-RQh07eh;yZ`_I delta 30841 zcmbV#31AdO^KbR+?9T3HLXr)+6W9PDkOM*>0Rn_@-w3Jn{HEQQoh5b~g!r@BQC<(Vgk(qq@7gy1K3z4!#)s z-TK%@rGsp`o5)Ete8$@)>l$S`OL6bh@yfz=-Dc*s%bk+fA-6-jg4}j_L)sU1DD<}K zQCT^s!aHEz)Hban-I4CP_S|mW+f8(J|E-0(`@JGZC8|rs?RG2mk>iqXrOrpZ6WNHX zzad^67waO5b@|JWB>JNU$N0Zbit5+TO^v9~ppI_RsFRB#sk0lGBEEFtOII%FW>7Id zba#`BN;s#;pdJSG6y%;Wb6&*=`Sd_h;QvXCu5O!>E98>{#1PS|IwGw^_>Z4S^~bhu z>_5?Dkr?7Xyr-xCLeuuP81ne9{yNToXrNO*by|;VVp0tCHYtjtP0FHdlbTYFNf~^J zp+5cxe~t5(#(Uf*Wl|nz_4Ti79H*vHke=;#?~C_8pYD(kj}S%v_nRgAA~7u@K^}&P zl#_Hc@?hk#c$!Hzq=;g&vI8Wt_LEi?Aln|&pGKyQ8spjsccNBg#~7liHO1j6-fDz< z0-n&E5-ACtBsVBa!IRG_OhsWD9yMx$R8#T>`!5al@6O@@*lWnK8?~&C9jHl^TdzZR zxI;IkQ?p=)rmDwf++lP4bmX+aFN3^30ON`Nqa)+;j`IJ%JF+I$5*=m&PZnZl17D0c zUjpM4b8`;aPp-18oEmbw0u)i#I};I6N6=GeH3F|{mk^$6LU5`stVC<9fTpl)O`2;p zoV15&COC#+xceBmE9+U(vunt#>(xSTw|aASg?n`U1G!w*upeH;9L$(;I~e_NH;ydy zaT#8*{{8E9;s|%*z{<7(045o0GF*mEbY3?GZK1WTLTg^ItNyriSDLtdD&y<*qJ?|W zI#D~yN29uKxRB?f1v_X@9n@1nL+(=TYt_MKv5r9YLqete6LT+GSrgm*M^oJ7iSeiA zmgqz!x;)=hOXRI7y#d`(y|&%JAm{n0pm7k;FP%IVplY*dc$5XHd_XIYmAdnaUxM{k zPTHzjdD&BX22)p0`z=^Hcg7#VyxFtYS$WUQzb#mJ=Ynp*)X7C!F!koOR~a$P`c7pZ}XMaXth1m(DSVcgV~@;KFw^YZzzJzqMwselSO z)zP5N1{GDGTfRhyF8=X1-qkgRx|-mPjST8$QZaQmsKlfm)YGJ1)IG@2dUL7?H8rS@ zNqwmwKcpMf-@k5eAOEbI-rkUIbI6}M+G6ism(mh!A_C*$wIT+LD+SpY@2wyk^KZI2 zdo**Zck0GnwB33C#7O|!TA$d`1CoQ(GxC}r#13=+J0P4U!3wzLkpN1_`v8qsy(>Ie{ zdonFfuO(NZolCM)UQ(Es?STXDQ%K85l|v;IkD{cHmac>mq>G8@dJ_!}!huM<;TV;3en;Ht%e> zJ%$eUyB~-rdLUlL+^;3xy{@(YsavCbf4Z>nx4Ef1-R`10XtkT}q&1v(7hl%$WgTDc zjvzm+=hQuJTsH7U^5eY`w2?M(>OO<+H)yj#54dq$-R`6G>R6Xok?#}PlMhv>1}$)#K|^1xXXH5d#Vqhnsg)G zWYQ2CYSKblWYQA8^rQX;oiXWMdXM*HPx(WQEzxf-OY;}68P)E6`aqB2wf1C6b zeJ#s3X>tCw>%IQ`yT<2?qHh??w?Zv}zxsD|Z{c^WO>q83pYyR2uPUkM+$61GSQ4qJ^mlOlB>0(T;yPu=^fyT;f;-2uqc zXdr$VvPuJ~;e0+~u>eHT(S#*JxeH~ZXf!t67%-zWl=JQ{&C1>b`ReWs!^BvB>4vtx zameg|UC5G28BM^QQ#>T*b){-B%;QSIB>Q0Ba%v)7i58QPw&X8vIT=q@Svkt4AYDM( zB`mJPps5B;GpIrcgQgQMS6zbiXXWSC6=*0xEt-M0%>SZ;Q=W-5RvOna{%qN#_gIu$ ze9$Bwy~PJ3gQMqB)#Xga6ORR)F`@)y1ZtU!iH-5!DJy&oD4DS$qRB2iVAb(tidc#i zjcK4rP{3ITillBdkLD|c3=Vc<$EsASb*V!GmvM(y*+BHhUB@ySOv3DdhO2q{3%aS9 zK_aRGOza9C5o-QI26`i!10L)?N~8F;m(m0nOMfffM9?8>IvCQ8_W^2$JUf!G@(q^r`p$hPTBdyVAZ=y%UV5G&f zl*3tjDcAq~#x&8!AGxVdo4lNZ5OY1yA!0rGH56wxYF|SgYA9vsekv%-wc6Tp0#xW< zzA2+yTL@?sIOWsBQcxe#5_tA zb07}Qr50ko8VfU#B%JQN$9U`+{}>hLOhyTljgxGR24E9Kpf1i9wB<1tlkoR*N2)z- z&2vG$d20WF$ThcZi)M|m`wpF?kKq$N;Q|R`}F$09xNw*#UTN?`lHR2hVc4_ zJrZ9%>yh`}mSo<4_o-Nw@~0l@Xl0E(GQ*Oy(M|X-9$@19-gk9R!><1GSJGb(|9M;& zwQ}cmwzeA8%8#zIHT4fZ)xvcE-0^v=%t8N>Q$xiI{#Q<2<=TtY-3LLf*5B!Lj*{l5 zT+uaYS_~2YT7-J>=bdr-ZxvC#*IZDd7PukERT;F-@YlOCh7g6s}~_-xW3op3#7Kyf%A z*cZ3KcNRkkkagnqsN>zzPX6(LsO^ zqhJIn0!X1|`k-Gl7KS{tR#6kVKV2Nbb8$Ch8D=ji#hFoZ!(n_Os* zvllwj?As_J$IiKhJ_ny`NqU?;&Vkdd$VhW+qxc+0zNfJ#u7=`^oJmF!Mv;(aY@_5H z1C0_pAuj=WN!!WE<-60I+h}l(6DK1-TVr-cd$*G-$!Wt0S?G#$Mdxm%MoGpqlv+b+ z+bIn>*~sy6Ruh#K!O5mbrbPo@Y)-T@-|D@Vybw#$q9e9p8gcdx4-@NQ29T7c7TqV+}aoEufYXti00)(6F>(G)@4vVO+hD+Jl@v zTnykjYMd^Ba^sliJdj3+u#mC{r~us#P~vyXkEO47Hs}_A}88Wk4kf*nb2Yi z9XZi1l`!HA5OkAsxfvL?H054+#hBX-}2L^fvl_Oi3}VFXbLgycwpOR*5YllBLCx(fZX z`jm{{3~RZOQ;hXv7U~A?Y3a$Vp;m0&v6!nx1CQ6E0e6@N+>mxHerKs*%%){xwlL*W z_X(qU9SJ?9np>Ke-D(b@syoe*%sf5V_YJ!pzaPTr%R9hkqT4{W{dCjHuFkSWnzDS&VZkxhn9SQ+#f!q%3?uG?T?i!XaOa6K*92 z-BYuu$c1x&f7vVsad{&d5ic71{4g`JtDvtN>)+2w{oPD8P8!7b!EWsJp?n#}=V2*d zhVx~FlSXodQ7%x$7`}|<>~VZ4SmY7G)`9V zWjbG=YR@oeX5hJaFgxC z-9*=-Qd<+d;ue!`<*b!_Sw+_xbQ=S1db`Pjo(ZjMjY)U$Wi8iO$CtZlsoZ+AEr|75 z$s*UkGcUrwX1mir^1YVwK(a`XzpNG+0W(Fc(wcj*8Z0>^2CbDX6g{zJQs6ww#I=>Y zVXa8=u^FqNYi_D!6W^xn|%Y)Dg>7w91Q!WUbMavAq+G9x|(vYm-2)5=j7KT`+ zutXui!DhB2pXj15+oMB**=WfW4j65RN(+87OvKL!H8SjfiXAOn{{q#X~_;}vusoc${>?ccFecwsC{_QmkRT*Siocx61O z#_kyn-Ms}y*Bpb&$5CI1l#Wh!LR{&D73&Njuy{pWq$s!~{vxjsJ;Wst1&m!?4C;zZ z=H1=c3RHXv;4aUHm8jUD?qG=Rz}Y~97~TIsQ&ImLnkrE=#nwe;lsz#nzV~j(AOm^n z;tk6IB=6O2&E%^7Lq1+<;tKXVBdy zyHiZaUmFaP{BW-cA#9Tgnd^QyQMmB~P^-CQtI0x`K@Xb{#2#VTkD9a{W5`=AI$|(8 z*xVjivs%1nOKGi$ob^VFQEiT;@l9d~ofRxQvxe=F&i}1xG7`ga^tO-GW?t6G6)ZkG9yg@W~+50I123ixg$w*J58fA`kKVL)9o)|npJc|Oag=TjU3ZR?$E%q3vI?XD;u^@Z9l`diaH^C5rVeOc#a1=I@^P{^DuNb5 z6oZsSF@4ZQ2QoYYEWafg2K6;Q-tS zjzVC&Zmd%TU7#P(RXM!N&%=TRX&S5iBaG_~$N+2UC!}!Dg1NwXhxKet&p^FDfEA0w z$1Mu~Zy(rOLlgi5K!%$7ZG&i_Ip0KZ5jRQ=2s^i{rGK?jbEJ zZ4cR=MiBy9#&=jqks^`CMbe z%0-6xmqC9!>5`iRHVq-%upaJ3zR)6s4M7V+kDvjLXhm0oJK1Cr znMF%TmE|+Z@PmiIKLRH4}7%`y7&|ZB^k;#RTq5NLMMh_ju zKwAllVW!!3(ZZQ?+V(H6nlYrj(iE*}GSu?<3$F5a92O-HUDTQ;1g#fK={i$niEKk8 znJ{JLTGDiMnby466m4Ma5N&C)A@WSoPUHi3rf3g)hv>kU0#Rs+j-rzxI-8^ZF%V#=N@SQ~5Z%Cs!B|_B7%Z=TR!er8hz=fV zh+*>3Zjl&QDzId1oe{&u2t$ly{EYIKFYGI$6SZ_X@e5IuH(CJEeFWT9_0TqIK}Cf^ z+c;5Xitzv=t`HMUF;R4vr@s)r#FcV>Bdu#-_?O}{O`aPC9q*3ju*;nD>goQ6M_uVZ z@J*I187b23dezLO6|$U3Xw+AtN_SpWu~44=R7;b6zZQA&-mk^&6UT$w2y8?c?3kEv zC8i?eO+`wBxIj#coXrHN?i~NYe;`A3w+xBIbckn;nSs8B>mup<1}n2z?)ye$W?X|Z zi`lnEXu}fKvbE}ynID&~h+!$B|N^Y!%$xIuB#R&0|%Ud~C%+UYUX6 z3bu+dAmruX&SjcrnQ0lSrJVGF1Qa?9wIDS3{zEN0{U4h4{;y48fa6Z(`ESK~%y;p3 zB6~DNaXO|a6ip_9kH2hT~)++v;#sW z+o9G#1DvgDsgtHf-D-sXUd@J2S5Q=cEp@NMxem@mL^XUkMu9K%TA|gU%MV>C! z6WZ2LS5G%Qg#jR~io?)RqlSv{2w#V3L|_ELXAMUEuLUdrPDryZPX$Lhg=A|uIX>_r3?e9}!e zQbCgSaES8v(jY-AJ3Eu?iO!a@(^e)r=hxEU*|jtzSf_@Dj@?ef^B+)e7 zD{!|V!gdz6$^b}a4?A3xRtO#QM`R463nJq3;GDqWpT);EW&2!S_M7;<_qmhqI2ldEKPb7s39qbtA12bX7e$o6v?wZY_a9WxjjnD!|mNDtr)k2Xl<@51q6`_6);)e8Lj20 z7qBiyKe{k*I9mJG4d>df6f}V$I7RCU;VLavTOE@RUeq3o+yOdR0d$o@xi3|lA-c%s zY1-sCw%&I~jS}dWJ#d)x1pW7t_oiucL|KE{L(%e~ zR$Bi^XXX5=nG0vmhd&bQ+`Y9{DqozaCCbg0MB^AlMnnB+bJbE6Hje0Ue~&;D*&nlLgA z=K%J2dFKnZbgAWNDKQ)^Zo;bcyFq`LusvS%Pg>SZ&d$*~*h=XyS@M$#cTdUX9{)D! z68VB!#r3e>gP-_JA=uHmjBYoDhCv9M&=H89DO=}i1&Mb09W8~!gcjyBgv%6e5dj0Y zJfEp$D8}OEa+87HB@aESb&&tc)rulZ1%_QJpeRHEZ;;5_z(r6hVoc%TK4OJ#BG4($ z6!8LzLn3Q7$s$E5B-qX#yD3tk*bUFqPWhu6HxFFbPHSiLKm2Q~Qdi`6zlj#|WC!g( z@>qe^NB&f(O_MM9^{z2}uxlV6fYvShW2AiGJ1t3G=%^XlBT-@*?i0WO36U=$>DJCp zZ#=o+xf_SpBLdAjX<5R@EtW#7Cy1Y#3bUAmonH>S(iB)Wrh#^^g3Wdo%(1gU=W_sY z9%{}<$znWR1M&VkT$kX#rI4bQ;pnY|0CF|<<^milRUp?zkf#>oFu#V5pzh14`6}vu zhV++oy-y9X5{#xd_Fx%I7CO2~hAuK5OkfoD$^>-hK%*24ZW2m2g40b_sU4UqJoCtV zHCW2J%fB&j>jeH1gRH<*b^4rrOqiYn5lSs2#L3v!2>u|3y{6?X%} zM(k^-e4lan288uCIY0z0+9=wd3g{sPk}^OHtQwVchx%h_ETONzqi^hNw1}R z9+*unS-h%r0tE0Hj<(m~DLKVlU3F;bcfGXM*!0^NqgmDhdZtA-mB?g& zl6OCKVTW1Qp}AQQP@#Sm%O1sAtQx7MqkoN--#|zIR*fplY8FO`H;fVv+FGsFAb~xf zK!KZCYp_FBfw{xh2pI3ak>YVH>5+|V5AR;;snhmL|HeZv&MDbTy?r(a*`Xy;!SOgs z40tQ}C`rK4k&Kh037kUA3wg(wU^eixqJQ)OLD!Evr`9XgoiW7$`boh3J&1g;kkAZw z{&UAtpIYi$O>wMA3MPxu;3{00h(;T5*T4b?;NA>i@*oKG2cUtt4}xSo822H#r$TPn zbGaalHsQZ?8M9AoQSZ%q6+7_5FzxqX@YHwXv_n>uRpduImVSaD!cQ8>lBcxx^1@Qh z9{72>_G2T<@ATCot%ui2w%wq;X;;3Mg2U~moQ#1KEu-((#$0)|3xu)2penY~st0)4 zZTwo=VmELJN4t%#g`1sl#(9Le9JFd7Awd9fe zwMsdBv$j!gc|l9|XW!mfo?D@%$kKbMtOpl7gO>qK zoIZ%%&X<)BYt4q@MgJE#FdN|`M>saSi;1@ddU0%a4Zl`5*u380cIv61VJ>5lM|F=^#P9^?k4JG7SVxq+qqO-6%qK%9boI#o5a z4iENJa6^vk=3a&TexEHXZTf$OZM`qoL0;IQCHv|IGYh0TyB+89I_jZ=}Hh7m>sO^iJ2jUbNEvvS5FhcVHJw7)1_ z%^DaefuzM9A|4Vywi8=RY{zoBn46E6e?O@;5#40kPOW81fLaGAtH{X+;D~ugaKtu@ zc+yTS+cyLjpfRu`l)-SqBWLSo3+xGAVjE_Vqn4OCY)i;b6A{1lY_oQiCg*90fh}@^Os$&a*Ne4jXnZrQay!aXFs}Yzy5a8HPZVo6h@MDDF zJ#txoP-`YKvXIl&kB0Z-EFd^yDzD5!6+O8O>iqi@pNocE5w zo;?q%p&okQg$mF?J#^MhAJRu|2q5S9a-OSx5<$LC>9Yt3*`IUf7j8)S|KiKP`SKO~ zTC6L7%LBtDo!x!u^gUlL@Z|@`!3&IB=8fs}BVT^v%g?qhn~ovxYx)mgesR*T2D<^% z!*0Nf^cO?^+dYjgv0{`i;Na8vZgUG=*f|ByUphR0xUi1r6mGo5Eh6~YRD(Cz1K0}~ zV<%uQ_5k)`-(NcW{yr2QL&TyB<@=MBX|^VTLx;5^n)pVYSn}Wz?RyXOJaYHBc4|c> zYtB`04$4C>X&dxJcIc@XgX@oK`Et)u?OO3}pv}wLQcbj%;v22I{MW0RHx9eaWbMpk zJqT2yUi6kLmTQT{WpurX{W!`57el2HRH__h$#f4#2hTD^ib#c*(1amtjv<))aZ@&GNIw!_NjMr05_D zOi?I08Y0mYooJdVIx~nO(b53hnS2X${4gOx`CiZCn$6+O8&M7NHZItuCSr#+2i=m7 z(v$3@!(h6>O-B=l&iBrlKkurF1>U(83$B_u?<()2O7E1J^Qye_=Xo=#yw}WJIK!JU z%{#rKeBq)66;7-+%{22?%9uc+a}1~TetEaMHwwRmS42X&Xx6#wb&sj~LC*4{Hn42H)M zivk`z-4sLNc;pyDTbU@8KfDDta^^n}ptd|K;^p-xv^V^(-{g?PUel7r07NR<5(78B zrVYhTD4z;MRkg^^%afHoL=$=FcNk%Odk`8E+uW8ZC+-!K&ku7_56P@Mq z8}$g;_;oF15CjhJR0lXEzemr!lg|#mgJ9JiG^_5QV$>b2(qmB0<_I>YBM2YdTF@18 z>FZj0dwzG`l6W@YSh)k6dL3*M_re$u5|yGMIF;jxqiEPod4)Xry4F)nkTIvU?6zEw zrF{fegzL3agJb^NE_k%DbCVCdEBvQB#AkA!mISk{UO)D5d!SeKv*VPOR=|}l8EyyU z+{aY|OB6H14V3pDO@un5c!+p+4~I~&bQJr$Y95zIh}yAa}oikudAVO zMA5u>gnp8U%YUL@6Ropf)W+%7&aZ&|6Ef z-WY5ciBCgsELB)|8Y(pel6)v0>h0g?zhKc9mslP~0I=XcU1xMzSYSW$GANdeckdak ziNd?JWbXhW@77zg$Z&6w&}Fzc_5#o1rT|xe72K8!xo#Cc_%Ev}OKf^T{O#@7)-6EC5z)+3U%zoDK(AYSzc1hkMffz1e( zwhv5-K>+Up*rgoJ+eCJLOB;~SjxcXWN4z5)q{sr0lc_L0&IuyiOA+7@@s6H|zTJN5 zOla`cZ)xp)6NHN@1){+P%!2~a=78HD$Qli8|4v116QKX|0G-eq@<3Zmi>zOS_{LJ$ z9M)odabTS5P^v*Q>7XWOQ&Y+%{SpiVu$Vyay&E(EpZ6s|P#GDVqag=sOcP!F0V=6K zXB>e{J@WRqwV8_W6f+Z1@CQZ12^0ellSd8NVp8M6gBpkJAl0CHf!&p3P)zN?(6T-l zN(mDu&Q%7F5;kP;hRL>dg+-*rUIJ+hfm>J+MgkJ4!pDR$GzC_h#*@2lzw|{4(=dnt zS8G)GP!e2WP2mb_4GDq~$;KOw{oVwMV3SFjq6m%q367LMDRdu@CW7OGHUI-UFFt=2 z4hMa3e4R+g$BxjjQ543Pv2r%9EmzLpg`9fKOf*y7TGDZ&U>AZ9!8$N?P+BS|ng4{s zK@_3$Ap}Ju1A$NsMlN|*D-ga-^CnHsR0 z#fc!G-Iyp3MO4V|-_w$OgLzf-ZPXk4Pux;VZPq_PoocDAC(k%Q?aFK@&L3H(bFErx zKeCJuB%x9cRIa5CI3x;cs8D~>R!3aQ8YjAd+`E!TbfY$+n94;7O%*-s&9efpdiof( zj7EI3B^ufop9_!>t-*={>|kbsk7O0;KwHPob{<)$Xr~%lv^UK@qn0{b!^0@~OB0#a z*utYUke5|UMI+0cNOe)EXh&!rYJFT1Z6pG4QE(=S_Ds2%+A}PD3uU8m*NxCqDV+yV zOWg+DY!hkGO>o}rr{Xe?iZR)KX`1D)w>DQBoPs3!kxjsA0p2%T45JmI6z`6Vpp{}Y ztrlbGE>T8Xgl{}OB-qno>F6%-fSp(~W>=2@Jr?0T5?n{ldkQGB;lGCvV^9vFVBFyg zAr}>Ix=71ye^4^NG7Z@=h{WkYA;WuYKc%bB*)VSGGXm6OaCTPlv;10xD;s%ke!W65 zlAd8kvWe1plKZI_LKk{_`qWTgR6#gL-x}(N;2Qbx2U>xz|8`2n`xu@wFq&9qHc2{j z9I){k>c9Qcu52b(U9p`am=8!y233@UUrm9leH!J63V6k*!}~vzCWu*hHFq}MAm-9d zVjkTgu0|}*0(>`NVHio%v2UMI?CAy=0jkM21HduAAinTP3Yh-NG5Z|I03&gpX|TCk z%)!d3K@Q`sDLMY4k&4N$y2$>ZZ6)|!Iw&%28~@9L9m`{{r|&%1FP18_C_-QzgCxeXnL)z%F= zzHhseKhc)V297pt;7C>m4!*y_7nnGXvG@HLop7VVYxq7J1A3i52nPelY3>oO_iDCr zxadtD^IHb{+uP7PcwfO_BS#y0pS}7Ya1StYw4o23VLQhJm-3-I=~Hh1nHzq^FF5t3 z8y_8jjl)G>xgqa;&23@oI7Z((`K@D^Iq)GguJZ$5e#F~yPPS~|g$lm>VkmottQ@bU z%I?qLwZZE?)^g=r@ghb(_OUildal*_%SjtVW2IvUrk>N<2{+=h*&S}spMeL?Yd;F{ zd|=(DTBR^9s;|%Omd?+$*8f!M4o}pQ<$%w%*tY5ebLzt+;5*8({uevXC-D2h@Q1^W z0@&<0Uk0DKuwAC)w=mhRVnX!#+YpAy@ziK5-0G!d6Yg^p_7v7Wn|n-w5FCTu;<2K! zDdK3kEFY*PM_GPxG)gofEJFk9o;oh6$Fy=-PZn2r=i@ea=F)ODrFiF0_nIP!qkdBm z`ivB0-Dt}suy6*90W;>$nWkze3kXIOwt(>Ho6|(LqQD2=g5TzwATkWmQkH+A#YJX{ zR(Lm0v{tdl{U!OCIdiyZl-=Dni5=wJkmDX^r7&IZR97m6-0hH$V#H@+0( zb1|Yjj1y#QmzQS*1D3g3Z;lei%ZNt1y!;M%Tn4*Eq3DOWWZI%$*bZFyQfse4wMXDZ zd-+whX3D8wX}9|0Zs=@AsJT>3<9A!+u7EAymfEVf|G`nJ)*yS z=WCdtl{xF(Mv?V88Z}ZL^6OpX)Ni$W*pd(^Ru?C-QzzBb?oIAm}xxxovEgBK++ zDsfMSm0&c=#vsi$f@Cn>GNk$S29|oGP{!va-qFVsyH;45u|qltzC6RhX_l_n568j) z#po>Q(hmMP1*L+9O@R8(K@Wt`1k03O$e;4n@y%j{ojL>!&D{%u9n!WJ38w(hb>Z~K zA<0Ku1YgXt+b?}+8QUkxcHe6WzR3W`hW6$dBnKeqxVKaR2z+i727Bieha&K>3n9$U z#z5vEZHkk>wq!~F2@dLRACvuoy@#b^BJ-e_Y zA^-nBPXmoD%gYD=8cAL(MLrf|GV*y|-tfH4dh==@nwJl&u~->ocwKmk^(V$T92gs( z7%MhBF>7uOCUy-F0Ct2cuModD$odDq1%TyYjB7ZtLW?{O1GByr=Cj0Kj$z?dW@8D* zrZ9ovdkH~c_u@D)r`2;)<~H5Qel75FWzW{MDa(TjouWK2Yo?z%`O z9Pv#!fK-QCusk?YHmq@Vm}g=kON&5;gmJ&1iS`o8F=ad&#ed8TW_caX)!|#s6`g>-3;IyVqmQizhsYBt|8^c zlFJf@>L4zt2a$c4)*b}Yv>L-q<$#<|wEr1O+DJ9{Xw`m7##mE4{LLhc6yHf|>S23E zruCV|8fuP85LNLFA&8^!sql9ieY}pD_)O!_;NhCYUx|Rv+YqjOI~0LCV6s>ZW#CS_U96!y#ag;o+)XuNJ?#}6 z6r$MUwFG|1+rUEDMd?CBMg`8~*^pQ706}v=S0_bHI=pxb5KzH!P4|*79oCO27z8jG z#4@MWU?=kjJ1q`lHJFPAOn%tyo&KU!#a$2z(h;9TMBSAtB72tQ6oAd?>xPtj{)S9oP7g5`7Q3BRCxlUhXwR`YInPWZs+0@UAw z^~Jlc@H_Hq+i(o*QT3Pyo(wPS$)L4}ml1Dn!k6g-7r6uh)KEP2UOXAJ5r!PtX24x9 z!LkS=1-LDj@G+txkN}?(uvQ9O)>3aA{4#2f%ebtlM#S^8Y7YNovYWww1=JEB0(gVK zQURR*uyM4VxU(BP#JCDkb*nN-F>P$h2isJ>`Rex)JG~u{s zr|mh2&a;FqOV|SU2-_v=`$_zDkPcQ%&MDn?V~q0be(}JT;_uP@_hfx@9{+SiV0paF zWqn&daA$(8BAEOo(H0qJH8Bu~*5{l~U1M_x=5Mn-5R7Pw*=dUl&E#p@ zb;0PR#}3-cf*(C`ylnFaC$;&sEh@t5V)t3wlwim4pV(dtBH+tUZ6CF=D!gjy*9Yqk zOw@n0KF<;;X{DcxR%p+=A${CYEauTj{@_Z6-131QEzL;X6&T-He>u_ulF?81MO(=o zWA$x8e3f3IF9?qQ=v4h_Yi$G9P1i35hjHBu{byax-*;cM{Q!0Y|1!f^l;+b6Rmu{3^&-`stgP1CSgD7qb&ICu`_+0^D=+z3eTl)qq za*pg?gOV>YRY|FC$i?B3)vBabu*8&0_JrDe#3dhRTP4=N?BMZogx%xIPxg2VE5yHy zFx3-nTX9t$*S@;%=uKXFQ$eEy|i^{0lHuHOdUz&|UJ%xbf3 z#UEE%)wj%W`Y&AXmht;SBgol@5&SYGRNPoD-=_z^>9&@Oe#Tp;Jh)T)^h|3ee{%M1 zGp!bbm&MAV`*jQXlSJqyJ1k0abef7@2X&sEj$UhMMUwJ_fQ(E^8@dptPmYo6bb z`8ZneuQ=2M_iZc2Rt8(8%C9jhwPvr+o^m1sreyxLYEXM(71u6QwHo|`8Vzc_vJ^{@ z=f8e=qP+B65N6%KnLr4rs{7lv;{D~Q-K%b~8+?bjt|G?zj#O|z-W;uJTd4Z#08q8$ zag?-`oz8?%KInPm4tXBAjqiRtBByka&>Gv^>Yra?5wLsjzp8MkbDC@s|JBA^cPLv_Q-6c+E`yPeD`pWHX3}zFjO7)Z7U|$)mL9NjF7Ef4vn|R z3mEUN#4y0PdR(o|X_Vd4ip$cnFI)R(s@6ot#=~5TU&9$4Pz_RLNqc;2vek>Bbtrxj zz>>Q&uvB@)aJ(J2DqQvem#rUem93qKcFcGb=IXvtcKIqbE3@k3gJtQe>i0t$y>fPhBjT(w72Cfa%h5CubjjMja!@`#=wqZi{9zY9l#2-@(Ho-BT@S2D2*U-zng zDMUn-6(4!P#Q4`S1Pu3WD`wB-KI0i&bH(^9ay$um+8dQ}-BCU59~;Vk>+zbuAl>t? z;YNtKnp@<^Nk@YnkC>M8`(W@P{&jB9@tEtlYCnY%Tlhn31R=BTZ`+FBuUA8m{M9wt z?Bx*6eX;x0W^~TKh6C7jyH$Vk?fAUJ@^Jk+keAWuru97i#_AI|@2&c2N4p9+@2 zd>AX6zM`k8u{QWrQ9ZcZRt%LEtjbsP3{^G!`5tB(4XVC!q^?{3cSYozuY?Hjt5?wV z{m1IcrT0}mS;1@YZIlM^l3xqfZ-v*M5>JO)OC5FGyzULn%a7OHHu=MGu$l|UbnD;3<5w|d z)8l%)=5(o|4Ia%cuR0#=z3#=x%Y}eui58@TsA|z_yAzm1llGjQEGuUq?z8elXoAbR zY+_wmjC}b-sNIKLHXffD54MYu7ZPC`&U-C5hPu}X2#RRk-?kM^28J6Y%MGY+4dlt! z0B}rMxGYZo8ZL`GiL(2a)GI@%f3%!>2%8^#e#f4Ufz!U&BU?IIPtn+k_YKTL!&# zUBg;-+3ocp=x_io$dChvLx0;=%-bApv&N&k9%Ol?j;i^vAyqufe`EkAa$9~ZwWs}n( zQc%alJ>PPL#`t%7T$dje7jCI>b_VwO*YKcyE!ZgdFBe*DyiPa0oI*!lvijia&{m_B zd~8uj_8VBL+exl?M~%TF=e`kKq!VA19Nym;W-0s{65CHARNX7rydSI{H~ZWTC&A45 z*N~y(Xi(4(4UCj6&*&|zj)$H>$C)XrNJ7bNEGOsN47o8Rxwp?by<`w14F2Iggz(?C z6<4NOjSh{lrOGYsZPs?Tphmn2sDHK$L-oolPKD4to6AbF!et2(YPC}?`!H1D7+3gL zZn#36+}aP&-;lB34?g!eE--vMC!>k{sp z&(aAi!;PY3@q<|TU*FbaEL55AFgorGSFlSc`cZO8APDt8=X<|>7>mijhN?!R4cw@| z1?Dvdlevj=noAS^)`r?ZNjLs$HEfk^ue|J_d31->hKp|7 zEq{0ynt#brEkYg)b@NT-kGY3XmwyEfX1-}3*YBkUb7Xku*W!U2{^d8QwezLmVDQZ* zdEqrQcMZ{7nAK09HRHTulvk!M5tUAI?X{5TI0 zCj4tCTOIw6RsVtsxh}kx|DM*-^LP|gCHxu+9{q$|cUDiA^IPdI`4zUE>U+Zn=(<}> zxG1h}0qcw=m$Q$kW>$TeQBtiL9Aq%f@~rHO;q272$bO`RvoqzSvqAjy9lK$9OUR1+ zYlzPwy}4?M!g{j#@raoaviR3FRs`**0 z{Gkfn+KUQ$16xglHLV=O^_$C9ABC2u_#q)at(S-4Dzl|8SJ-(e4jeW()EhBkFhhULsHWXxVQ9^hx~ow`MN - + From 73157ae3d38e1010d13c5374d0fea3e249b432da Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 2 Nov 2012 12:02:59 -0400 Subject: [PATCH 16/90] Allow each pipeline test the max of 10 hours to run The runtime of these tests is extremely variable -- sometimes they will complete almost instantly, other times they will wait in an LSF queue for 5-10+ hours. Minimize timeout errors by setting the timeout for these tests to the maximum of 10 hours. --- .../pipeline/DataProcessingPipelineTest.scala | 4 ++-- .../PacbioProcessingPipelineTest.scala | 2 +- .../examples/DevNullOutputPipelineTest.scala | 2 +- .../ExampleCountLociPipelineTest.scala | 2 +- .../ExampleCountReadsPipelineTest.scala | 2 +- .../ExampleReadFilterPipelineTest.scala | 2 +- .../ExampleRetryMemoryLimitPipelineTest.scala | 2 +- .../ExampleUnifiedGenotyperPipelineTest.scala | 8 ++++---- .../examples/HelloWorldPipelineTest.scala | 20 +++++++++---------- 9 files changed, 22 insertions(+), 22 deletions(-) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala index 944ef7977..60c9d9a59 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -28,7 +28,7 @@ import org.testng.annotations.Test import org.broadinstitute.sting.BaseTest class DataProcessingPipelineTest { - @Test + @Test(timeOut=36000000) def testSimpleBAM { val projectName = "test1" val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam" @@ -45,7 +45,7 @@ class DataProcessingPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testBWAPEBAM { val projectName = "test2" val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala index 3e9af3e68..dd07cbfdc 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala @@ -28,7 +28,7 @@ import org.testng.annotations.Test import org.broadinstitute.sting.BaseTest class PacbioProcessingPipelineTest { - @Test + @Test(timeOut=36000000) def testPacbioProcessingPipeline { val testOut = "exampleBAM.recal.bam" val spec = new PipelineTestSpec diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala index 92c40acb1..6bc6b56db 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala @@ -53,7 +53,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class DevNullOutputPipelineTest { - @Test + @Test(timeOut=36000000) def testDevNullOutput() { val spec = new PipelineTestSpec spec.name = "devnulloutput" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala index 9d885dda2..f52632a7f 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleCountLociPipelineTest { - @Test + @Test(timeOut=36000000) def testCountLoci() { val testOut = "count.out" val spec = new PipelineTestSpec diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala index 1b965d8d2..c23c12719 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleCountReadsPipelineTest { - @Test + @Test(timeOut=36000000) def testCountReads() { val spec = new PipelineTestSpec spec.name = "countreads" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala index c6e4c3507..4ffaf7b5c 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala @@ -77,7 +77,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleReadFilterPipelineTest { - @Test + @Test(timeOut=36000000) def testExampleReadFilter() { val spec = new PipelineTestSpec spec.name = "examplereadfilter" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala index a9a5928fc..0215a389c 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleRetryMemoryLimitPipelineTest { - @Test + @Test(timeOut=36000000) def testRetryMemoryLimit() { val spec = new PipelineTestSpec spec.name = "RetryMemoryLimit" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala index f6fcd7c12..67ac68c28 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleUnifiedGenotyperPipelineTest { - @Test + @Test(timeOut=36000000) def testUnifiedGenotyper() { val spec = new PipelineTestSpec spec.name = "unifiedgenotyper" @@ -51,7 +51,7 @@ class ExampleUnifiedGenotyperPipelineTest { Array("vcf_intervals", BaseTest.validationDataLocation + "intervalTest.1.vcf") ).asInstanceOf[Array[Array[Object]]] - @Test(dataProvider = "ugIntervals") + @Test(dataProvider = "ugIntervals", timeOut=36000000) def testUnifiedGenotyperWithIntervals(intervalsName: String, intervalsPath: String) { val spec = new PipelineTestSpec spec.name = "unifiedgenotyper_with_" + intervalsName @@ -64,7 +64,7 @@ class ExampleUnifiedGenotyperPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testUnifiedGenotyperNoGCOpt() { val spec = new PipelineTestSpec spec.name = "unifiedgenotyper_no_gc_opt" @@ -80,7 +80,7 @@ class ExampleUnifiedGenotyperPipelineTest { @DataProvider(name="resMemReqParams") def getResMemReqParam = Array(Array("mem_free"), Array("virtual_free")).asInstanceOf[Array[Array[Object]]] - @Test(dataProvider = "resMemReqParams") + @Test(dataProvider = "resMemReqParams", timeOut=36000000) def testUnifiedGenotyperResMemReqParam(reqParam: String) { val spec = new PipelineTestSpec spec.name = "unifiedgenotyper_" + reqParam diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala index a43727ba6..50fc529dd 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala @@ -28,7 +28,7 @@ import org.testng.annotations.Test import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} class HelloWorldPipelineTest { - @Test + @Test(timeOut=36000000) def testHelloWorld() { val spec = new PipelineTestSpec spec.name = "HelloWorld" @@ -37,7 +37,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithRunName() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithRunName" @@ -47,7 +47,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithMemoryLimit() { val spec = new PipelineTestSpec spec.name = "HelloWorldMemoryLimit" @@ -57,7 +57,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithPriority() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPriority" @@ -67,7 +67,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithLsfResource() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithLsfResource" @@ -77,7 +77,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithLsfResourceAndMemoryLimit() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithLsfResourceAndMemoryLimit" @@ -87,7 +87,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithLsfEnvironment() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithLsfEnvironment" @@ -97,7 +97,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithGridEngineResource() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithGridEngineResource" @@ -107,7 +107,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithGridEngineResourceAndMemoryLimit() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithGridEngineResourceAndMemoryLimit" @@ -117,7 +117,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithGridEngineEnvironment() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithGridEngineEnvironment" From 0ab4022f2301ffa5b68346835de64a5992356018 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Nov 2012 14:30:20 -0400 Subject: [PATCH 17/90] Final r119 tribble jar --- settings/repository/org.broad/tribble-119.jar | Bin 319935 -> 319935 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/settings/repository/org.broad/tribble-119.jar b/settings/repository/org.broad/tribble-119.jar index ab456938aba23540eb8aedbac454841f220c6bba..c74bea398312b68b2f27a60b2061ef2ebf076ff6 100644 GIT binary patch delta 2458 zcmYjRXHZpF5WeT$1*u9^bPS?05ersCQ9#X7U5Rv*ql5`<=ZnzsNGb$kJrVLJ5on z!9Wmn(TOJG#k}amSxKULuD8SgeV?oUHMVTB3fce`uZP&C&p37(H=ClJGrV9h;$8=5 z2{CKpd8VQ!$gv;IpviB=!Mfx{?Z}I>^@Bq1MlxU+_GSou0Q$SyvFwGw@vu);3 zUf}g?D=eTjniDt#G1zZ{1!7##J03q2d`?7r{8}H5Tf#nYf8@qb+zyY_aJzqV9LKn* zc^oTt0LR+h;e*VPQJB@UNS&TKyZYy=>DoO~6LQc>8@qp*p80WMFiQ}uS)wqEoe;*c zlbWtW`>^Sn!>+>+&z`x!N%>~fTB^2YM6nCHmlsD2LbITJI+}0Fw{Wf2*92mgHup~* zr?~gAwJ{gvOxd=-y-Wqc%T&WEMru5sSZb?Y1)B00>-+U>fci8623#=Nt^jPZLfA?J zwUUx!z#f~Z2iYlw*~{+3nZA6_ni*lJ*_s6!<6-h>R*wqGr$Dlot0S19@&Ddiah=A3 zhGO37`D`~1qgbwD@kn1uTf?66!_jbd8T-jDQEWCwqNCUjR8e`lgRR4ZyflW*LdTI9 z=EwUt{KV35N@e$9R)s5~_?=)+3^7+F>x7Po6c)=Bm_CN*&wL^DcbN!6mx){*3jBhFa>Gk}!)4A` zEV^NS?_%*Ie}LyJ#X>$@#(L42m+ILedgEuKIGz&k^DcVHq9Yg9>8e=4pG3?Nn=vG} z=ZkYOcZ5zn{TUvz+Qmj2JRIv2_3?wkS93;3B}fNpq(>=d-FpQ==oRP_()k-G`-e(~ zeDb(q(guEbZ6gh_#*2EBrP26hDGt-6nLHFcLrUUr^vO-~=FjNqDP81;CqB|OPCD6N zvY3SZGj>Wd@b<%<(nB7ph?S0yM$hULshfK|&r20tjUm5D!>H9YX)m%@F6B!>e1)Jy zI>~z`)kx-CgsW{*7{6}SEmd(|5C4!JIN+@k6ZnDWEU(QU_GMub&4llr+You zArk$|Zx9yVpu7n=uonHtav&V>eGc(in+wrsH|9bFqW4W$X7JySseJY(e1~4sJlKR7 zn@3qac@T`Y-!1S%JaLPrd3FoD(4LY{(;Uqom@c1K&(J{-#uIcDf1xAxz6GRGY619Q z{80g{K%7@dYE7zvk#cq+Oh9LMAtk#O!MA897m?jpM8yM(iCkteEWmhwF^#(6HiV*` z(4iLVxeXK1u)ae~qVK>uv|H}LDn#cJ@~ezS${8hq+bp+~kcjT3L{Bb-#pv%TrSV+v z!fLdW?vnrcT^i4)j2lpIfP)|_vI9?E2pMYDqs=%_f^m&trf)F=N^Qi|MESW z%(xP^pdDLD&+@F2;!CTDUse?@gIG;Ng;&!sCDo*WDyJ{r-(Ev~YHEnb_*%jPwN&qM zE#-evN6&Dnj(8i^lk(y9G;e7=@fzPiy8PGx-dMk+ftJhlKGjLPPxBZxQeJ2y=~CVZ zftYXCM7nKjqCRy^um=6k&D8HyGwJ-anbys-h5DXrA^rMW=w3)G&6PKRBig86bQ{&F zYa7tJo#r~yPWg}9iHBR&Ah+~5EJ delta 2458 zcmX|BXHZmE6n^);LtQ|M$m$S8#fSyFC<=r&AQ(}z8jZw&fCY^W1jV4yM9qvM!mRO) z!2-rIqsGB3@D$6^6cwcRz9b5k(a|8*klb@G$q)EWyQh5Tyww-k>5J?v7S5BvTo6nJ zAt5NuVyqY+lr}S6bn#28b;kdFpVI$YCo++OE|er2A+{P5h8?OVOVsm*7FZ$fbaj^y zbH<+`6t!V){iudbd?Q-9_(l3&Twb}s2L!z0K-ku@0or5+K)_+S+8Y!|Hgw155|7R zuq+-JR&S3nv560g^lf+<(wT202>2JMHEeg(M(iD|OWM8I$Ry*q;B2LZhMv0XiCb7`&<6@vlIkBOD!pL)cQQO(^b6ESTY~!`}IvIT}`MdBTRNK z0Ci3TS!PNzDS4)(+ky)6^pnIz_8v-%QJM597s;s{cU49Z7Aa*-?75i`~Q z`=)NsDI{zN#vKbH+mT_#ZaGOdK_PP`dBTK4G2{Z)lRaX|EcC?1l6YiMc@j@np&&0x zBs0-)Adv*K`gK2&Ok9MreLtzd6H$VXkjG|pZftfs8rGd8Ni3u%gY0laOLRFYL8=tD z+5y+adSdH@7JV-{I3$}il zyW<1Bw-2CA3s=jj7UCG?vPx93D%Y(=HBzJ04HqwsWOUDXF7z8L1i@e-S4IJQV5?m7 z6JJ|k=UEFxFN(^G1>y%&6rL>;3oTK|S}nQ{LZK&K^mj(V?WkC9go4p=(ajcx>6gVa z6$)aW*n}>*O)t(t!?1ku*e59Dw22LB6jJ&`V=D}yTXV-2%R$=9M|yPfl=llk5MBuU z6Uq-URdx@N%-G}y9i%l(=v7OD>@kzkL}>(mS&HjaX$EtJPm|Kwjy`%x{%noDzS21+ zJPweqFlvs6NVXHOepZ4s4PQS@kRCEmS(0>k1SVc_QtDzB-!oDfV`I>7k^`X2^@_9; zyI0QZr7(6vP$K=zYNl67Hta4hw@T6M`QR?8f?)8&Kcr4q6iO`Mdq&h;HS{u8y*x0$HKzpaI&@kA_+9-|!9e!X+;%ew;nG@GSA2xP=%Lb4v@sk_* zkIS57dPK?c@Gi+ouKY_fQ_gON;~0*|>>_7O0yBAYESO`aq*!>zCfl|R5**O-A_d%7 zH76O$S^ej~fF0{mx(E?0&#Fr>8)GvsL8!5q5G23+9X#mog*-tAKD2jvqYfq`X6OKy zEH~=F8&RDN%MfF;VGUwSHhhKXo&!q|59Po@#Eu;J8qq5k<{=))<@VNG_!9N$3M@kW zNr5jATNGH0=zo>_)35S(Jy&5p+Lv78Sa^-cjmv{bw5Q}j4C1>y-cRIph(o>MI;=zV zzX6L)|NAkOPu+lTaZ1aZumLgYCXJ%la}&ZbI`|d@BObX$r=fH|y#;=#Ptx;g4(SJ` z)6=f0e_B3-A*SY2KgH+yy!*fc&QeAJ1mOEe1+WzHvqH|+geq{9a|>Y{8oLU4uxAl` zgZl9zuJ;wuAk)OMPz=QfN-#r_WOH0}d$*P|KDLhXLi7Jn2mYA9q>dk#XFac*S39={uSyqJ{H!6Oq0j*2?R}wbDG) zU(-6kcN?9H>Id3*e0Lk~!?T^&p_dYb{-$%SV%Lpr&h*~#-01B77w Id;`S(3mt0_!vFvP From eae2d019cf6f9b5b359a3bbc0b0066f490464f44 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Fri, 2 Nov 2012 19:01:59 -0400 Subject: [PATCH 18/90] Refuse to package the GATK from a non-clean working directory Packaging from a non-clean working directory can result in an incorrect jar. Now that we have external collaborators packaging and distributing the GATK, not enforcing the clean requirement has become far too dangerous. At the same time, invoking "clean" automatically through a direct dependency would also be dangerous -- instead, it's better to error out if a packaging target is invoked from a non-clean working dir. --- build.xml | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/build.xml b/build.xml index c6b1afc56..aa5792419 100644 --- a/build.xml +++ b/build.xml @@ -891,7 +891,7 @@ - + @@ -1011,6 +1011,24 @@ + + + + + + + + + + + + + + + + + + From b07106b3a74505d827a1041870d51616d4a6a24f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 6 Nov 2012 14:39:58 -0800 Subject: [PATCH 21/90] Reimplement the allele biased downsampling to be smarter. Now we don't blindly pull n% of reads off of each allele. Instead, we try all possible genotype conformations for the contaminating sample and choose the one that provides the best genotype for the target sample (based heuristically on allele balance). This method allows us to save some of the reads that belong to the target sample, which should make Daniel M happy. Added unit tests to test the biased downsampling functionality. --- .../AlleleBiasedDownsamplingUtils.java | 114 ++++++++++++++++-- ...AlleleBiasedDownsamplingUtilsUnitTest.java | 108 +++++++++++++++++ 2 files changed, 209 insertions(+), 13 deletions(-) create mode 100755 protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index 59357e1c4..1a7b4da51 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -67,7 +67,6 @@ public class AlleleBiasedDownsamplingUtils { alleleStratifiedElements[baseIndex].add(pe); } - // Down-sample *each* allele by the contamination fraction applied to the entire pileup. // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor final TreeSet elementsToKeep = new TreeSet(new Comparator() { @@ -78,12 +77,21 @@ public class AlleleBiasedDownsamplingUtils { } }); + // make a listing of allele counts + final int[] alleleCounts = new int[4]; + for ( int i = 0; i < 4; i++ ) + alleleCounts[i] = alleleStratifiedElements[i].size(); + + // do smart down-sampling + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + for ( int i = 0; i < 4; i++ ) { final ArrayList alleleList = alleleStratifiedElements[i]; - if ( alleleList.size() <= numReadsToRemove ) - logAllElements(alleleList, log); + // if we don't need to remove any reads, keep them all + if ( alleleList.size() <= targetAlleleCounts[i] ) + elementsToKeep.addAll(alleleList); else - elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log)); + elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log)); } // clean up pointers so memory can be garbage collected if needed @@ -93,6 +101,59 @@ public class AlleleBiasedDownsamplingUtils { return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); } + private static int scoreAlleleCounts(final int[] alleleCounts) { + final int maxIndex = MathUtils.maxElementIndex(alleleCounts); + final int maxCount = alleleCounts[maxIndex]; + + int nonMaxCount = 0; + for ( int i = 0; i < 4; i++ ) { + if ( i != maxIndex ) + nonMaxCount += alleleCounts[i]; + } + + // try to get the best score: in the het case the counts should be equal and in the hom case the non-max should be zero + return Math.min(Math.abs(maxCount - nonMaxCount), Math.abs(nonMaxCount)); + } + + /** + * Computes an allele biased version of the given pileup + * + * @param alleleCounts the original pileup + * @param numReadsToRemove fraction of total reads to remove per allele + * @return allele biased pileup + */ + protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { + final int numAlleles = alleleCounts.length; + + int maxScore = scoreAlleleCounts(alleleCounts); + int[] alleleCountsOfMax = alleleCounts; + + final int numReadsToRemovePerAllele = numReadsToRemove / 2; + + for ( int i = 0; i < numAlleles; i++ ) { + for ( int j = i; j < numAlleles; j++ ) { + final int[] newCounts = alleleCounts.clone(); + + // split these cases so we don't lose on the floor (since we divided by 2) + if ( i == j ) { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove); + } else { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele); + newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele); + } + + final int score = scoreAlleleCounts(newCounts); + + if ( score < maxScore ) { + maxScore = score; + alleleCountsOfMax = newCounts; + } + } + } + + return alleleCountsOfMax; + } + /** * Performs allele biased down-sampling on a pileup and computes the list of elements to keep * @@ -102,7 +163,15 @@ public class AlleleBiasedDownsamplingUtils { * @return the list of pileup elements TO KEEP */ private static List downsampleElements(final ArrayList elements, final int numElementsToRemove, final PrintStream log) { + if ( numElementsToRemove == 0 ) + return elements; + final int pileupSize = elements.size(); + if ( numElementsToRemove == pileupSize ) { + logAllElements(elements, log); + return new ArrayList(0); + } + final BitSet itemsToRemove = new BitSet(pileupSize); for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); @@ -132,15 +201,25 @@ public class AlleleBiasedDownsamplingUtils { for ( final List reads : alleleReadMap.values() ) totalReads += reads.size(); - // Down-sample *each* allele by the contamination fraction applied to the entire pileup. int numReadsToRemove = (int)(totalReads * downsamplingFraction); - final List readsToRemove = new ArrayList(numReadsToRemove * alleleReadMap.size()); - for ( final List reads : alleleReadMap.values() ) { - if ( reads.size() <= numReadsToRemove ) { - readsToRemove.addAll(reads); - logAllReads(reads, log); - } else { - readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log)); + + // make a listing of allele counts + final List alleles = new ArrayList(alleleReadMap.keySet()); + alleles.remove(Allele.NO_CALL); // ignore the no-call bin + final int numAlleles = alleles.size(); + final int[] alleleCounts = new int[numAlleles]; + for ( int i = 0; i < numAlleles; i++ ) + alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); + + // do smart down-sampling + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + + final List readsToRemove = new ArrayList(numReadsToRemove); + for ( int i = 0; i < numAlleles; i++ ) { + final List alleleBin = alleleReadMap.get(alleles.get(i)); + + if ( alleleBin.size() > targetAlleleCounts[i] ) { + readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log)); } } @@ -156,13 +235,22 @@ public class AlleleBiasedDownsamplingUtils { * @return the list of pileup elements TO REMOVE */ private static List downsampleReads(final List reads, final int numElementsToRemove, final PrintStream log) { + final ArrayList readsToRemove = new ArrayList(numElementsToRemove); + + if ( numElementsToRemove == 0 ) + return readsToRemove; + final int pileupSize = reads.size(); + if ( numElementsToRemove == pileupSize ) { + logAllReads(reads, log); + return reads; + } + final BitSet itemsToRemove = new BitSet(pileupSize); for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); } - ArrayList readsToRemove = new ArrayList(pileupSize - numElementsToRemove); for ( int i = 0; i < pileupSize; i++ ) { if ( itemsToRemove.get(i) ) { final GATKSAMRecord read = reads.get(i); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java new file mode 100755 index 000000000..7c2f5619a --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + + +/** + * Basic unit test for AlleleBiasedDownsamplingUtils + */ +public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { + + + @Test + public void testSmartDownsampling() { + + final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; + final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; + + // no contamination, no removal + testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // het sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // hom sample, het contaminant, overlapping alleles + final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; + testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + + // hom sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); + + // het sample, het contaminant, overlapping alleles + testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); + testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + } + + private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, + final int pileupSize, final int[] initialCounts, final int[] targetCounts) { + + final int[] actualCounts = initialCounts.clone(); + actualCounts[0] += addA; + actualCounts[1] += addC; + actualCounts[2] += addG; + actualCounts[3] += addT; + + final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); + Assert.assertTrue(countsAreEqual(actualCounts, targetCounts)); + } + + private static boolean countsAreEqual(final int[] counts1, final int[] counts2) { + for ( int i = 0; i < 4; i++ ) { + if ( counts1[i] != counts2[i] ) + return false; + } + return true; + } +} From 0a2dded09395f7f09f157f86791ad92433403d47 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 6 Nov 2012 16:07:40 -0800 Subject: [PATCH 22/90] Fixes for bugs uncovered by unit tests --- .../AlleleBiasedDownsamplingUtils.java | 25 ++++++++++++------- ...AlleleBiasedDownsamplingUtilsUnitTest.java | 2 +- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index 1a7b4da51..a61614481 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -102,17 +102,24 @@ public class AlleleBiasedDownsamplingUtils { } private static int scoreAlleleCounts(final int[] alleleCounts) { - final int maxIndex = MathUtils.maxElementIndex(alleleCounts); - final int maxCount = alleleCounts[maxIndex]; + if ( alleleCounts.length < 2 ) + return 0; - int nonMaxCount = 0; - for ( int i = 0; i < 4; i++ ) { - if ( i != maxIndex ) - nonMaxCount += alleleCounts[i]; - } + // sort the counts (in ascending order) + final int[] alleleCountsCopy = alleleCounts.clone(); + Arrays.sort(alleleCountsCopy); - // try to get the best score: in the het case the counts should be equal and in the hom case the non-max should be zero - return Math.min(Math.abs(maxCount - nonMaxCount), Math.abs(nonMaxCount)); + final int maxCount = alleleCountsCopy[alleleCounts.length - 1]; + final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2]; + + int remainderCount = 0; + for ( int i = 0; i < alleleCounts.length - 2; i++ ) + remainderCount += alleleCountsCopy[i]; + + // try to get the best score: + // - in the het case the counts should be equal with nothing else + // - in the hom case the non-max should be zero + return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount)); } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java index 7c2f5619a..be19d3ef4 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -95,7 +95,7 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { actualCounts[3] += addT; final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); - Assert.assertTrue(countsAreEqual(actualCounts, targetCounts)); + Assert.assertTrue(countsAreEqual(results, targetCounts)); } private static boolean countsAreEqual(final int[] counts1, final int[] counts2) { From 15b8c08132c59cfa08ed0e7b5a08f1d893da8029 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 6 Nov 2012 20:53:33 -0800 Subject: [PATCH 23/90] Apparently CIGAR elements can have 0 length according to the spec, but 0Ms were causing left alignment of indels to fail. Fixed. --- .../src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 4f1e66ba2..585578958 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -646,7 +646,7 @@ public class AlignmentUtils { // get the indel element and move it left one base CigarElement ce = cigar.getCigarElement(indexOfIndel - 1); - elements.add(new CigarElement(ce.getLength() - 1, ce.getOperator())); + elements.add(new CigarElement(Math.max(ce.getLength() - 1, 0), ce.getOperator())); elements.add(cigar.getCigarElement(indexOfIndel)); if (indexOfIndel + 1 < cigar.numCigarElements()) { ce = cigar.getCigarElement(indexOfIndel + 1); From 2da76db9452c50277e1269c3fa4d8248e97dcd8a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 6 Nov 2012 22:23:05 -0800 Subject: [PATCH 24/90] Updating integration tests --- .../UnifiedGenotyperIntegrationTest.java | 68 +++++++++---------- .../HaplotypeCallerIntegrationTest.java | 18 ++--- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9212d0e53..d3e77e002 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("cdec335abc9ad8e59335e39a73e0e95a")); + Arrays.asList("847605f4efafef89529fe0e496315edd")); executeTest("test MultiSample Pilot1", spec); } @@ -38,7 +38,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("efddb5e258f97fd4f6661cff9eaa57de")); + Arrays.asList("5b31b811072a4df04524e13604015f9b")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -46,7 +46,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("24532eb381724cd74e99370da28d49ed")); + Arrays.asList("d9992e55381afb43742cc9b30fcd7538")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("062a946160eec1d0fc135d58ca654ff4")); + Arrays.asList("fea530fdc8677e10be4cc11625fa5376")); executeTest("test SingleSample Pilot2", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("94dc17d76d841f1d3a36160767ffa034")); + Arrays.asList("704888987baacff8c7b273b8ab9938d0")); executeTest("test Multiple SNP alleles", spec); } @@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("9106d01ca0d0a8fedd068e72d509f380")); + Arrays.asList("e14c9b1f9f34d6c16de445bfa385be89")); executeTest("test reverse trim", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("d847acf841ba8ba653f996ce4869f439")); + Arrays.asList("fb204e821a24d03bd3a671b6e01c449a")); executeTest("test mismatched PLs", spec); } @@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "6792419c482e767a3deb28913ed2b1ad"; + private final static String COMPRESSED_OUTPUT_MD5 = "5b8f477c287770b5769b05591e35bc2d"; @Test public void testCompressedOutput() { @@ -149,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("56157d930da6ccd224bce1ca93f11e41")); + Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff")); executeTest("test min_base_quality_score 26", spec); } @@ -157,7 +157,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("6ccb9bd88934e4272d0ce362dd35e603")); + Arrays.asList("55760482335497086458b09e415ecf54")); executeTest("test SLOD", spec); } @@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("480437dd6e2760f4ab3194431519f331")); + Arrays.asList("938e888a40182878be4c3cc4859adb69")); executeTest("test NDA", spec); } @@ -173,7 +173,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("22c039412fd387dde6125b07c9a74a25")); + Arrays.asList("7dc186d420487e4e156a24ec8dea0951")); executeTest("test using comp track", spec); } @@ -187,17 +187,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "40aeb4c9e31fe7046b72afc58e7599cb"); + testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "c706ca93b25ff83613cb4e95dcac567c"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "8a263fd0a94463ce1de9990f2b8ec841"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "81fff490c0f59890f1e75dc290833434"); } private void testOutputParameters(final String args, final String md5) { @@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("df524e98903d96ab9353bee7c16a69de")); + Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb")); executeTest("test confidence 1", spec1); } @@ -222,12 +222,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "8e61498ca03a8d805372a64c466b3b42" ); + testHeterozosity( 0.01, "8dd37249e0a80afa86594c3f1e720760" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "668d06b5173cf3b97d052726988e1d7b" ); + testHeterozosity( 1.0 / 1850, "040d169e20fda56f8de009a6015eb384" ); } private void testHeterozosity(final double arg, final String md5) { @@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("908eb5e21fa39e7fb377cf4a9c4c7835")); + Arrays.asList("0e4713e4aa44f4f8fcfea7138295a627")); executeTest(String.format("test multiple technologies"), spec); } @@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("c814558bb0ed2e19b12e1a2bf4465d52")); + Arrays.asList("46ea5d1ceb8eed1d0db63c3577915d6c")); executeTest(String.format("test calling with BAQ"), spec); } @@ -289,7 +289,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("3593495aab5f6204c65de0b073a6ff65")); + Arrays.asList("50329e15e5139be9e3b643f0b3ba8a53")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -304,7 +304,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("8b486a098029d5a106b0a37eff541c15")); + Arrays.asList("2b85e3bd6bf981afaf7324666740d74b")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("18efedc50cae2aacaba372265e38310b")); + Arrays.asList("a6fd46eff78827060451a62cffd698a7")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("3ff8c7c80a518aa3eb8671a21479de5f")); + Arrays.asList("b8129bf754490cc3c76191d8cc4ec93f")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -337,7 +337,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("578c0540f4f2052a634a829bcb9cc27d")); + Arrays.asList("591332fa0b5b22778cf820ee257049d2")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("f7d0d0aee603df25c1f0525bb8df189e")); + Arrays.asList("a4761d7f25e7a62f34494801c98a0da7")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("fc91d457a16b4ca994959c2b5f3f0352")); + Arrays.asList("c526c234947482d1cd2ffc5102083a08")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -407,7 +407,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("857b8e5df444463ac27f665c4f67fbe2")); + Arrays.asList("90adefd39ed67865b0cb275ad0f07383")); executeTest("test minIndelFraction 0.0", spec); } @@ -415,7 +415,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("81d4c7d9010fd6733b2997bc378e7471")); + Arrays.asList("2fded43949e258f8e9f68893c61c1bdd")); executeTest("test minIndelFraction 0.25", spec); } @@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1, - Arrays.asList("bd7984a374f0ae5d277bd5fc5065f64f")); + Arrays.asList("d6d40bacd540a41f305420dfea35e04a")); executeTest("test calling on reads with Ns in CIGAR", spec); } @@ -451,18 +451,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("9a7cd58b9e3d5b72608c0d529321deba")); + Arrays.asList("c1077662411164182c5f75478344f83d")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "e7fc11baf208a1bca7b462d3148c936e"); + testReducedCalling("SNP", "f5ccbc96d0d66832dd9b3c5cb6507db4"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "132a4e0ccf9230b5bb4b56c649e2bdd5"); + testReducedCalling("INDEL", "3c02ee5187933bed44dc416a2e28511f"); } @@ -483,7 +483,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testContaminationDownsampling() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_to_filter 0.20", 1, - Arrays.asList("27dd04159e06d9524fb8a4eef41f96ae")); + Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb")); executeTest("test contamination_percentage_to_filter 0.20", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index d00f5b61d..6828dbcb5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,17 +21,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "aa1df35d6e64d7ca93feb4d2dd15dd0e"); + HCTest(CEUTRIO_BAM, "", "56aa4b84606b6b0b7dc78a383974d1b3"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "186c7f322978283c01249c6de2829215"); + HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f"); } @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "de9e78a52207fe62144dba5337965469"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "39da622b309597d7a0b082c8aa1748c9"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "000dbb1b48f94d017cfec127c6cabe8f"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "d86fae2d1b504b422b7b0cfbbdecc2c4"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "7fbc6b9e27e374f2ffe4be952d88c7c6"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -64,20 +64,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "b369c2a6cb5c99a424551b33bae16f3b"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); } @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("f6326adfdf5bc147626b30a89ce06d56")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("788176e1717bd28fc7cbc8e3efbb6100")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6c67ee8e99cc8f53a6587bb26028047")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("96ab8253d242b851ccfc218759f79784")); executeTest("HCTestStructuralIndels: ", spec); } @@ -91,7 +91,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("4beb9f87ab3f316a9384c3d0dca6ebe9")); + Arrays.asList("425f1a0fb00d7145edf1c55e54346fae")); executeTest("HC calling on a ReducedRead BAM", spec); } } From 17ab3a39d55ce389c45f24c81349263e8cd4dad7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 8 Nov 2012 14:35:23 -0500 Subject: [PATCH 25/90] Make the --intermediate_csv_file argument un-hidden. --- .../gatk/walkers/bqsr/RecalibrationArgumentCollection.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index fc7d8a8a4..e5704a1e2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -75,8 +75,9 @@ public class RecalibrationArgumentCollection { /** * If not provided, then a temporary file is created and then deleted upon completion. + * For advanced users only. */ - @Hidden + @Advanced @Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false) public File RECAL_CSV_FILE = null; From e9183d9fe0ada286ed91bced4feb1e004e10ad56 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 8 Nov 2012 15:07:47 -0500 Subject: [PATCH 26/90] Fix bugs as reported on the forum: BED needs to be explicitly set as the default output format and the output didn't actually adhere to the BED spec. --- .../sting/gatk/walkers/coverage/CallableLoci.java | 4 ++-- .../coverage/CallableLociWalkerIntegrationTest.java | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java index 58ddd0879..48019efea 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java @@ -191,7 +191,7 @@ public class CallableLoci extends LocusWalker Date: Fri, 9 Nov 2012 08:33:55 -0500 Subject: [PATCH 27/90] Fixed nasty bug in BQSR csv file creation: numbers larger than 999 in the Errors column were printed out with commas (which looks like a separate column). This wasn't caught earlier because there are no integration tests covering the csv. I'll add one into unstable in a sec. --- .../broadinstitute/sting/utils/recalibration/RecalDatum.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index e3348d3de..207988749 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -199,7 +199,7 @@ public class RecalDatum { @Override public String toString() { - return String.format("%.2f,%,2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); + return String.format("%.2f,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); } public String stringForCSV() { From e93d46191004f554bdd75b42d07b098241b9715e Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 9 Nov 2012 09:11:04 -0500 Subject: [PATCH 28/90] Adding integration test to BQSR for the csv file --- .../gatk/walkers/bqsr/BQSRIntegrationTest.java | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index b839382dc..f6ec47760 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -90,6 +90,21 @@ public class BQSRIntegrationTest extends WalkerTest { executeTest("testBQSRFailWithoutDBSNP", spec); } + @Test + public void testBQSRCSV() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + " -T BaseRecalibrator" + + " -R " + b36KGReference + + " -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" + + " -knownSites " + b36dbSNP129 + + " -L 1:10,000,000-10,200,000" + + " -o /dev/null" + + " --plot_pdf_file /dev/null" + + " --intermediate_csv_file %s", + Arrays.asList("d1c38a3418979400630e2bca1140689c")); + executeTest("testBQSR-CSVfile", spec); + } + @Test public void testBQSRFailWithSolidNoCall() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( From 525cf331f4e6568479582c44f7c7b8912cdf7773 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 13 Nov 2012 11:52:47 -0500 Subject: [PATCH 29/90] Don't catch a User Error and re-throw as a Reviewed Exception. That makes Eric unhappy. --- .../sting/gatk/io/storage/VariantContextWriterStorage.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java index 31f6d5954..8e4633869 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java @@ -198,8 +198,6 @@ public class VariantContextWriterStorage implements Storage Date: Tue, 13 Nov 2012 11:53:12 -0500 Subject: [PATCH 30/90] Having a malformed GATK report is a User Error --- .../broadinstitute/sting/gatk/report/GATKReportVersion.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java index b5a5e0443..b51fb17f0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.report; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; public enum GATKReportVersion { /** @@ -91,6 +92,6 @@ public enum GATKReportVersion { if (header.startsWith("#:GATKReport.v1.1")) return GATKReportVersion.V1_1; - throw new ReviewedStingException("Unknown GATK report version in header: " + header); + throw new UserException.BadInput("The GATK report has an unknown/unsupported version in the header: " + header); } } From ba41f65759724b650d8996185bc4761f50bda466 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 13 Nov 2012 11:53:39 -0500 Subject: [PATCH 31/90] Protect against NPEs in SelectVariants by checking for missing Genotypes --- .../sting/gatk/walkers/variantutils/SelectVariants.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index d1b7cb96f..d28fe34d6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -659,7 +659,10 @@ public class SelectVariants extends RodWalker implements TreeR return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED))); } - private boolean haveSameGenotypes(Genotype g1, Genotype g2) { + private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) { + if ( g1 == null || g2 == null ) + return false; + if ((g1.isCalled() && g2.isFiltered()) || (g2.isCalled() && g1.isFiltered()) || (g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED)) From 6d59dd34558fb6cc564eabab9b6c5a278a341a26 Mon Sep 17 00:00:00 2001 From: kshakir Date: Sun, 4 Nov 2012 23:42:02 -0500 Subject: [PATCH 32/90] Scala classes were only returning direct subclasses (confirmed when inspected in debugger) so changed PluginManager to allow specifying the explicit subclass. Removed some generics from PluginManager for now until able to figure out syntax for requesting explicit subclass. QStatusMessenger uses a slightly more primitive Map[String, Seq[RemoteFile]] instead of Map[ArgumentSource, Seq[RemoteFile]]. Added an QCommandPlugin.initScript utility method for handling specialized script types. --- .../org/broadinstitute/sting/gatk/WalkerManager.java | 4 ++-- .../sting/utils/classloader/PluginManager.java | 11 ++++++----- .../org/broadinstitute/sting/queue/QCommandLine.scala | 11 ++++++++--- .../broadinstitute/sting/queue/QCommandPlugin.scala | 2 ++ .../src/org/broadinstitute/sting/queue/QScript.scala | 8 ++++++-- .../sting/queue/engine/QStatusMessenger.scala | 3 +-- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index fbacbddc4..28b5f918d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -350,11 +350,11 @@ public class WalkerManager extends PluginManager { * @return A name for this type of walker. */ @Override - public String getName(Class walkerType) { + public String getName(Class walkerType) { String walkerName = ""; if (walkerType.getAnnotation(WalkerName.class) != null) - walkerName = walkerType.getAnnotation(WalkerName.class).value().trim(); + walkerName = ((WalkerName)walkerType.getAnnotation(WalkerName.class)).value().trim(); else walkerName = super.getName(walkerType); diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 43cc800d8..b39aae8ab 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -101,7 +101,7 @@ public class PluginManager { * Create a new plugin manager. * @param pluginType Core type for a plugin. */ - public PluginManager(Class pluginType) { + public PluginManager(Class pluginType) { this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), null); } @@ -110,7 +110,7 @@ public class PluginManager { * @param pluginType Core type for a plugin. * @param classpath Custom class path to search for classes. */ - public PluginManager(Class pluginType, List classpath) { + public PluginManager(Class pluginType, List classpath) { this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), classpath); } @@ -120,7 +120,7 @@ public class PluginManager { * @param pluginCategory Provides a category name to the plugin. Must not be null. * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. */ - public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) { + public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) { this(pluginType, pluginCategory, pluginSuffix, null); } @@ -131,7 +131,7 @@ public class PluginManager { * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. * @param classpath Custom class path to search for classes. */ - public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List classpath) { + public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List classpath) { this.pluginCategory = pluginCategory; this.pluginSuffix = pluginSuffix; @@ -149,6 +149,7 @@ public class PluginManager { } // Load all classes types filtering them by concrete. + @SuppressWarnings("unchecked") Set> allTypes = reflections.getSubTypesOf(pluginType); for( Class type: allTypes ) { // The plugin manager does not support anonymous classes; to be a plugin, a class must have a name. @@ -325,7 +326,7 @@ public class PluginManager { * @param pluginType The type of plugin. * @return A name for this type of plugin. */ - public String getName(Class pluginType) { + public String getName(Class pluginType) { String pluginName = ""; if (pluginName.length() == 0) { diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 65abaf7be..637174557 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -92,13 +92,19 @@ class QCommandLine extends CommandLineProgram with Logging { private lazy val qScriptPluginManager = { qScriptClasses = IOUtils.tempDir("Q-Classes-", "", settings.qSettings.tempDirectory) qScriptManager.loadScripts(scripts, qScriptClasses) - new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL)) + new PluginManager[QScript](qPluginType, Seq(qScriptClasses.toURI.toURL)) } private lazy val qCommandPlugin = { new PluginManager[QCommandPlugin](classOf[QCommandPlugin]) } + private lazy val allCommandPlugins = qCommandPlugin.createAllTypes() + + private lazy val qPluginType: Class[_ <: QScript] = { + allCommandPlugins.map(_.qScriptClass).headOption.getOrElse(classOf[QScript]) + } + /** * Takes the QScripts passed in, runs their script() methods, retrieves their generated * functions, and then builds and runs a QGraph based on the dependencies. @@ -106,8 +112,6 @@ class QCommandLine extends CommandLineProgram with Logging { def execute = { ClassFieldCache.parsingEngine = this.parser - val allCommandPlugins = qCommandPlugin.createAllTypes() - if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory)) @@ -138,6 +142,7 @@ class QCommandLine extends CommandLineProgram with Logging { for (script <- allQScripts) { logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) loadArgumentsIntoObject(script) + allCommandPlugins.foreach(_.initScript(script)) // TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now. //if (settings.run) script.pullInputs() diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala index 499c31554..eae6a6a92 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandPlugin.scala @@ -6,4 +6,6 @@ import util.RemoteFileConverter trait QCommandPlugin { def statusMessenger: QStatusMessenger = null def remoteFileConverter: RemoteFileConverter = null + def qScriptClass: Class[_ <: QScript] = classOf[QScript] + def initScript(script: QScript) {} } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 8c834696c..eb8be183a 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -149,13 +149,17 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon * List out the remote outputs * @return the RemoteFile outputs by argument source */ - def remoteInputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(inputFields) + def remoteInputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(inputFields)) /** * List out the remote outputs * @return the RemoteFile outputs by argument source */ - def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(outputFields) + def remoteOutputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(outputFields)) + + private def tagMap(remoteFieldMap: Map[ArgumentSource, Seq[RemoteFile]]): Map[String, Seq[RemoteFile]] = { + remoteFieldMap.collect{ case (k, v) => ClassFieldCache.fullName(k) -> v }.toMap + } private def remoteFieldMap(fields: Seq[ArgumentSource]): Map[ArgumentSource, Seq[RemoteFile]] = { fields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala index c4151dafc..a1133b944 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala @@ -1,6 +1,5 @@ package org.broadinstitute.sting.queue.engine -import org.broadinstitute.sting.commandline.ArgumentSource import org.broadinstitute.sting.queue.util.RemoteFile /** @@ -8,7 +7,7 @@ import org.broadinstitute.sting.queue.util.RemoteFile */ trait QStatusMessenger { def started() - def done(inputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]], outputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]]) + def done(inputs: Seq[Map[String, Seq[RemoteFile]]], outputs: Seq[Map[String, Seq[RemoteFile]]]) def exit(message: String) def started(job: String) From a17cd54b6883427454c6ae0bb379fee9ef7f460b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 24 Oct 2012 16:57:08 -0400 Subject: [PATCH 36/90] Co-Reduction implementation in ReduceReads ReduceReads now co-reduces bams if they're passed in toghether with multiple -I. Co-reduction forces every variant region in one sample to be a variant region in all samples. Also: * Added integrationtest for co-reduction * Fixed bug with new no-recalculation implementation of the marksites object where the last object wasn't being removed after finalizing a variant region (updated MD5's accordingly) DEV-200 #resolve #time 8m --- .../reducereads/CompressionStash.java | 38 ++++++++ .../reducereads/MultiSampleCompressor.java | 49 ++++++---- .../compression/reducereads/ReduceReads.java | 2 +- .../reducereads/SingleSampleCompressor.java | 38 ++++---- .../reducereads/SlidingWindow.java | 89 ++++++++++--------- .../ReduceReadsIntegrationTest.java | 10 +-- .../reducereads/SimpleGenomeLoc.java | 43 +++++++++ 7 files changed, 185 insertions(+), 84 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java index 714a4df18..a6e5b6c5b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.utils.GenomeLocComparator; +import java.util.Collection; import java.util.TreeSet; /** @@ -18,4 +19,41 @@ public class CompressionStash extends TreeSet { public CompressionStash() { super(new GenomeLocComparator()); } + + /** + * Adds a SimpleGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc + * in the stash. + * + * @param insertLoc the new loc to be inserted + * @return true if the loc, or it's merged version, wasn't present in the list before. + */ + @Override + public boolean add(SimpleGenomeLoc insertLoc) { + TreeSet removedLocs = new TreeSet(); + for (SimpleGenomeLoc existingLoc : this) { + if (existingLoc.isPast(insertLoc)) { + break; // if we're past the loc we're done looking for overlaps. + } + if (existingLoc.equals(insertLoc)) { + return false; // if this loc was already present in the stash, we don't need to insert it. + } + if (existingLoc.contiguousP(insertLoc)) { + removedLocs.add(existingLoc); // list the original loc for merging + } + } + for (SimpleGenomeLoc loc : removedLocs) { + this.remove(loc); // remove all locs that will be merged + } + removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged + return super.add(SimpleGenomeLoc.merge(removedLocs)); // merge them all into one loc and add to the stash + } + + @Override + public boolean addAll(Collection locs) { + boolean result = false; + for (SimpleGenomeLoc loc : locs) { + result |= this.add(loc); + } + return result; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 2c3439010..f348225ca 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -3,13 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.HashMap; import java.util.Map; -import java.util.SortedSet; +import java.util.Set; import java.util.TreeSet; /* @@ -41,7 +42,7 @@ import java.util.TreeSet; * * @author depristo */ -public class MultiSampleCompressor implements Compressor { +public class MultiSampleCompressor { protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class); protected Map compressorsPerSample = new HashMap(); @@ -55,30 +56,44 @@ public class MultiSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction, - final CompressionStash compressionStash) { + final boolean allowPolyploidReduction) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction, compressionStash)); + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction)); } } - @Override - public Iterable addAlignment(GATKSAMRecord read) { - String sample = read.getReadGroup().getSample(); - SingleSampleCompressor compressor = compressorsPerSample.get(sample); + public Set addAlignment(GATKSAMRecord read) { + String sampleName = read.getReadGroup().getSample(); + SingleSampleCompressor compressor = compressorsPerSample.get(sampleName); if ( compressor == null ) - throw new ReviewedStingException("No compressor for sample " + sample); - return compressor.addAlignment(read); + throw new ReviewedStingException("No compressor for sample " + sampleName); + Pair, CompressionStash> readsAndStash = compressor.addAlignment(read); + Set reads = readsAndStash.getFirst(); + CompressionStash regions = readsAndStash.getSecond(); + + reads.addAll(closeVariantRegionsInAllSamples(regions)); + + return reads; } - @Override - public Iterable close() { - SortedSet reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); - for ( SingleSampleCompressor comp : compressorsPerSample.values() ) - for ( GATKSAMRecord read : comp.close() ) - reads.add(read); + public Set close() { + Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + for ( SingleSampleCompressor sample : compressorsPerSample.values() ) { + Pair, CompressionStash> readsAndStash = sample.close(); + reads = readsAndStash.getFirst(); + } + return reads; + } + + private Set closeVariantRegionsInAllSamples(CompressionStash regions) { + Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + if (!regions.isEmpty()) { + for (SingleSampleCompressor sample : compressorsPerSample.values()) { + reads.addAll(sample.closeVariantRegions(regions)); + } + } return reads; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index b6761f4a6..a05992cb4 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -330,7 +330,7 @@ public class ReduceReads extends ReadWalker, ReduceRea */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION, compressionStash)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION)); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index 82a433300..ac3388795 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -1,8 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.Set; import java.util.TreeSet; /** @@ -10,7 +12,7 @@ import java.util.TreeSet; * @author carneiro, depristo * @version 3.0 */ -public class SingleSampleCompressor implements Compressor { +public class SingleSampleCompressor { final private int contextSize; final private int downsampleCoverage; final private int minMappingQuality; @@ -20,11 +22,11 @@ public class SingleSampleCompressor implements Compressor { final private ReduceReads.DownsampleStrategy downsampleStrategy; final private int nContigs; final private boolean allowPolyploidReduction; - final CompressionStash compressionStash; private SlidingWindow slidingWindow; private int slidingWindowCounter; + public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new TreeSet(), new CompressionStash()); public SingleSampleCompressor(final int contextSize, final int downsampleCoverage, @@ -34,8 +36,7 @@ public class SingleSampleCompressor implements Compressor { final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy, final int nContigs, - final boolean allowPolyploidReduction, - final CompressionStash compressionStash) { + final boolean allowPolyploidReduction) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; @@ -46,15 +47,11 @@ public class SingleSampleCompressor implements Compressor { this.downsampleStrategy = downsampleStrategy; this.nContigs = nContigs; this.allowPolyploidReduction = allowPolyploidReduction; - this.compressionStash = compressionStash; } - /** - * @{inheritDoc} - */ - @Override - public Iterable addAlignment( GATKSAMRecord read ) { - TreeSet result = new TreeSet(new AlignmentStartWithNoTiesComparator()); + public Pair, CompressionStash> addAlignment( GATKSAMRecord read ) { + Set reads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + CompressionStash stash = new CompressionStash(); int readOriginalStart = read.getUnclippedStart(); // create a new window if: @@ -63,22 +60,27 @@ public class SingleSampleCompressor implements Compressor { (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window // close the current sliding window - result.addAll(slidingWindow.close()); + Pair, CompressionStash> readsAndStash = slidingWindow.close(); + reads = readsAndStash.getFirst(); + stash = readsAndStash.getSecond(); slidingWindow = null; // so we create a new one on the next if } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction, compressionStash); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction); slidingWindowCounter++; } - result.addAll(slidingWindow.addRead(read)); - return result; + stash.addAll(slidingWindow.addRead(read)); + return new Pair, CompressionStash>(reads, stash); } - @Override - public Iterable close() { - return (slidingWindow != null) ? slidingWindow.close() : new TreeSet(); + public Pair, CompressionStash> close() { + return (slidingWindow != null) ? slidingWindow.close() : emptyPair; + } + + public Set closeVariantRegions(CompressionStash regions) { + return slidingWindow.closeVariantRegions(regions); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 24cacd997..24a3ba3cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -6,8 +6,10 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -55,7 +57,8 @@ public class SlidingWindow { private final int nContigs; private boolean allowPolyploidReductionInGeneral; - private CompressionStash compressionStash; + + private static CompressionStash emptyRegions = new CompressionStash(); /** * The types of synthetic reads to use in the finalizeAndAdd method @@ -87,7 +90,7 @@ public class SlidingWindow { } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction, CompressionStash compressionStash) { + public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; @@ -124,7 +127,6 @@ public class SlidingWindow { this.nContigs = nContigs; this.allowPolyploidReductionInGeneral = allowPolyploidReduction; - this.compressionStash = compressionStash; } /** @@ -138,7 +140,7 @@ public class SlidingWindow { * @param read the read * @return a list of reads that have been finished by sliding the window. */ - public List addRead(GATKSAMRecord read) { + public CompressionStash addRead(GATKSAMRecord read) { addToHeader(windowHeader, read); // update the window header counts readsInWindow.add(read); // add read to sliding reads return slideWindow(read.getUnclippedStart()); @@ -152,8 +154,9 @@ public class SlidingWindow { * @param variantSite boolean array with true marking variant regions * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. */ - private SimpleGenomeLoc getNextVariantRegion(int from, int to, boolean[] variantSite) { + private SimpleGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean forceClose) { boolean foundStart = false; + final int windowHeaderStart = getStartLocation(windowHeader); int variantRegionStartIndex = 0; for (int i=from; i slideWindow(final int incomingReadUnclippedStart) { - List finalizedReads = new LinkedList(); - + protected CompressionStash slideWindow(final int incomingReadUnclippedStart) { final int windowHeaderStartLocation = getStartLocation(windowHeader); + CompressionStash regions = emptyRegions; + boolean forceClose = true; if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) { markSites(incomingReadUnclippedStart); int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation; int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) - CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, markedSites.getVariantSiteBitSet()); - finalizedReads = closeVariantRegions(regions, false); - - while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { - readsInWindow.pollFirst(); - } + regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); } - return finalizedReads; + // todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions + while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { + readsInWindow.pollFirst(); + } + + return regions; } @@ -623,31 +629,27 @@ public class SlidingWindow { result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); result.addAll(finalizeAndAdd(ConsensusType.BOTH)); - return result; // finalized reads will be downsampled if necessary + return result; // finalized reads will be downsampled if necessary } - - private List closeVariantRegions(CompressionStash regions, boolean forceClose) { - List allReads = new LinkedList(); + public Set closeVariantRegions(CompressionStash regions) { + TreeSet allReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); if (!regions.isEmpty()) { int lastStop = -1; + int windowHeaderStart = getStartLocation(windowHeader); + for (SimpleGenomeLoc region : regions) { - int start = region.getStart(); - int stop = region.getStop(); + if (region.isFinished() && region.getContig() == contig && region.getStart() >= windowHeaderStart && region.getStop() <= windowHeaderStart + windowHeader.size()) { + int start = region.getStart() - windowHeaderStart; + int stop = region.getStop() - windowHeaderStart; - if (!region.isFinished()) { - if(forceClose) // region is unfinished but we're forcing the close of this window - stop = windowHeader.size() - 1; - else - continue; // region is unfinished and we're not forcing the close of this window + allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); // todo -- add condition here dependent on dbSNP track + lastStop = stop; } - - allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); - lastStop = stop; } - for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) - windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here! + for (int i = 0; i <= lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion) + windowHeader.remove(); } return allReads; } @@ -681,23 +683,24 @@ public class SlidingWindow { * * @return All reads generated */ - public List close() { + public Pair, CompressionStash> close() { // mark variant regions - List finalizedReads = new LinkedList(); + Set finalizedReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); + CompressionStash regions = new CompressionStash(); + boolean forceCloseUnfinishedRegions = true; if (!windowHeader.isEmpty()) { markSites(getStopLocation(windowHeader) + 1); - CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), markedSites.getVariantSiteBitSet()); - finalizedReads = closeVariantRegions(regions, true); + regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions); + finalizedReads = closeVariantRegions(regions); if (!windowHeader.isEmpty()) { finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false)); finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up } - } - return finalizedReads; + return new Pair, CompressionStash>(finalizedReads, regions); } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 50500536f..1e539dc9d 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -26,23 +26,23 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "46ea88e32bae3072f5cd68a0db4b55f1"); + RRTest("testDefaultCompression ", L, "98080d3c53f441564796fc143cf510da"); } @Test(enabled = true) public void testMultipleIntervals() { String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - RRTest("testMultipleIntervals ", intervals, "c3784a0b42f5456b705f9b152a4b697a"); + RRTest("testMultipleIntervals ", intervals, "c5dcdf4edf368b5b897d66f76034d9f0"); } @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e385eb0ae5768f8507671d5303a212d5"); + RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "27cb99e87eda5e46187e56f50dd37f26"); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "6b5546be9363e493b9838542f5dc8cae"); + RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "4e7f111688d49973c35669855b7a2eaf"); } @Test(enabled = true) @@ -83,7 +83,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList(""))); + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("5c30fde961a1357bf72c15144c01981b"))); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java index 45e105751..51d8aad63 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SimpleGenomeLoc.java @@ -1,6 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.SortedSet; /** * GenomeLocs are very useful objects to keep track of genomic locations and perform set operations @@ -27,4 +31,43 @@ public class SimpleGenomeLoc extends GenomeLoc { public boolean isFinished() { return finished; } + + @Requires("a != null && b != null") + public static SimpleGenomeLoc merge(SimpleGenomeLoc a, SimpleGenomeLoc b) throws ReviewedStingException { + if(GenomeLoc.isUnmapped(a) || GenomeLoc.isUnmapped(b)) { + throw new ReviewedStingException("Tried to merge unmapped genome locs"); + } + + if (!(a.contiguousP(b))) { + throw new ReviewedStingException("The two genome locs need to be contiguous"); + } + + + return new SimpleGenomeLoc(a.getContig(), a.contigIndex, + Math.min(a.getStart(), b.getStart()), + Math.max(a.getStop(), b.getStop()), + a.isFinished()); + } + + /** + * Merges a list of *sorted* *contiguous* locs into one + * + * @param sortedLocs a sorted list of contiguous locs + * @return one merged loc + */ + public static SimpleGenomeLoc merge(SortedSet sortedLocs) { + SimpleGenomeLoc previousLoc = null; + for (SimpleGenomeLoc loc : sortedLocs) { + if (loc.isUnmapped()) { + throw new ReviewedStingException("Tried to merge unmapped genome locs"); + } + if (previousLoc != null && !previousLoc.contiguousP(loc)) { + throw new ReviewedStingException("The genome locs need to be contiguous"); + } + previousLoc = loc; + } + SimpleGenomeLoc firstLoc = sortedLocs.first(); + SimpleGenomeLoc lastLoc = sortedLocs.last(); + return merge(firstLoc, lastLoc); + } } From dba31018f46052984ab3b779d18950e5d1aee501 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 13 Nov 2012 01:18:37 -0500 Subject: [PATCH 40/90] Implementation of BySampleSAMFileWriter ReduceReads now works with the n-way-out capability, splitting by sample. DEV-27 #resolve #time 3m --- .../compression/reducereads/ReduceReads.java | 37 +- .../utils/sam/BySampleSAMFileWriter.java | 70 ++++ .../sting/utils/sam/NWaySAMFileWriter.java | 374 +++++++++--------- 3 files changed, 290 insertions(+), 191 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index a05992cb4..3712e4524 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -25,6 +25,9 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMProgramRecord; import net.sf.samtools.util.SequenceUtil; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; @@ -45,6 +48,7 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -86,7 +90,8 @@ import java.util.*; public class ReduceReads extends ReadWalker, ReduceReadsStash> { @Output - private StingSAMFileWriter out; + private StingSAMFileWriter out = null; + private SAMFileWriter writerToUse = null; /** * The number of bases to keep around mismatches (potential variation) @@ -196,6 +201,10 @@ public class ReduceReads extends ReadWalker, ReduceRea @Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false) private int nContigs = 2; + @Hidden + @Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false) + private boolean nwayout = false; + @Hidden @Argument(fullName = "", shortName = "dl", doc = "", required = false) private int debugLevel = 0; @@ -227,6 +236,7 @@ public class ReduceReads extends ReadWalker, ReduceRea SortedSet intervalList; private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag + private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam"; /** * Basic generic initialization of the readNameHash and the intervalList. Output initialization @@ -242,10 +252,24 @@ public class ReduceReads extends ReadWalker, ReduceRea if (toolkit.getIntervals() != null) intervalList.addAll(toolkit.getIntervals()); - if (!NO_PG_TAG) - Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME); - else + + // todo -- rework the whole NO_PG_TAG thing + final boolean preSorted = true; + final boolean indexOnTheFly = true; + final boolean generateMD5 = true; + final boolean keep_records = true; + final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; + if (nwayout) { + SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME); + writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, preSorted, indexOnTheFly, NO_PG_TAG, programRecord, true); + } + else { + writerToUse = out; out.setPresorted(false); + if (!NO_PG_TAG) { + Utils.setupWriter(out, toolkit, !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + } + } } /** @@ -386,6 +410,9 @@ public class ReduceReads extends ReadWalker, ReduceRea // output any remaining reads in the compressor for (GATKSAMRecord read : stash.close()) outputRead(read); + + if (nwayout) + writerToUse.close(); } /** @@ -554,7 +581,7 @@ public class ReduceReads extends ReadWalker, ReduceRea if (!DONT_COMPRESS_READ_NAMES) compressReadName(read); - out.addAlignment(read); + writerToUse.addAlignment(read); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java new file mode 100644 index 000000000..6bad58d9f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/BySampleSAMFileWriter.java @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.HashMap; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: Nov 13 + */ +public class BySampleSAMFileWriter extends NWaySAMFileWriter{ + + private final Map sampleToWriterMap; + + public BySampleSAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + super(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, pRecord, keep_records); + + sampleToWriterMap = new HashMap(toolkit.getSAMFileHeader().getReadGroups().size() * 2); + + for (SAMReaderID readerID : toolkit.getReadsDataSource().getReaderIDs()) { + for (SAMReadGroupRecord rg : toolkit.getReadsDataSource().getHeader(readerID).getReadGroups()) { + String sample = rg.getSample(); + if (sampleToWriterMap.containsKey(sample) && sampleToWriterMap.get(sample) != readerID) { + throw new ReviewedStingException("The same sample appears in multiple files, this input cannot be multiplexed using the BySampleSAMFileWriter, try NWaySAMFileWriter instead."); + } + else { + sampleToWriterMap.put(sample, readerID); + } + } + } + } + + @Override + public void addAlignment(SAMRecord samRecord) { + super.addAlignment(samRecord, sampleToWriterMap.get(samRecord.getReadGroup().getSample())); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index fa07523f3..83d1c99bf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -1,186 +1,188 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.sam; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.TextFormattingUtils; - -import java.io.File; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: May 31, 2011 - * Time: 3:52:49 PM - * To change this template use File | Settings | File Templates. - */ -public class NWaySAMFileWriter implements SAMFileWriter { - - private Map writerMap = null; - private boolean presorted ; - GenomeAnalysisEngine toolkit; - boolean KEEP_ALL_PG_RECORDS = false; - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { - this.presorted = presorted; - this.toolkit = toolkit; - this.KEEP_ALL_PG_RECORDS = keep_records; - writerMap = new HashMap(); - setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { - this.presorted = presorted; - this.toolkit = toolkit; - this.KEEP_ALL_PG_RECORDS = keep_records; - writerMap = new HashMap(); - setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5) { - this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly , boolean generateMD5) { - this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false); - } - - /** - * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved - * from toolkit). The in2out map must contain an entry for each input filename and map it - * onto a unique output file name. - * @param toolkit - * @param in2out - */ - public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { - if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL"); - for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { - - String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); - - String outName; - if ( ! in2out.containsKey(fName) ) - throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName); - outName = in2out.get(fName); - - if ( writerMap.containsKey( rid ) ) - throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ - "map file likely contains multiple entries for this input file"); - - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - } - - /** - * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved - * from toolkit). The output file names will be generated automatically by stripping ".sam" or ".bam" off the - * input file name and adding ext instead (e.g. ".cleaned.bam"). - * onto a unique output file name. - * @param toolkit - * @param ext - */ - public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { - for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { - - String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); - - String outName; - int pos ; - if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM"); - else { - if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM"); - else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam"); - } - String prefix = fName.substring(0,pos); - outName = prefix+ext; - - if ( writerMap.containsKey( rid ) ) - throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - } - - private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, - boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { - File f = new File(outName); - SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); - header.setSortOrder(order); - - if ( programRecord != null ) { - // --->> add program record - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) - newRecords.add(record); - } - newRecords.add(programRecord); - header.setProgramRecords(newRecords); - // <-- add program record ends here - } - SAMFileWriterFactory factory = new SAMFileWriterFactory(); - factory.setCreateIndex(indexOnTheFly); - factory.setCreateMd5File(generateMD5); - SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); - writerMap.put(id,sw); - } - - public Collection getWriters() { - return writerMap.values(); - } - - public void addAlignment(SAMRecord samRecord) { - final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); - String rg = samRecord.getStringAttribute("RG"); - if ( rg != null ) { - String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); - samRecord.setAttribute("RG",rg_orig); - } - writerMap.get(id).addAlignment(samRecord); - } - - public SAMFileHeader getFileHeader() { - return toolkit.getSAMFileHeader(); - } - - public void close() { - for ( SAMFileWriter w : writerMap.values() ) w.close(); - } -} +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: May 31, 2011 + * Time: 3:52:49 PM + * To change this template use File | Settings | File Templates. + */ +public class NWaySAMFileWriter implements SAMFileWriter { + + private Map writerMap = null; + private boolean presorted ; + GenomeAnalysisEngine toolkit; + boolean KEEP_ALL_PG_RECORDS = false; + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5) { + this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The in2out map must contain an entry for each input filename and map it + * onto a unique output file name. + * @param toolkit + * @param in2out + */ + public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL"); + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + if ( ! in2out.containsKey(fName) ) + throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName); + outName = in2out.get(fName); + + if ( writerMap.containsKey( rid ) ) + throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ + "map file likely contains multiple entries for this input file"); + + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The output file names will be generated automatically by stripping ".sam" or ".bam" off the + * input file name and adding ext instead (e.g. ".cleaned.bam"). + * onto a unique output file name. + * @param toolkit + * @param ext + */ + public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + int pos ; + if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM"); + else { + if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM"); + else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam"); + } + String prefix = fName.substring(0,pos); + outName = prefix+ext; + + if ( writerMap.containsKey( rid ) ) + throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, + boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { + File f = new File(outName); + SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); + header.setSortOrder(order); + + if ( programRecord != null ) { + // --->> add program record + List oldRecords = header.getProgramRecords(); + List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) { + if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + newRecords.add(record); + } + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + // <-- add program record ends here + } + SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(indexOnTheFly); + factory.setCreateMd5File(generateMD5); + SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); + writerMap.put(id,sw); + } + + public Collection getWriters() { + return writerMap.values(); + } + + public void addAlignment(SAMRecord samRecord) { + final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); + String rg = samRecord.getStringAttribute("RG"); + if ( rg != null ) { + String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); + samRecord.setAttribute("RG",rg_orig); + } + addAlignment(samRecord, id); + } + + public void addAlignment(SAMRecord samRecord, SAMReaderID readerID) { + writerMap.get(readerID).addAlignment(samRecord); + } + + public SAMFileHeader getFileHeader() { + return toolkit.getSAMFileHeader(); + } + + public void close() { + for ( SAMFileWriter w : writerMap.values() ) w.close(); + } +} From a079d8d0d106660ca5bdd4d2c93d01f4fef141a9 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 13 Nov 2012 15:21:57 -0500 Subject: [PATCH 41/90] Breaking the utility to write @PG tags for SAMFileWriters and StingSAMFileWriters --- .../compression/reducereads/ReduceReads.java | 3 +- .../org/broadinstitute/sting/utils/Utils.java | 59 +++++++++++++++++-- .../sting/utils/sam/NWaySAMFileWriter.java | 21 ++----- 3 files changed, 59 insertions(+), 24 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 3712e4524..3cdf3d75e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -256,7 +256,6 @@ public class ReduceReads extends ReadWalker, ReduceRea // todo -- rework the whole NO_PG_TAG thing final boolean preSorted = true; final boolean indexOnTheFly = true; - final boolean generateMD5 = true; final boolean keep_records = true; final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; if (nwayout) { @@ -267,7 +266,7 @@ public class ReduceReads extends ReadWalker, ReduceRea writerToUse = out; out.setPresorted(false); if (!NO_PG_TAG) { - Utils.setupWriter(out, toolkit, !preSorted, keep_records, this, PROGRAM_RECORD_NAME); + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index f4a200af0..b780d0966 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -687,23 +687,69 @@ public class Utils { array[i] = value; } - public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { - final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); - - SAMFileHeader header = toolkit.getSAMFileHeader(); + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param toolkit the engine + * @param originalHeader original header + * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file + * @param programRecord the program record for this program + */ + public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, SAMProgramRecord programRecord) { + SAMFileHeader header = originalHeader.clone(); List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS ) + if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) newRecords.add(record); newRecords.add(programRecord); header.setProgramRecords(newRecords); + return header; + } + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and returns + * the new header to be added to the BAM writer. + * + * @param toolkit the engine + * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a pre-filled header for the bam writer + */ + public static SAMFileHeader setupWriter(GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); + return setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, programRecord); + } + + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param writer BAM file writer + * @param toolkit the engine + * @param preSorted whether or not the writer can assume reads are going to be added are already sorted + * @param KEEP_ALL_PG_RECORDS whether or not to keep all the other program records already existing in this BAM file + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + */ + public static void setupWriter(StingSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, boolean KEEP_ALL_PG_RECORDS, Object walker, String PROGRAM_RECORD_NAME) { + SAMFileHeader header = setupWriter(toolkit, originalHeader, KEEP_ALL_PG_RECORDS, walker, PROGRAM_RECORD_NAME); writer.writeHeader(header); writer.setPresorted(preSorted); } - + + + /** + * Creates a program record (@PG) tag + * + * @param toolkit the engine + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a program record for the tool + */ public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) { final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); @@ -858,4 +904,5 @@ public class Utils { } return subLists; } + } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index 83d1c99bf..cdf70884c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -28,11 +28,14 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; -import java.util.*; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; /** * Created by IntelliJ IDEA. @@ -138,21 +141,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { File f = new File(outName); - SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); - header.setSortOrder(order); - - if ( programRecord != null ) { - // --->> add program record - List oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) - newRecords.add(record); - } - newRecords.add(programRecord); - header.setProgramRecords(newRecords); - // <-- add program record ends here - } + SAMFileHeader header = Utils.setupWriter(toolkit, toolkit.getSAMFileHeader(id), KEEP_ALL_PG_RECORDS, programRecord); SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(indexOnTheFly); factory.setCreateMd5File(generateMD5); From 843384e43539eb9d41f5449cb2408fe3dd52cbb9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Nov 2012 11:47:09 -0500 Subject: [PATCH 42/90] Rename hg19 files in bundle to b37 since that's what they are --- .../sting/queue/qscripts/GATKResourcesBundle.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 24ab50451..dc6cae197 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -147,13 +147,13 @@ class GATKResourcesBundle extends QScript { // // example call set for wiki tutorial // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf", + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf", "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) // // Test BAM file, specific to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam", + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.b37.20.bam", "IGNORE", b37, false, false)) // From 8b749673bce448d1b92cae97649f161d0d04eef0 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Nov 2012 13:59:34 -0500 Subject: [PATCH 43/90] centralize header element removal in reduce reads --- .../compression/reducereads/ReduceReads.java | 1 - .../compression/reducereads/SlidingWindow.java | 16 +++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 3cdf3d75e..629a27c48 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -253,7 +253,6 @@ public class ReduceReads extends ReadWalker, ReduceRea intervalList.addAll(toolkit.getIntervals()); - // todo -- rework the whole NO_PG_TAG thing final boolean preSorted = true; final boolean indexOnTheFly = true; final boolean keep_records = true; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 24a3ba3cb..fff1c20a5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -220,7 +220,6 @@ public class SlidingWindow { regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); } - // todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { readsInWindow.pollFirst(); } @@ -607,9 +606,7 @@ public class SlidingWindow { toRemove.add(read); } } - for (GATKSAMRecord read : toRemove) { - readsInWindow.remove(read); - } + removeReadsFromWindow(toRemove); } return allReads; } @@ -805,9 +802,8 @@ public class SlidingWindow { hetReads.add(finalizeRunningConsensus()); } - for (GATKSAMRecord read : toRemove) { - readsInWindow.remove(read); - } + removeReadsFromWindow(toRemove); + return hetReads; } @@ -924,5 +920,11 @@ public class SlidingWindow { } } } + + private void removeReadsFromWindow (List readsToRemove) { + for (GATKSAMRecord read : readsToRemove) { + readsInWindow.remove(read); + } + } } From 89bbe73a43c6e6ebb30ee564f7b04a432139f716 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 14 Nov 2012 14:39:04 -0500 Subject: [PATCH 44/90] Commenting out CMI pipeline test that wasn't meant to be in GATK repository (why was this merged??) --- .../gatk/walkers/annotator/VariantAnnotatorEngine.java | 6 +++++- .../gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index ee4f77752..725097ddc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -277,8 +277,12 @@ public class VariantAnnotatorEngine { if ( expression.fieldName.equals("ID") ) { if ( vc.hasID() ) infoAnnotations.put(expression.fullName, vc.getID()); + } else if (expression.fieldName.equals("ALT")) { + infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString()); + } else if ( vc.hasAttribute(expression.fieldName) ) { - infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 879a46ab0..22ed95365 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,8 +190,8 @@ public class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); - // else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) - // results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) + results.add(generateEmptyContext(tracker, refContext, null, rawContext)); } } From a68e6810c90711d88d2a829d27818034f721eb12 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 14 Nov 2012 14:45:15 -0500 Subject: [PATCH 45/90] Back off experimental code that escaped last commit, not for general use yet --- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 22ed95365..80bc04845 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,8 +190,9 @@ public class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); - else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) - results.add(generateEmptyContext(tracker, refContext, null, rawContext)); +// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set +// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) +// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); } } From b70fd4a2426a21f375776c1de54540a562539423 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 14 Nov 2012 11:08:48 -0500 Subject: [PATCH 46/90] Initial testing of the Active Region Traversal contract - TODO: many more tests and test cases --- .../traversals/TraverseActiveRegions.java | 5 +- .../utils/activeregion/ActivityProfile.java | 5 + .../traversals/TraverseActiveRegionsTest.java | 126 ++++++++++++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index a2c37944a..4fe83f331 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -34,6 +34,9 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); + // package access for unit testing + ActivityProfile profile; + @Override public String getTraversalUnits() { return "active regions"; @@ -53,7 +56,7 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index e96eb843d..38cfbb38d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -103,6 +103,11 @@ public class ActivityProfile { isActiveList.add(result); } + // for unit testing + public List getActiveList() { + return isActiveList; + } + public int size() { return isActiveList.size(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java new file mode 100644 index 000000000..8740a8b68 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -0,0 +1,126 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.testng.Assert; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.executive.WindowMaker; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/13/12 + * Time: 2:47 PM + */ +public class TraverseActiveRegionsTest extends BaseTest { + + private class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + + public DummyActiveRegionWalker() { + this.prob = 1.0; + } + + @Override + public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return new ActivityProfileResult(ref.getLocus(), prob); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + private final TraverseActiveRegions t = new TraverseActiveRegions(); + + private IndexedFastaSequenceFile reference; + private GenomeLocParser genomeLocParser; + private ActiveRegionWalker walker; + + @BeforeClass + private void init() throws FileNotFoundException { + reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); + SAMSequenceDictionary dictionary = reference.getSequenceDictionary(); + genomeLocParser = new GenomeLocParser(dictionary); + } + + @Test + public void testAllIntervalsSeen() throws Exception { + List intervals = new ArrayList(); + GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1); + intervals.add(interval); + + LocusShardDataProvider dataProvider = createDataProvider(intervals); + + t.traverse(walker, dataProvider, 0); + + boolean allGenomeLocsSeen = true; + for (GenomeLoc loc : intervals) { + boolean thisGenomeLocSeen = false; + for (ActivityProfileResult active : t.profile.getActiveList()) { + if (loc.equals(active.getLoc())) { + thisGenomeLocSeen = true; + break; + } + } + if (!thisGenomeLocSeen) { + allGenomeLocsSeen = false; + break; + } + } + + Assert.assertTrue(allGenomeLocsSeen, "Some intervals missing from activity profile"); + } + + private LocusShardDataProvider createDataProvider(List intervals) { + walker = new DummyActiveRegionWalker(); + + StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList()); + Shard shard = new MockLocusShard(genomeLocParser, intervals); + WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + //engine.setReferenceDataSource(reference); + engine.setGenomeLocParser(genomeLocParser); + t.initialize(engine); + + return new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList()); + } +} From 855a68ae39c1f31623f9561af4f9e2b4e85a14ab Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 08:06:58 -0500 Subject: [PATCH 47/90] Testing out the new github-hosted repos. Please ignore. --- testfile | 1 + 1 file changed, 1 insertion(+) create mode 100644 testfile diff --git a/testfile b/testfile new file mode 100644 index 000000000..6de7b8c69 --- /dev/null +++ b/testfile @@ -0,0 +1 @@ +This is a test file. From 2a16d6fa55140ff6835c57a737586fbb18d0452b Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 08:07:19 -0500 Subject: [PATCH 48/90] Revert "Testing out the new github-hosted repos. Please ignore." This reverts commit b6bf66cd088754e7fd3d5f105ca8b2551237f183. --- testfile | 1 - 1 file changed, 1 deletion(-) delete mode 100644 testfile diff --git a/testfile b/testfile deleted file mode 100644 index 6de7b8c69..000000000 --- a/testfile +++ /dev/null @@ -1 +0,0 @@ -This is a test file. From 78ce822b6f56fc5c6cc43be77f0faa47fbeabba6 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Nov 2012 09:07:04 -0500 Subject: [PATCH 49/90] Protect against NPE when using non-GATK reports for inputs expecting valid GATK reports --- .../broadinstitute/sting/gatk/report/GATKReportVersion.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java index b51fb17f0..1079d9b91 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java @@ -80,6 +80,9 @@ public enum GATKReportVersion { * @return The version as an enum. */ public static GATKReportVersion fromHeader(String header) { + if ( header == null ) + throw new UserException.BadInput("The GATK report has no version specified in the header"); + if (header.startsWith("##:GATKReport.v0.1 ")) return GATKReportVersion.V0_1; From ff180a8e02eaeffbc42226c789c6c6946affae68 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Nov 2012 09:09:57 -0500 Subject: [PATCH 50/90] Significant refactoring of the Haplotype Caller to handle problems with GGA. The main fix is that we now maintain a mapping from 'original' allele to 'Smith-Waterman-based' allele so that we no longer need to do a (buggy) matching throughout the calling process. --- .../haplotypecaller/GenotypingEngine.java | 253 ++++++------------ .../haplotypecaller/HaplotypeCaller.java | 6 +- .../LikelihoodCalculationEngine.java | 78 +++--- .../LikelihoodCalculationEngineUnitTest.java | 5 +- .../broadinstitute/sting/utils/Haplotype.java | 16 +- 5 files changed, 144 insertions(+), 214 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index d91df82e2..9fc636efe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -31,7 +31,6 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -52,153 +51,17 @@ public class GenotypingEngine { noCall.add(Allele.NO_CALL); } - // WARN - // This function is the streamlined approach, currently not being used by default - // WARN - // WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code. - // WARN - @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser ) { - // Prepare the list of haplotype indices to genotype - final ArrayList allelesToGenotype = new ArrayList(); - - for( final Haplotype h : haplotypes ) { - allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) ); - } - final int numHaplotypes = haplotypes.size(); - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample); - int glIndex = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make()); - } - final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); - if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here - - // Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF - final ArrayList haplotypesToRemove = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF - haplotypesToRemove.add(h); - } - } - haplotypes.removeAll(haplotypesToRemove); - - if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { - final List>>> returnVCs = new ArrayList>>>(); - // set up the default 1-to-1 haplotype mapping object - final HashMap> haplotypeMapping = new HashMap>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.put(call.getAllele(h.getBases()), list); - } - returnVCs.add( new Pair>>(call,haplotypeMapping) ); - return returnVCs; - } - - final ArrayList>>> returnCalls = new ArrayList>>>(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); - int count = 0; - if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } - for( final Haplotype h : haplotypes ) { - if( DEBUG ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); - } - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - startPosKeySet.addAll(h.getEventMap().keySet()); - } - - // Create the VC merge priority list - final ArrayList priorityList = new ArrayList(); - for( int iii = 0; iii < haplotypes.size(); iii++ ) { - priorityList.add("HC" + iii); - } - - // Walk along each position in the key set and create each event to be outputted - for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { - final ArrayList eventsAtThisLoc = new ArrayList(); - for( final Haplotype h : haplotypes ) { - final HashMap eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - } - } - - // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); - - // Merge the event to find a common reference representation - final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); - - final HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper - } - - if( DEBUG ) { - System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - //System.out.println("Event/haplotype allele mapping = " + alleleMapper); - } - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final int myNumHaplotypes = alleleMapper.size(); - final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); - int glIndex = 0; - for( int iii = 0; iii < myNumHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - - // using the allele mapping object translate the haplotype allele into the event allele - final Genotype g = new GenotypeBuilder(sample) - .alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes)) - .phased(loc != startPosKeySet.first()) - .PL(genotypeLikelihoods).make(); - myGenotypes.add(g); - } - returnCalls.add( new Pair>>( - new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) ); - } - } - return returnCalls; - } - // BUGBUG: Create a class to hold this complicated return type @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final ArrayList activeAllelesToGenotype ) { + public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { - final ArrayList>>> returnCalls = new ArrayList>>>(); + final ArrayList>>> returnCalls = new ArrayList>>>(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); @@ -261,7 +124,15 @@ public class GenotypingEngine { if( eventsAtThisLoc.isEmpty() ) { continue; } // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + Map> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + final ArrayList alleleOrdering = new ArrayList(alleleMapper.size()); + alleleOrdering.add(refAllele); + for ( final Allele allele : alleleMapper.keySet() ) { + if ( !refAllele.equals(allele) ) + alleleOrdering.add(allele); + } // Sanity check the priority list for( final VariantContext vc : eventsAtThisLoc ) { @@ -283,12 +154,6 @@ public class GenotypingEngine { final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } - HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper - } - if( DEBUG ) { System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); //System.out.println("Event/haplotype allele mapping = " + alleleMapper); @@ -299,7 +164,7 @@ public class GenotypingEngine { for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples final int numHaplotypes = alleleMapper.size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { @@ -313,23 +178,23 @@ public class GenotypingEngine { if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call); // also, need to update the allele -> haplotype mapping - final HashMap> alleleHashMapTrim = new HashMap>(); + final HashMap> alleleHashMapTrim = new HashMap>(); for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC - alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii))); + alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii))); } call = vcCallTrim; - alleleHashMap = alleleHashMapTrim; + alleleMapper = alleleHashMapTrim; } - returnCalls.add( new Pair>>(call, alleleHashMap) ); + returnCalls.add( new Pair>>(call, alleleMapper) ); } } } return returnCalls; } - protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes ) { + protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final ArrayList haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { for( final VariantContext vc : h.getEventMap().values() ) { @@ -348,7 +213,7 @@ public class GenotypingEngine { haplotypes.removeAll(haplotypesToRemove); } - protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { + protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { final int MAX_SIZE_TO_COMBINE = 15; final double MERGE_EVENTS_R2_THRESHOLD = 0.95; if( startPosKeySet.size() <= 1 ) { return; } @@ -395,7 +260,9 @@ public class GenotypingEngine { final ArrayList haplotypeList = new ArrayList(); haplotypeList.add(h); for( final String sample : haplotypes.get(0).getSampleKeySet() ) { - final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0]; + final HashSet sampleSet = new HashSet(1); + sampleSet.add(sample); + final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0]; if( thisHapVC == null ) { if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } @@ -489,37 +356,71 @@ public class GenotypingEngine { @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static ArrayList> createAlleleMapper( final int loc, final ArrayList eventsAtThisLoc, final ArrayList haplotypes ) { - final ArrayList> alleleMapper = new ArrayList>(); - final ArrayList refList = new ArrayList(); + protected static Map> createAlleleMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + + final Map> alleleMapper = new HashMap>(eventsAtThisLoc.size()+1); for( final Haplotype h : haplotypes ) { if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - refList.add(h); + if ( !alleleMapper.containsKey(refAllele) ) + alleleMapper.put(refAllele, new ArrayList()); + alleleMapper.get(refAllele).add(h); + } else if ( h.isArtificialHaplotype() ) { + if ( !alleleMapper.containsKey(h.getArtificialAllele()) ) + alleleMapper.put(h.getArtificialAllele(), new ArrayList()); + alleleMapper.get(h.getArtificialAllele()).add(h); } else { - boolean foundInEventList = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - foundInEventList = true; + final Allele altAllele = vcAtThisLoc.getAlternateAllele(0); + if ( !alleleMapper.containsKey(altAllele) ) + alleleMapper.put(altAllele, new ArrayList()); + alleleMapper.get(altAllele).add(h); + break; } } - if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype - refList.add(h); - } } } - alleleMapper.add(refList); - for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { - final ArrayList list = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - list.add(h); + + for( final Haplotype h : haplotypes ) { + if ( h.getEventMap().get(loc) == null || h.isArtificialHaplotype() ) + continue; + + Allele matchingAllele = null; + for ( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { + if ( alleleToTest.getKey().equals(refAllele) ) + continue; + + final Haplotype artificialHaplotype = alleleToTest.getValue().get(0); + if ( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap()) ) { + matchingAllele = alleleToTest.getKey(); + break; } } - alleleMapper.add(list); + + if ( matchingAllele == null ) + matchingAllele = refAllele; + alleleMapper.get(matchingAllele).add(h); } + return alleleMapper; } + protected static boolean isSubSetOf(final Map subset, final Map superset) { + + for ( final Map.Entry fromSubset : subset.entrySet() ) { + final VariantContext fromSuperset = superset.get(fromSubset.getKey()); + if ( fromSuperset == null ) + return false; + + if ( !fromSuperset.hasAlternateAllele(fromSubset.getValue().getAlternateAllele(0)) ) + return false; + } + + return true; + } + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) { if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index a185ba6af..2b739a321 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -421,10 +421,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes ); - for( final Pair>> callResult : - ( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() ) - : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { + for( final Pair>> callResult : + genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a0924623b..543987e74 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -148,34 +148,21 @@ public class LikelihoodCalculationEngine { return Math.min(b1.length, b2.length); } - @Requires({"haplotypes.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList haplotypes, final String sample ) { - // set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization? - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping ); - } - // This function takes just a single sample and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map> haplotypeMapping, final List alleleOrdering ) { final TreeSet sampleSet = new TreeSet(); sampleSet.add(sample); - return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping); + return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering); } // This function takes a set of samples to pool over and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final Map> haplotypeMapping, final List alleleOrdering ) { - final int numHaplotypes = haplotypeMapping.size(); + final int numHaplotypes = alleleOrdering.size(); final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; for( int iii = 0; iii < numHaplotypes; iii++ ) { Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); @@ -184,9 +171,9 @@ public class LikelihoodCalculationEngine { // compute the diploid haplotype likelihoods // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) { - for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { + for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) { double haplotypeLikelihood = 0.0; for( final String sample : samples ) { final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); @@ -200,12 +187,48 @@ public class LikelihoodCalculationEngine { } haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); } - } + } } } // normalize the diploid likelihoods matrix - return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + } + + // This function takes a set of samples to pool over and a haplotypeMapping + @Requires({"haplotypeMapping.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final List haplotypeList ) { + + final int numHaplotypes = haplotypeList.size(); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); + } + + // compute the diploid haplotype likelihoods + // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code + for( int iii = 0; iii < numHaplotypes; iii++ ) { + final Haplotype iii_haplotype = haplotypeList.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Haplotype jjj_haplotype = haplotypeList.get(jjj); + double haplotypeLikelihood = 0.0; + for( final String sample : samples ) { + final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample); + final int[] readCounts_iii = iii_haplotype.getReadCounts(sample); + final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample); + for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. + haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); + } + } + haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); + } + } + + // normalize the diploid likelihoods matrix + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); } @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) @@ -296,14 +319,7 @@ public class LikelihoodCalculationEngine { final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - // set up the default 1-to-1 haplotype mapping object - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypeMapping ); // all samples pooled together + final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together int hap1 = 0; int hap2 = 0; @@ -347,7 +363,7 @@ public class LikelihoodCalculationEngine { public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, - final Pair>> call, + final Pair>> call, final double downsamplingFraction, final PrintStream downsamplingLog ) { final Map returnMap = new HashMap(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java index e82946690..19ced9f42 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -102,7 +101,9 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { haplotypes.add(haplotype); } } - return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample"); + final HashSet sampleSet = new HashSet(1); + sampleSet.add("myTestSample"); + return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index b30d47074..6de15e18b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -49,6 +49,7 @@ public class Haplotype { private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; + private Allele artificialAllele = null; /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual @@ -71,6 +72,11 @@ public class Haplotype { this(bases, 0); } + public Haplotype( final byte[] bases, final Allele artificialAllele ) { + this(bases, 0); + this.artificialAllele = artificialAllele; + } + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; @@ -171,6 +177,14 @@ public class Haplotype { this.cigar = cigar; } + public boolean isArtificialHaplotype() { + return artificialAllele != null; + } + + public Allele getArtificialAllele() { + return artificialAllele; + } + @Requires({"refInsertLocation >= 0"}) public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates @@ -182,7 +196,7 @@ public class Haplotype { newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases); + return new Haplotype(newHaplotypeBases, altAllele); } public static class HaplotypeBaseComparator implements Comparator, Serializable { From 9fc63efc307475f6b36ffad7e425b90bcab817cf Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 09:34:15 -0500 Subject: [PATCH 51/90] Second test of new repos. Please ignore. --- testfile | 1 + 1 file changed, 1 insertion(+) create mode 100644 testfile diff --git a/testfile b/testfile new file mode 100644 index 000000000..524acfffa --- /dev/null +++ b/testfile @@ -0,0 +1 @@ +Test file From e1a5c3ce7a97e8eb92c64cc144340c81d01c9903 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 19 Nov 2012 09:34:32 -0500 Subject: [PATCH 52/90] Revert "Second test of new repos. Please ignore." This reverts commit 077532d870ddf53ec514b98f14534ca7dbf55331. --- testfile | 1 - 1 file changed, 1 deletion(-) delete mode 100644 testfile diff --git a/testfile b/testfile deleted file mode 100644 index 524acfffa..000000000 --- a/testfile +++ /dev/null @@ -1 +0,0 @@ -Test file From f0b8a0228fef45f23478c1a12be1cd58633c0873 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 19 Nov 2012 09:57:55 -0500 Subject: [PATCH 54/90] Quick fix for HC refactoring: when copying over Haplotype objects, make sure to copy over the artificial allele used to create it too. --- .../gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java | 2 ++ public/java/src/org/broadinstitute/sting/utils/Haplotype.java | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index fd46e4e69..33fa49543 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -369,6 +369,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) ); + if ( haplotype.isArtificialHaplotype() ) + h.setArtificialAllele(haplotype.getArtificialAllele()); h.leftBreakPoint = leftBreakPoint; h.rightBreakPoint = rightBreakPoint; if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 6de15e18b..af4e31698 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -185,6 +185,10 @@ public class Haplotype { return artificialAllele; } + public void setArtificialAllele(final Allele artificialAllele) { + this.artificialAllele = artificialAllele; + } + @Requires({"refInsertLocation >= 0"}) public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates From 937ac7290f7caa7f0cae996608f5a68358f10c09 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 20 Nov 2012 16:13:29 -0500 Subject: [PATCH 58/90] Lots more GGA fixes for the HC now that I understand what's going on internally. Integration tests pass except for the GGA test which I believe now produces better results. --- .../haplotypecaller/GenotypingEngine.java | 82 ++++++++++++------- .../LikelihoodCalculationEngine.java | 4 +- .../SimpleDeBruijnAssembler.java | 22 +++-- .../HaplotypeCallerIntegrationTest.java | 3 +- .../broadinstitute/sting/utils/Haplotype.java | 17 ++-- .../sting/utils/HaplotypeUnitTest.java | 2 +- 6 files changed, 87 insertions(+), 43 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 9fc636efe..beec8a92e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -62,6 +62,7 @@ public class GenotypingEngine { final List activeAllelesToGenotype ) { final ArrayList>>> returnCalls = new ArrayList>>>(); + final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); @@ -70,7 +71,7 @@ public class GenotypingEngine { for( final Haplotype h : haplotypes ) { // Walk along the alignment and turn any difference from the reference into an event h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); } + if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); } if( DEBUG ) { System.out.println( h.toString() ); System.out.println( "> Cigar = " + h.getCigar() ); @@ -80,10 +81,10 @@ public class GenotypingEngine { } cleanUpSymbolicUnassembledEvents( haplotypes ); - if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure + if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); } - if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode! + if( in_GGA_mode ) { for( final VariantContext compVC : activeAllelesToGenotype ) { startPosKeySet.add( compVC.getStart() ); } @@ -95,7 +96,7 @@ public class GenotypingEngine { final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view - if( activeAllelesToGenotype.isEmpty() ) { + if( !in_GGA_mode ) { for( final Haplotype h : haplotypes ) { final HashMap eventMap = h.getEventMap(); final VariantContext vc = eventMap.get(loc); @@ -129,9 +130,8 @@ public class GenotypingEngine { final Allele refAllele = eventsAtThisLoc.get(0).getReference(); final ArrayList alleleOrdering = new ArrayList(alleleMapper.size()); alleleOrdering.add(refAllele); - for ( final Allele allele : alleleMapper.keySet() ) { - if ( !refAllele.equals(allele) ) - alleleOrdering.add(allele); + for( final VariantContext vc : eventsAtThisLoc ) { + alleleOrdering.add(vc.getAlternateAllele(0)); } // Sanity check the priority list @@ -154,6 +154,16 @@ public class GenotypingEngine { final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } + // let's update the Allele keys in the mapper because they can change after merging when there are complex events + Map> updatedAlleleMapper = new HashMap>(alleleMapper.size()); + for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) { + final Allele oldAllele = alleleOrdering.get(i); + final Allele newAllele = mergedVC.getAlleles().get(i); + updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele)); + alleleOrdering.set(i, newAllele); + } + alleleMapper = updatedAlleleMapper; + if( DEBUG ) { System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); //System.out.println("Event/haplotype allele mapping = " + alleleMapper); @@ -358,48 +368,48 @@ public class GenotypingEngine { @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) protected static Map> createAlleleMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { - final Allele refAllele = eventsAtThisLoc.get(0).getReference(); - final Map> alleleMapper = new HashMap>(eventsAtThisLoc.size()+1); + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + alleleMapper.put(refAllele, new ArrayList()); + for( final VariantContext vc : eventsAtThisLoc ) + alleleMapper.put(vc.getAlternateAllele(0), new ArrayList()); + + final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size()); for( final Haplotype h : haplotypes ) { if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - if ( !alleleMapper.containsKey(refAllele) ) - alleleMapper.put(refAllele, new ArrayList()); alleleMapper.get(refAllele).add(h); - } else if ( h.isArtificialHaplotype() ) { - if ( !alleleMapper.containsKey(h.getArtificialAllele()) ) - alleleMapper.put(h.getArtificialAllele(), new ArrayList()); + } else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { alleleMapper.get(h.getArtificialAllele()).add(h); } else { + boolean haplotypeIsDetermined = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - final Allele altAllele = vcAtThisLoc.getAlternateAllele(0); - if ( !alleleMapper.containsKey(altAllele) ) - alleleMapper.put(altAllele, new ArrayList()); - alleleMapper.get(altAllele).add(h); + alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h); + haplotypeIsDetermined = true; break; } } + + if( !haplotypeIsDetermined ) + undeterminedHaplotypes.add(h); } } - for( final Haplotype h : haplotypes ) { - if ( h.getEventMap().get(loc) == null || h.isArtificialHaplotype() ) - continue; - + for( final Haplotype h : undeterminedHaplotypes ) { Allele matchingAllele = null; - for ( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { - if ( alleleToTest.getKey().equals(refAllele) ) + for( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { + // don't test against the reference allele + if( alleleToTest.getKey().equals(refAllele) ) continue; final Haplotype artificialHaplotype = alleleToTest.getValue().get(0); - if ( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap()) ) { + if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) { matchingAllele = alleleToTest.getKey(); break; } } - if ( matchingAllele == null ) + if( matchingAllele == null ) matchingAllele = refAllele; alleleMapper.get(matchingAllele).add(h); } @@ -407,20 +417,36 @@ public class GenotypingEngine { return alleleMapper; } - protected static boolean isSubSetOf(final Map subset, final Map superset) { + protected static boolean isSubSetOf(final Map subset, final Map superset, final boolean resolveSupersetToSubset) { for ( final Map.Entry fromSubset : subset.entrySet() ) { final VariantContext fromSuperset = superset.get(fromSubset.getKey()); if ( fromSuperset == null ) return false; - if ( !fromSuperset.hasAlternateAllele(fromSubset.getValue().getAlternateAllele(0)) ) + List supersetAlleles = fromSuperset.getAlternateAlleles(); + if ( resolveSupersetToSubset ) + supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles); + + if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) ) return false; } return true; } + private static List resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List currentAlleles) { + if ( targetReference.length() <= actualReference.length() ) + return currentAlleles; + + final List newAlleles = new ArrayList(currentAlleles.size()); + final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length()); + for ( final Allele a : currentAlleles ) { + newAlleles.add(Allele.extend(a, extraBases)); + } + return newAlleles; + } + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) { if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 543987e74..304f8d5cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -196,8 +196,8 @@ public class LikelihoodCalculationEngine { } // This function takes a set of samples to pool over and a haplotypeMapping - @Requires({"haplotypeMapping.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) + @Requires({"haplotypeList.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"}) public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final List haplotypeList ) { final int numHaplotypes = haplotypeList.size(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index 33fa49543..4f072d720 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -278,9 +278,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength(); - for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype + // for GGA mode, add the desired allele into the haplotype + for( final VariantContext compVC : activeAllelesToGenotype ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()); + final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { return returnHaplotypes; //throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype); @@ -290,15 +291,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final DefaultDirectedGraph graph : graphs ) { for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { - if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present + + // for GGA mode, add the desired allele into the haplotype if it isn't already present + if( !activeAllelesToGenotype.isEmpty() ) { final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); - if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) { + + // This if statement used to additionally have: + // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)" + // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto + // a haplotype that already contains a 1bp insertion (so practically it is reference but + // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). + if( vcOnHaplotype == null ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); + addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); } } } @@ -370,7 +380,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) ); if ( haplotype.isArtificialHaplotype() ) - h.setArtificialAllele(haplotype.getArtificialAllele()); + h.setArtificialAllele(haplotype.getArtificialAllele(), haplotype.getArtificialAllelePosition()); h.leftBreakPoint = leftBreakPoint; h.rightBreakPoint = rightBreakPoint; if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6828dbcb5..a57462d1d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -29,9 +29,10 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f"); } + // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "39da622b309597d7a0b082c8aa1748c9"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "f2d0309fdf50d5827e9c60ed0dd07e3f"); } private void HCTestComplexVariants(String bam, String args, String md5) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index af4e31698..30fdce75d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -50,7 +50,8 @@ public class Haplotype { public int leftBreakPoint = 0; public int rightBreakPoint = 0; private Allele artificialAllele = null; - + private int artificialAllelePosition = -1; + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * @@ -72,9 +73,10 @@ public class Haplotype { this(bases, 0); } - public Haplotype( final byte[] bases, final Allele artificialAllele ) { + protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) { this(bases, 0); this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; } public Haplotype( final byte[] bases, final GenomeLoc loc ) { @@ -185,12 +187,17 @@ public class Haplotype { return artificialAllele; } - public void setArtificialAllele(final Allele artificialAllele) { + public int getArtificialAllelePosition() { + return artificialAllelePosition; + } + + public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) { this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; } @Requires({"refInsertLocation >= 0"}) - public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { + public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype @@ -200,7 +207,7 @@ public class Haplotype { newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases, altAllele); + return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation); } public static class HaplotypeBaseComparator implements Comparator, Serializable { diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java index ddffb6e4c..13db1d39e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java @@ -159,7 +159,7 @@ public class HaplotypeUnitTest extends BaseTest { final VariantContext vc = new VariantContextBuilder().alleles(alleles).loc("1", loc, loc + h1refAllele.getBases().length - 1).make(); h.setAlignmentStartHapwrtRef(0); h.setCigar(cigar); - final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc); + final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc, vc.getStart()); final Haplotype h1expected = new Haplotype(newHap.getBytes()); Assert.assertEquals(h1, h1expected); } From cc7680e6010ab184752c9f4c20b96e055178478c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 4 Nov 2012 14:40:17 -0800 Subject: [PATCH 59/90] NA12878 knowledge base backed by MongoDB -- Idea is simply to create a persistent database of all TP/FP sites on chr20 in NA12878. Individual callsets can be imported, and a consensus algorithm is run over all callsets in the database to create a consensus collection, which can be used to assess NA12878 callsets for GATK and methods development -- Framework for representing simple VariantContexts and Genotypes in MongoDB, querying for records, and iterating over them in the GATK -- Not hooked up to Tribble, but could be done reasonably easily now (future TODO) -- Tools to import callsets, create consensus callsets, import and export reviews -- Scripts to reset the knowledge base and repopulate it with the standard data files (Eric will expand) -- Actually scales to all of chr20, includes AssessNA12878 that reads a VCF and itemizes it against the truth data set -- ImportCallset can load OMNI, HM3, CEU best practices, mills/devine sites and genotypes, properly marking sites as poly/mono/unk as well as TP/FP/UNK based on command line parameters -- Added shell scripts that start up a local mongo db, that connect to a local or BI hosted mongo for NA12878.db for debugging, and a setupNA12878db script that can load OMNI, HM3, CEU best practices, Mills/Devine into the db and then update the consensus. -- Reviewed sites can be exported to a VCF, and imported again, as a mechanism to safely store the only non-recoverable data from the Mongo DB. -- Created a NA12878DBWalker that manages the outer DB interaction, and that all MongoDB interacting walkers inherit from. Added a NA12878DBArgumentCollection.java consolating all of the common command line arguments (though strictly not necessary as all of this occurs in the root walker) UnitTests -- Can connect to a test knowledge base for development and unit testing -- PolymorphicStatus, TruthStatus, SiteIterator -- NA12878KBUnitTestBase provides simple utilities for connecting to the test mongo db, getting calls, etc -- MongoVariantContext tests creation, matching, and encoding -> writing -> read -> decoding from the mongodb AssessNA12878 -- Generic tool for comparing a NA12878 callset against the knowledge base. See http://gatkforums.broadinstitute.org/discussion/1848/using-the-na12878-knowledge-base for detailed documentation -- Performs trivial filtering on FS, MQ, QD for SNPs and non-SNPs to separate out variants likely to be filtered from those that are honest-to-goodness FPs Misc -- Ability to provide Description for Simplified GATK report --- .../sting/gatk/report/GATKReport.java | 13 ++++++- .../broadinstitute/sting/utils/GenomeLoc.java | 14 ++++++++ .../org/broadinstitute/sting/utils/Utils.java | 4 +++ .../sting/utils/codecs/vcf/VCFUtils.java | 34 +++++++++++++++++++ 4 files changed, 64 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 47bc48f81..6685ee12a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -271,7 +271,18 @@ public class GATKReport { * @return a simplified GATK report */ public static GATKReport newSimpleReport(final String tableName, final String... columns) { - GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report", columns.length); + return newSimpleReportWithDescription(tableName, "A simplified GATK table report", columns); + } + + /** + * @see #newSimpleReport(String, String...) but with a customized description + * @param tableName + * @param desc + * @param columns + * @return + */ + public static GATKReport newSimpleReportWithDescription(final String tableName, final String desc, final String... columns) { + GATKReportTable table = new GATKReportTable(tableName, desc, columns.length); for (String column : columns) { table.addColumn(column, ""); diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 6df9c9f1d..4d2c26a79 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -315,6 +315,20 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() )); } + /** + * Tests whether this genome loc starts at the same position as that. + * + * i.e., do this and that have the same contig and the same start position + * + * @param that genome loc to compare to + * @return true if this and that have the same contig and the same start position + */ + @Requires("that != null") + public final boolean startsAt( GenomeLoc that ) { + int comparison = this.compareContigs(that); + return comparison == 0 && this.getStart() == that.getStart(); + } + /** * Tests whether any portion of this contig is before that contig. * @param that Other contig to test. diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b780d0966..1d12d6f8b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -293,6 +293,10 @@ public class Utils { } } + public static String join(final String separator, final T ... objects) { + return join(separator, Arrays.asList(objects)); + } + public static String dupString(char c, int nCopies) { char[] chars = new char[nCopies]; Arrays.fill(chars, c); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index be87e7306..a8aefb703 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -30,12 +30,17 @@ import net.sf.samtools.SAMSequenceRecord; import org.apache.commons.io.FilenameUtils; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodecHeader; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.*; /** @@ -317,4 +322,33 @@ public class VCFUtils { assembly = "hg19"; return assembly; } + + /** + * Read all of the VCF records from source into memory, returning the header and the VariantContexts + * + * @param source the file to read, must be in VCF4 format + * @return + * @throws IOException + */ + public static Pair> readVCF(final File source) throws IOException { + // read in the features + final List vcs = new ArrayList(); + final VCFCodec codec = new VCFCodec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(source)); + pbs.skip(header.getHeaderEnd()); + + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + + while ( ! pbs.isDone() ) { + final VariantContext vc = codec.decode(pbs); + if ( vc != null ) + vcs.add(vc); + } + + return new Pair>(vcfHeader, vcs); + } } \ No newline at end of file From ff87642a91c569982fed062d34cf1bf63caab63b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 20 Nov 2012 22:29:56 -0500 Subject: [PATCH 60/90] Enable cycle covariate unit tests --- .../sting/utils/recalibration/CycleCovariateUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java index c3d93b2cb..deb0931d6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java @@ -24,7 +24,7 @@ public class CycleCovariateUnitTest { covariate.initialize(RAC); } - @Test(enabled = false) + @Test(enabled = true) public void testSimpleCycles() { short readLength = 10; GATKSAMRecord read = ReadUtils.createRandomRead(readLength); From 72e2d569c540e758284d9304caf44bf2b2e6ca35 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 20 Nov 2012 22:41:57 -0500 Subject: [PATCH 61/90] The user can now set the maximum allowable cycle on the command-line with --maximum_cycle_value. This value is (now) enforced in the Cycle covariate and a User Error is thrown if the maximum value is passed (with a helpful error message). Added unit tests to cover this new functionality. --- .../bqsr/RecalibrationArgumentCollection.java | 18 ++++++++++--- .../covariates/CycleCovariate.java | 7 +++++- .../recalibration/CycleCovariateUnitTest.java | 25 ++++++++++++++++++- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index e5704a1e2..c64482151 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -102,13 +102,10 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) public boolean DO_NOT_USE_STANDARD_COVARIATES = false; - ///////////////////////////// - // Debugging-only Arguments - ///////////////////////////// /** * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. */ - @Hidden + @Advanced @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") public boolean RUN_WITHOUT_DBSNP = false; @@ -139,6 +136,13 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false) public int INDELS_CONTEXT_SIZE = 3; + /** + * The cycle covariate will generate an error if it encounters a cycle greater than this value. + * This argument is ignored if the Cycle covariate is not used. + */ + @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false) + public int MAXIMUM_CYCLE_VALUE = 500; + /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off) */ @@ -176,9 +180,15 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") public String BINARY_TAG_NAME = null; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + @Hidden @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; + @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java index 5d0d94b69..a9b6c7152 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java @@ -49,7 +49,7 @@ import java.util.EnumSet; public class CycleCovariate implements StandardCovariate { - private static final int MAXIMUM_CYCLE_VALUE = 1000; + private int MAXIMUM_CYCLE_VALUE; private static final int CUSHION_FOR_INDELS = 4; private String default_platform = null; @@ -59,6 +59,8 @@ public class CycleCovariate implements StandardCovariate { // Initialize any member variables using the command-line arguments passed to the walkers @Override public void initialize(final RecalibrationArgumentCollection RAC) { + this.MAXIMUM_CYCLE_VALUE = RAC.MAXIMUM_CYCLE_VALUE; + if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM)) throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform."); @@ -88,6 +90,9 @@ public class CycleCovariate implements StandardCovariate { final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1; for (int i = 0; i < readLength; i++) { + if ( cycle > MAXIMUM_CYCLE_VALUE ) + throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle was detected in read " + read.getReadName() + ". Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)"); + final int substitutionKey = keyFromCycle(cycle); final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey; values.addCovariate(substitutionKey, indelKey, indelKey, i); diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java index deb0931d6..b73b1a311 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -53,9 +54,31 @@ public class CycleCovariateUnitTest { for (short i = 0; i < values.length; i++) { short actual = Short.decode(covariate.formatKey(values[i][0])); int expected = init + (increment * i); - // System.out.println(String.format("%d: %d, %d", i, actual, expected)); Assert.assertEquals(actual, expected); } } + @Test(enabled = true, expectedExceptions={UserException.class}) + public void testMoreThanMaxCycleFails() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } + + @Test(enabled = true) + public void testMaxCyclePasses() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } } From c54fc94505e6d1b913081b8854d5063d5ed11593 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 20 Nov 2012 23:19:59 -0500 Subject: [PATCH 62/90] Protect against features that start off the end of the read (otherwise, Arrays.fill fails) --- .../gatk/walkers/bqsr/BaseRecalibrator.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 9506510a9..b415bb1f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -268,16 +268,25 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche } protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) { - final int BUFFER_SIZE = 0; final int readLength = read.getReadBases().length; final boolean[] knownSites = new boolean[readLength]; Arrays.fill(knownSites, false); for( final Feature f : features ) { int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? - if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; } + if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureStartOnRead = 0; + } + int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true); - if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; } - Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true); + if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureEndOnRead = readLength; + } + + if( featureStartOnRead > readLength ) { + featureStartOnRead = featureEndOnRead = readLength; + } + + Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true); } return knownSites; } From c8be7c3102d7764b0141665bc64955062da8c00a Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Wed, 21 Nov 2012 15:56:53 -0500 Subject: [PATCH 64/90] Keep SNPs and indels separately for batch merging; Add options to DepthOfCoverage to count fragments (to not double-count overlapping reads of same fragment); DepthOfCoverage should now support ReducedReads; Replace recusrion with loop in DoC/package.scala (for lists longer than 5000 elements) --- .../gatk/walkers/coverage/CoverageUtils.java | 114 +++++++++++++++--- .../walkers/coverage/DepthOfCoverage.java | 6 +- .../pileup/AbstractReadBackedPileup.java | 56 ++++++++- .../sting/utils/pileup/ReadBackedPileup.java | 17 +++ .../queue/qscripts/CNV/xhmmCNVpipeline.scala | 25 +++- .../sting/queue/util/DoC/package.scala | 26 ++-- .../sting/queue/util/VCF_BAM_utilities.scala | 27 ++--- 7 files changed, 215 insertions(+), 56 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index a41e55166..21532823b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -6,11 +6,10 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.pileup.PileupElement; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; +import java.util.*; /** * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl @@ -20,6 +19,21 @@ import java.util.Map; */ public class CoverageUtils { + public enum CountPileupType { + /** + * Count all reads independently (even if from the same fragment). + */ + COUNT_READS, + /** + * Count all fragments (even if the reads that compose the fragment are not consistent at that base). + */ + COUNT_FRAGMENTS, + /** + * Count all fragments (but only if the reads that compose the fragment are consistent at that base). + */ + COUNT_FRAGMENTS_REQUIRE_SAME_BASE + } + /** * Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context * as an array of ints, indexed by the index fields of BaseUtils @@ -64,10 +78,10 @@ public class CoverageUtils { } public static Map> - getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, Collection types) { + getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType, Collection types) { Map> countsByIDByType = new HashMap>(); - Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ); + Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ,countType); for (DoCOutputType.Partition t : types ) { // iterate through the read group counts and build the type associations for ( Map.Entry readGroupCountEntry : countsByRG.entrySet() ) { @@ -95,31 +109,95 @@ public class CoverageUtils { } } - public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) { Map countsByRG = new HashMap(); - for ( PileupElement e : context.getBasePileup() ) { - if ( e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ) ) { - SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); - if ( ! countsByRG.keySet().contains(readGroup) ) { - countsByRG.put(readGroup,new int[6]); - updateCounts(countsByRG.get(readGroup),e); - } else { - updateCounts(countsByRG.get(readGroup),e); + + List countPileup = new LinkedList(); + FragmentCollection fpile; + + switch (countType) { + + case COUNT_READS: + for (PileupElement e : context.getBasePileup()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + break; + + case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + // iterate over all elements in fragment: + for (PileupElement e : overlappingPair) { + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) { + countPileup.add(e); // add the first passing element per fragment + break; + } + } } - } + break; + + case COUNT_FRAGMENTS_REQUIRE_SAME_BASE: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + PileupElement firstElem = null; + PileupElement addElem = null; + + // iterate over all elements in fragment: + for (PileupElement e : overlappingPair) { + if (firstElem == null) + firstElem = e; + else if (e.getBase() != firstElem.getBase()) { + addElem = null; + break; + } + + // will add the first passing element per base-consistent fragment: + if (addElem == null && countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + addElem = e; + } + + if (addElem != null) + countPileup.add(addElem); + } + break; + + default: + throw new UserException("Must use valid CountPileupType"); + } + + for (PileupElement e : countPileup) { + SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); + if (!countsByRG.keySet().contains(readGroup)) + countsByRG.put(readGroup, new int[6]); + + updateCounts(countsByRG.get(readGroup), e); } return countsByRG; } + private static boolean countElement(PileupElement e, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + return (e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() )); + } + private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.DELETION_INDEX]++; + counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount(); } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { - counts[BaseUtils.NO_CALL_INDEX]++; + counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount(); } else { try { - counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++; + counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); } catch (ArrayIndexOutOfBoundsException exc) { throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 44b0d74ca..fe9942662 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -129,11 +129,15 @@ public class DepthOfCoverage extends LocusWalker tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); filteredTracker.addElements(sample, pileup.pileupElementTracker); } return (RBP) createNewPileup(loc, filteredTracker); @@ -284,11 +297,16 @@ public abstract class AbstractReadBackedPileup sortedElements = new TreeSet(new Comparator() { + @Override + public int compare(PE element1, PE element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + for (PE pile : tracker) + sortedElements.add(pile); + + UnifiedPileupElementTracker sortedTracker = new UnifiedPileupElementTracker(); + for (PE pile : sortedElements) + sortedTracker.add(pile); + + return (RBP) createNewPileup(this.getLocation(), sortedTracker); + } + @Override public FragmentCollection toFragments() { return FragmentUtils.create(this); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index f15468840..be61bad99 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -60,6 +60,16 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getOverlappingFragmentFilteredPileup(); + /** + * Returns a new ReadBackedPileup where only one read from an overlapping read + * pair is retained. If discardDiscordant and the two reads in question disagree to their basecall, + * neither read is retained. Otherwise, the read with the higher + * quality (base or mapping, depending on baseQualNotMapQual) observation is retained + * + * @return the newly filtered pileup + */ + public ReadBackedPileup getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual); + /** * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy @@ -261,6 +271,13 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public byte[] getMappingQuals(); + /** + * Returns a new ReadBackedPileup that is sorted by start coordinate of the reads. + * + * @return + */ + public ReadBackedPileup getStartSortedPileup(); + /** * Converts this pileup into a FragmentCollection (see FragmentUtils for documentation) * @return diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala index 8db089484..c556913ab 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -8,6 +8,7 @@ import org.broadinstitute.sting.commandline.Hidden import java.io.{PrintStream, PrintWriter} import org.broadinstitute.sting.utils.text.XReadLines import collection.JavaConversions._ +import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils class xhmmCNVpipeline extends QScript { qscript => @@ -15,22 +16,22 @@ class xhmmCNVpipeline extends QScript { @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) var bams: File = _ - @Argument(doc = "gatk jar file", shortName = "J", required = true) + @Input(doc = "gatk jar file", shortName = "J", required = true) var gatkJarFile: File = _ - @Argument(doc = "xhmm executable file", shortName = "xhmmExec", required = true) + @Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true) var xhmmExec: File = _ - @Argument(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) + @Input(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) var pseqExec: File = _ @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true) var pseqSeqDB: String = _ - @Argument(shortName = "R", doc = "ref", required = true) + @Input(shortName = "R", doc = "ref", required = true) var referenceFile: File = _ - @Argument(shortName = "L", doc = "Intervals", required = false) + @Input(shortName = "L", doc = "Intervals", required = false) var intervals: File = _ @Argument(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) @@ -42,6 +43,15 @@ class xhmmCNVpipeline extends QScript { @Output(doc = "Base name for files to output", shortName = "o", required = true) var outputBase: File = _ + @Hidden + @Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false) + // TODO: change this to be the default once reads can be ordered properly for FragmentUtils.create(): + // + // Don't want to double-count (but also don't mind counting base-inconsistencies in overlap): + //var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS + // + var countType = CoverageUtils.CountPileupType.COUNT_READS + @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) var MAX_DEPTH = 20000 @@ -56,6 +66,9 @@ class xhmmCNVpipeline extends QScript { @Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) var minMappingQuality = 0 + @Argument(doc = "Minimum base quality to be counted in depth", shortName = "MBQ", required = false) + var minBaseQuality = 0 + @Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) var wholeMatrixMemory = -1 @@ -159,7 +172,7 @@ class xhmmCNVpipeline extends QScript { var docs: List[DoC] = List[DoC]() for (group <- groups) { Console.out.printf("Group is %s%n", group) - docs ::= new DoC(group.bams, group.DoC_output, MAX_DEPTH, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs + docs ::= new DoC(group.bams, group.DoC_output, countType, MAX_DEPTH, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs } addAll(docs) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala index f35db4aa3..2b19b0f8e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala @@ -6,9 +6,10 @@ import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFu import org.broadinstitute.sting.gatk.downsampling.DownsampleType import org.broadinstitute.sting.commandline.{Input, Gather, Output} import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils package object DoC { - class DoC(val bams: List[File], val DoC_output: File, val MAX_DEPTH: Int, val minMappingQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction { + class DoC(val bams: List[File], val DoC_output: File, val countType: CoverageUtils.CountPileupType, val MAX_DEPTH: Int, val minMappingQuality: Int, val minBaseQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction { val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary" // So that the output files of this DoC run get deleted once they're used further downstream: @@ -32,8 +33,9 @@ package object DoC { override def commandLine = super.commandLine + " --omitDepthOutputAtEachBase" + " --omitLocusTable" + - " --minBaseQuality 0" + " --minMappingQuality " + minMappingQuality + + " --minBaseQuality " + minBaseQuality + + optional("--countType", countType, spaceSeparated=true, escape=true, format="%s") + " --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS + (if (!minCoverageCalcs.isEmpty) minCoverageCalcs.map(cov => " --summaryCoverageThreshold " + cov).reduceLeft(_ + "" + _) else "") + " --includeRefNSites" + @@ -42,7 +44,7 @@ package object DoC { override def shortDescription = "DoC: " + DoC_output } - class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, MAX_DEPTH: Int, minMappingQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, MAX_DEPTH: Int, minMappingQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) { + class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality: Int, minBaseQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) { // HACK for DoC to work properly within Queue: @Output @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) @@ -52,15 +54,21 @@ package object DoC { } def buildDoCgroups(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]], samplesPerJob: Int, outputBase: File): List[Group] = { + var l: List[Group] = Nil - def buildDoCgroupsHelper(samples: List[String], count: Int): List[Group] = (samples splitAt samplesPerJob) match { - case (Nil, y) => - return Nil - case (subsamples, remaining) => - return new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) :: buildDoCgroupsHelper(remaining, count + 1) + var remaining = samples + var subsamples: List[String] = Nil + var count = 1 + + while (!remaining.isEmpty) { + val splitRes = (remaining splitAt samplesPerJob) + subsamples = splitRes._1 + remaining = splitRes._2 + l ::= new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) + count = count + 1 } - return buildDoCgroupsHelper(samples, 0) + return l } // A group has a list of samples and bam files to use for DoC diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala b/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala index 1f18858e1..3fe867981 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala @@ -26,36 +26,31 @@ object VCF_BAM_utilities { case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list") } - def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match { - case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]] - - case x :: y => - val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBAMsForSample(y) - val bamSamples: List[String] = getSamplesInBAM(x) + def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = { + var m = scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]] + for (bam <- bams) { + val bamSamples: List[String] = getSamplesInBAM(bam) for (s <- bamSamples) { if (!m.contains(s)) m += s -> scala.collection.mutable.Set.empty[File] - m(s) = m(s) + x + m(s) += bam } + } return m } def findBAMsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = { + var s = scala.collection.mutable.Set.empty[File] - def findBAMsForSamplesHelper(samples: List[String]): scala.collection.mutable.Set[File] = samples match { - case Nil => scala.collection.mutable.Set.empty[File] - - case x :: y => - var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File] - if (sampleToBams.contains(x)) - bamsForSampleX = sampleToBams(x) - return bamsForSampleX ++ findBAMsForSamplesHelper(y) + for (sample <- samples) { + if (sampleToBams.contains(sample)) + s ++= sampleToBams(sample) } val l: List[File] = Nil - return l ++ findBAMsForSamplesHelper(samples) + return l ++ s } } From ed50814ccba7fe72f296c466e0abf64d7923c51d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 21 Nov 2012 15:57:05 -0500 Subject: [PATCH 65/90] Finally found a case where user errors were being masked behind other errors and could debug. It turns out that the checkForMaskedUserErrors() method needs to run recursively over all levels (calling exception.getCause()) to check for the original cause. --- .../sting/gatk/CommandLineGATK.java | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 0daad2c2b..4f9031329 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -118,17 +118,24 @@ public class CommandLineGATK extends CommandLineExecutable { public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; private static void checkForMaskedUserErrors(final Throwable t) { + // masked out of memory error + if ( t instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked user error + if ( t instanceof UserException || t instanceof TribbleException ) + exitSystemWithUserError(new UserException(t.getMessage())); + + // no message means no masked error final String message = t.getMessage(); if ( message == null ) return; - // we know what to do about the common "Too many open files" error + // too many open files error if ( message.contains("Too many open files") ) exitSystemWithUserError(new UserException.TooManyOpenFiles()); // malformed BAM looks like a SAM file - if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || - message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) exitSystemWithSamError(t); // can't close tribble index when writing @@ -138,12 +145,10 @@ public class CommandLineGATK extends CommandLineExecutable { // disk is full if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) ) - exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - // masked out of memory error - if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError ) - exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked error wrapped in another one + if ( t.getCause() != null ) + checkForMaskedUserErrors(t.getCause()); } /** From a8c7edca053a5e0fbe360bd9bd3bc09472e9b532 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Wed, 21 Nov 2012 16:01:10 -0500 Subject: [PATCH 66/90] Fixed fragment handling in DepthOfCoverage --- .../sting/queue/qscripts/CNV/xhmmCNVpipeline.scala | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala index c556913ab..28a2534c0 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -45,12 +45,7 @@ class xhmmCNVpipeline extends QScript { @Hidden @Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false) - // TODO: change this to be the default once reads can be ordered properly for FragmentUtils.create(): - // - // Don't want to double-count (but also don't mind counting base-inconsistencies in overlap): - //var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS - // - var countType = CoverageUtils.CountPileupType.COUNT_READS + var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) var MAX_DEPTH = 20000 From 4f2229d399948200576695574ef46111f3ed542d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 21 Nov 2012 16:01:26 -0500 Subject: [PATCH 67/90] As per the TODO message, I removed a check that was no longer necessary. Now ID is an allowable INFO field key. --- .../sting/utils/variantcontext/VariantContext.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 27a5b0c24..12f9cb20c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -184,9 +184,6 @@ public class VariantContext implements Feature { // to enable tribble integratio protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; - @Deprecated // ID is no longer stored in the attributes map - private final static String ID_KEY = "ID"; - public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); /** The location of this VariantContext */ @@ -287,10 +284,6 @@ public class VariantContext implements Feature { // to enable tribble integratio this.commonInfo = new CommonInfo(source, log10PError, filters, attributes); - // todo -- remove me when this check is no longer necessary - if ( this.commonInfo.hasAttribute(ID_KEY) ) - throw new IllegalArgumentException("Trying to create a VariantContext with a ID key. Please use provided constructor argument ID"); - if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } // we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles From c08b78274395c916db872417d4f17ebdacd39906 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 19 Nov 2012 14:39:06 -0500 Subject: [PATCH 68/90] Count isActive calls directly --- .../sting/gatk/traversals/TraverseActiveRegions.java | 5 +---- .../sting/utils/activeregion/ActivityProfile.java | 5 ----- .../gatk/traversals/TraverseActiveRegionsTest.java | 11 ++++++++--- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 4fe83f331..a2c37944a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -34,9 +34,6 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); - // package access for unit testing - ActivityProfile profile; - @Override public String getTraversalUnits() { return "active regions"; @@ -56,7 +53,7 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = new LinkedList(); - profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 38cfbb38d..e96eb843d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -103,11 +103,6 @@ public class ActivityProfile { isActiveList.add(result); } - // for unit testing - public List getActiveList() { - return isActiveList; - } - public int size() { return isActiveList.size(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index 8740a8b68..edc818aca 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -41,6 +41,7 @@ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; + public List isActiveCalls = new ArrayList(); public DummyActiveRegionWalker() { this.prob = 1.0; @@ -48,6 +49,7 @@ public class TraverseActiveRegionsTest extends BaseTest { @Override public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); return new ActivityProfileResult(ref.getLocus(), prob); } @@ -71,7 +73,7 @@ public class TraverseActiveRegionsTest extends BaseTest { private IndexedFastaSequenceFile reference; private GenomeLocParser genomeLocParser; - private ActiveRegionWalker walker; + private DummyActiveRegionWalker walker; @BeforeClass private void init() throws FileNotFoundException { @@ -83,18 +85,21 @@ public class TraverseActiveRegionsTest extends BaseTest { @Test public void testAllIntervalsSeen() throws Exception { List intervals = new ArrayList(); + List activeIntervals = new ArrayList(); + GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1); intervals.add(interval); LocusShardDataProvider dataProvider = createDataProvider(intervals); t.traverse(walker, dataProvider, 0); + activeIntervals.addAll(walker.isActiveCalls); boolean allGenomeLocsSeen = true; for (GenomeLoc loc : intervals) { boolean thisGenomeLocSeen = false; - for (ActivityProfileResult active : t.profile.getActiveList()) { - if (loc.equals(active.getLoc())) { + for (GenomeLoc activeLoc : activeIntervals) { + if (loc.equals(activeLoc)) { thisGenomeLocSeen = true; break; } From e8defcb20dfcc50cec565fbb7fd2e93479162ae5 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 19 Nov 2012 14:44:00 -0500 Subject: [PATCH 69/90] Test multiple bases and intervals --- .../traversals/TraverseActiveRegionsTest.java | 65 +++++++++++++------ 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index edc818aca..d61da5a83 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -83,49 +83,74 @@ public class TraverseActiveRegionsTest extends BaseTest { } @Test - public void testAllIntervalsSeen() throws Exception { + public void testAllBasesSeenSuite() { List intervals = new ArrayList(); List activeIntervals = new ArrayList(); GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1); intervals.add(interval); + testAllBasesSeen(intervals); - LocusShardDataProvider dataProvider = createDataProvider(intervals); + interval = genomeLocParser.createGenomeLoc("1", 10, 20); + intervals.add(interval); + testAllBasesSeen(intervals); + } - t.traverse(walker, dataProvider, 0); - activeIntervals.addAll(walker.isActiveCalls); + public void testAllBasesSeen(List intervals) { + List activeIntervals = new ArrayList(); + for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) { + t.traverse(walker, dataProvider, 0); + activeIntervals.addAll(walker.isActiveCalls); + } - boolean allGenomeLocsSeen = true; - for (GenomeLoc loc : intervals) { - boolean thisGenomeLocSeen = false; + boolean allBasesSeen = true; + for (GenomeLoc base : toBases(intervals)) { + boolean thisBaseSeen = false; for (GenomeLoc activeLoc : activeIntervals) { - if (loc.equals(activeLoc)) { - thisGenomeLocSeen = true; + if (base.equals(activeLoc)) { + thisBaseSeen = true; break; } } - if (!thisGenomeLocSeen) { - allGenomeLocsSeen = false; + if (!thisBaseSeen) { + allBasesSeen = false; break; } } - Assert.assertTrue(allGenomeLocsSeen, "Some intervals missing from activity profile"); + Assert.assertTrue(allBasesSeen, "Some intervals missing from activity profile"); } - private LocusShardDataProvider createDataProvider(List intervals) { + private List toBases(List intervals) { + List bases = new ArrayList(); + for (GenomeLoc interval : intervals) { + if (interval.size() == 1) + bases.add(interval); + else { + for (int location = interval.getStart(); location <= interval.getStop(); location++) { + bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); + } + } + } + return bases; + } + + private List createDataProviders(List intervals) { walker = new DummyActiveRegionWalker(); - StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList()); - Shard shard = new MockLocusShard(genomeLocParser, intervals); - WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - //engine.setReferenceDataSource(reference); engine.setGenomeLocParser(genomeLocParser); t.initialize(engine); - return new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList()); + StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList()); + Shard shard = new MockLocusShard(genomeLocParser, intervals); + + List providers = new ArrayList(); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + for (WindowMaker.WindowMakerIterator window : windowMaker) { + providers.add(new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + } + + return providers; } } From 3fa3b00f4abd3159b67a8013505606bf96db1a38 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 19 Nov 2012 14:45:19 -0500 Subject: [PATCH 70/90] Add ActiveRegion tests and refactor --- .../traversals/TraverseActiveRegionsTest.java | 156 +++++++++++++----- 1 file changed, 115 insertions(+), 41 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index d61da5a83..ce4d400b4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -28,20 +28,23 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * Created with IntelliJ IDEA. * User: thibault * Date: 11/13/12 * Time: 2:47 PM + * + * Test the Active Region Traversal Contract + * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract */ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; public List isActiveCalls = new ArrayList(); + public List mappedActiveRegions = new ArrayList(); public DummyActiveRegionWalker() { this.prob = 1.0; @@ -55,6 +58,7 @@ public class TraverseActiveRegionsTest extends BaseTest { @Override public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.add(activeRegion); return 0; } @@ -73,7 +77,6 @@ public class TraverseActiveRegionsTest extends BaseTest { private IndexedFastaSequenceFile reference; private GenomeLocParser genomeLocParser; - private DummyActiveRegionWalker walker; @BeforeClass private void init() throws FileNotFoundException { @@ -83,61 +86,133 @@ public class TraverseActiveRegionsTest extends BaseTest { } @Test - public void testAllBasesSeenSuite() { + public void testAllBasesSeen() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); List intervals = new ArrayList(); - List activeIntervals = new ArrayList(); - GenomeLoc interval = genomeLocParser.createGenomeLoc("1", 1, 1); - intervals.add(interval); - testAllBasesSeen(intervals); + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 1)); + List activeIntervals = getIsActiveIntervals(walker, intervals); + // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call + verifyEqualIntervals(intervals, activeIntervals); - interval = genomeLocParser.createGenomeLoc("1", 10, 20); - intervals.add(interval); - testAllBasesSeen(intervals); + intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); + activeIntervals = getIsActiveIntervals(walker, intervals); + // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call + verifyEqualIntervals(intervals, activeIntervals); + + // TODO: more tests and edge cases } - public void testAllBasesSeen(List intervals) { + private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { List activeIntervals = new ArrayList(); for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) { t.traverse(walker, dataProvider, 0); activeIntervals.addAll(walker.isActiveCalls); } - boolean allBasesSeen = true; - for (GenomeLoc base : toBases(intervals)) { - boolean thisBaseSeen = false; - for (GenomeLoc activeLoc : activeIntervals) { - if (base.equals(activeLoc)) { - thisBaseSeen = true; - break; - } - } - if (!thisBaseSeen) { - allBasesSeen = false; - break; - } - } - - Assert.assertTrue(allBasesSeen, "Some intervals missing from activity profile"); + return activeIntervals; } - private List toBases(List intervals) { - List bases = new ArrayList(); + @Test + public void testActiveRegionCoverage() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + List intervals = new ArrayList(); + + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + + List activeRegions = getActiveRegions(walker, intervals); + verifyActiveRegionCoverage(intervals, activeRegions); + + // TODO: more tests and edge cases + } + + private void verifyActiveRegionCoverage(List intervals, List activeRegions) { + List intervalStarts = new ArrayList(); + List intervalStops = new ArrayList(); + for (GenomeLoc interval : intervals) { - if (interval.size() == 1) - bases.add(interval); - else { - for (int location = interval.getStart(); location <= interval.getStop(); location++) { - bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); - } - } + intervalStarts.add(interval.getStartLocation()); + intervalStops.add(interval.getStopLocation()); } + + Map baseRegionMap = new HashMap(); + + for (ActiveRegion activeRegion : activeRegions) { + for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { + // Contract: Regions do not overlap + Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); + baseRegionMap.put(activeLoc, activeRegion); + } + + GenomeLoc start = activeRegion.getLocation().getStartLocation(); + if (intervalStarts.contains(start)) + intervalStarts.remove(start); + + GenomeLoc stop = activeRegion.getLocation().getStopLocation(); + if (intervalStops.contains(stop)) + intervalStops.remove(stop); + } + + for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { + // Contract: Each location in the interval(s) is in exactly one region + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); + baseRegionMap.remove(baseLoc); + } + + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); + + // Contract: All explicit interval boundaries must also be region boundaries + Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); + Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); + } + + private List getActiveRegions(DummyActiveRegionWalker walker, List intervals) { + for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) + t.traverse(walker, dataProvider, 0); + + return walker.mappedActiveRegions; + } + + private Collection toSingleBaseLocs(GenomeLoc interval) { + List bases = new ArrayList(); + if (interval.size() == 1) + bases.add(interval); + else { + for (int location = interval.getStart(); location <= interval.getStop(); location++) + bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); + } + return bases; } - private List createDataProviders(List intervals) { - walker = new DummyActiveRegionWalker(); + private Collection toSingleBaseLocs(List intervals) { + Set bases = new TreeSet(); // for sorting and uniqueness + for (GenomeLoc interval : intervals) + bases.addAll(toSingleBaseLocs(interval)); + return bases; + } + + private void verifyEqualIntervals(List aIntervals, List bIntervals) { + Collection aBases = toSingleBaseLocs(aIntervals); + Collection bBases = toSingleBaseLocs(bIntervals); + + Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); + + Iterator aIter = aBases.iterator(); + Iterator bIter = bBases.iterator(); + while (aIter.hasNext() && bIter.hasNext()) { + GenomeLoc aLoc = aIter.next(); + GenomeLoc bLoc = bIter.next(); + Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc); + } + } + + private List createDataProviders(List intervals) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); t.initialize(engine); @@ -146,8 +221,7 @@ public class TraverseActiveRegionsTest extends BaseTest { Shard shard = new MockLocusShard(genomeLocParser, intervals); List providers = new ArrayList(); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - for (WindowMaker.WindowMakerIterator window : windowMaker) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, iterator, shard.getGenomeLocs())) { providers.add(new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList())); } From 3ad9128800922ab40aeb5504ff5b0e5a6402e591 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Tue, 20 Nov 2012 13:11:24 -0500 Subject: [PATCH 71/90] Add some reads - Move intervals and reads to init - Update intervals and reads --- .../traversals/TraverseActiveRegionsTest.java | 73 ++++++++++++++----- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index ce4d400b4..2780b7421 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -1,9 +1,10 @@ package org.broadinstitute.sting.gatk.traversals; -import org.testng.Assert; +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -22,6 +23,7 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -43,8 +45,8 @@ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; - public List isActiveCalls = new ArrayList(); - public List mappedActiveRegions = new ArrayList(); + protected List isActiveCalls = new ArrayList(); + protected List mappedActiveRegions = new ArrayList(); public DummyActiveRegionWalker() { this.prob = 1.0; @@ -76,30 +78,46 @@ public class TraverseActiveRegionsTest extends BaseTest { private final TraverseActiveRegions t = new TraverseActiveRegions(); private IndexedFastaSequenceFile reference; + private SAMSequenceDictionary dictionary; private GenomeLocParser genomeLocParser; + private List intervals; + private List reads; + @BeforeClass private void init() throws FileNotFoundException { reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); - SAMSequenceDictionary dictionary = reference.getSequenceDictionary(); + dictionary = reference.getSequenceDictionary(); genomeLocParser = new GenomeLocParser(dictionary); + + intervals = new ArrayList(); + intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + // TODO: this fails! + //intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 20000)); + intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); + + reads = new ArrayList(); + reads.add(buildSAMRecord("overlap_overlapped_equal", "1", 10, 20)); + reads.add(buildSAMRecord("overlap_overlapped_unequal", "1", 10, 21)); + reads.add(buildSAMRecord("overlap_boundary_equal", "1", 1990, 2009)); + reads.add(buildSAMRecord("overlap_boundary_unequal", "1", 1995, 2050)); + reads.add(buildSAMRecord("extended_only", "1", 3000, 3100)); + reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); + reads.add(buildSAMRecord("simple", "20", 1000100, 1000150)); } @Test public void testAllBasesSeen() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - List intervals = new ArrayList(); - intervals.add(genomeLocParser.createGenomeLoc("1", 1, 1)); List activeIntervals = getIsActiveIntervals(walker, intervals); // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call verifyEqualIntervals(intervals, activeIntervals); - intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); - activeIntervals = getIsActiveIntervals(walker, intervals); - // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call - verifyEqualIntervals(intervals, activeIntervals); - // TODO: more tests and edge cases } @@ -116,11 +134,6 @@ public class TraverseActiveRegionsTest extends BaseTest { @Test public void testActiveRegionCoverage() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - List intervals = new ArrayList(); - - intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); List activeRegions = getActiveRegions(walker, intervals); verifyActiveRegionCoverage(intervals, activeRegions); @@ -212,17 +225,37 @@ public class TraverseActiveRegionsTest extends BaseTest { } } + // copied from LocusViewTemplate + protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { + SAMFileHeader header = new SAMFileHeader(); + header.setSequenceDictionary(dictionary); + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(dictionary.getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + + Cigar cigar = new Cigar(); + int len = alignmentEnd - alignmentStart + 1; + cigar.add(new CigarElement(len, CigarOperator.M)); + record.setCigar(cigar); + record.setReadBases(new byte[len]); + record.setBaseQualities(new byte[len]); + + return record; + } + private List createDataProviders(List intervals) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); t.initialize(engine); - StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(new ArrayList()); + StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(reads); Shard shard = new MockLocusShard(genomeLocParser, intervals); List providers = new ArrayList(); for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, iterator, shard.getGenomeLocs())) { - providers.add(new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); } return providers; From c68bc95db6635ccbe90462d18aa5d3a4ae7ace38 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 21 Nov 2012 16:22:57 -0500 Subject: [PATCH 72/90] Initial read mapping tests - Failing tests are commented out --- .../traversals/TraverseActiveRegionsTest.java | 115 ++++++++++++++++-- 1 file changed, 105 insertions(+), 10 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java index 2780b7421..e4c7b2db0 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -46,7 +46,7 @@ public class TraverseActiveRegionsTest extends BaseTest { private class DummyActiveRegionWalker extends ActiveRegionWalker { private final double prob; protected List isActiveCalls = new ArrayList(); - protected List mappedActiveRegions = new ArrayList(); + protected Map mappedActiveRegions = new HashMap(); public DummyActiveRegionWalker() { this.prob = 1.0; @@ -60,7 +60,7 @@ public class TraverseActiveRegionsTest extends BaseTest { @Override public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.add(activeRegion); + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); return 0; } @@ -101,13 +101,16 @@ public class TraverseActiveRegionsTest extends BaseTest { intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); reads = new ArrayList(); - reads.add(buildSAMRecord("overlap_overlapped_equal", "1", 10, 20)); - reads.add(buildSAMRecord("overlap_overlapped_unequal", "1", 10, 21)); - reads.add(buildSAMRecord("overlap_boundary_equal", "1", 1990, 2009)); - reads.add(buildSAMRecord("overlap_boundary_unequal", "1", 1995, 2050)); + reads.add(buildSAMRecord("simple", "1", 100, 200)); + reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); + reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); + reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); + reads.add(buildSAMRecord("boundary_unequal", "1", 1995, 2050)); reads.add(buildSAMRecord("extended_only", "1", 3000, 3100)); reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); - reads.add(buildSAMRecord("simple", "20", 1000100, 1000150)); + reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + // TODO + //reads.add(buildSAMRecord("simple20", "20", 10100, 10150)); } @Test @@ -135,13 +138,13 @@ public class TraverseActiveRegionsTest extends BaseTest { public void testActiveRegionCoverage() { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - List activeRegions = getActiveRegions(walker, intervals); + Collection activeRegions = getActiveRegions(walker, intervals).values(); verifyActiveRegionCoverage(intervals, activeRegions); // TODO: more tests and edge cases } - private void verifyActiveRegionCoverage(List intervals, List activeRegions) { + private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { List intervalStarts = new ArrayList(); List intervalStops = new ArrayList(); @@ -183,7 +186,99 @@ public class TraverseActiveRegionsTest extends BaseTest { Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); } - private List getActiveRegions(DummyActiveRegionWalker walker, List intervals) { + @Test + public void testReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_unequal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // extended_only: Extended in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + + // TODO + // simple20: Primary in 20:10000-20000 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + + verifyReadPrimary(region, "simple"); + verifyReadPrimary(region, "overlap_equal"); + verifyReadPrimary(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail verifyReadNonPrimary(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + // TODO: fail verifyReadPrimary(region, "boundary_equal"); + // TODO: fail verifyReadNonPrimary(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail verifyReadPrimary(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + // TODO: fail verifyReadNonPrimary(region, "boundary_equal"); + verifyReadPrimary(region, "boundary_unequal"); + // TODO: fail verifyReadExtended(region, "extended_only"); + // TODO: fail verifyReadExtended(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + + // TODO: more tests and edge cases + } + + private void verifyReadPrimary(ActiveRegion region, String readName) { + SAMRecord read = getRead(region, readName); + Assert.assertFalse(read.getNotPrimaryAlignmentFlag(), "Read " + read + " not primary in active region " + region); + } + + private void verifyReadNonPrimary(ActiveRegion region, String readName) { + SAMRecord read = getRead(region, readName); + Assert.assertTrue(read.getNotPrimaryAlignmentFlag(), "Read " + read + " primary in active region " + region); + } + + private void verifyReadExtended(ActiveRegion region, String readName) { + Assert.fail("The Extended read state has not been implemented"); + } + + private void verifyReadNotPlaced(ActiveRegion region, String readName) { + for (SAMRecord read : region.getReads()) { + if (read.getReadName().equals(readName)) + Assert.fail("Read " + readName + " found in active region " + region); + } + } + + private SAMRecord getRead(ActiveRegion region, String readName) { + for (SAMRecord read : region.getReads()) { + if (read.getReadName().equals(readName)) + return read; + } + + Assert.fail("Read " + readName + " not found in active region " + region); + return null; + } + + private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) t.traverse(walker, dataProvider, 0); From 48f271c5bd825dc0f57249dc6164ec4bc3c35541 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 21 Nov 2012 17:23:41 -0500 Subject: [PATCH 75/90] Adding 80% support for multi-allelic variants -- Multi-allelic variants are split into their bi-allelic version, trimmed, and we attempt to provide a meaningful genotype for NA12878 here. It's not perfect and needs some discussion on how to handle het/alt variants -- Adding splitInBiallelic funtion to VariantContextUtils as well as extensive unit tests that also indirectly test reverseTrimAlleles (which worked perfectly FYI) --- .../variantcontext/VariantContextUtils.java | 34 +++++ .../VariantContextTestProvider.java | 2 +- .../VariantContextUtilsUnitTest.java | 119 +++++++++++++++++- 3 files changed, 150 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 81959c998..1f1867f75 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -979,6 +979,40 @@ public class VariantContextUtils { private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. + * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc) { + if ( ! vc.isVariant() || vc.isBiallelic() ) + // non variant or biallelics already satisfy the contract + return Collections.singletonList(vc); + else { + final List biallelics = new LinkedList(); + + for ( final Allele alt : vc.getAlternateAlleles() ) { + VariantContextBuilder builder = new VariantContextBuilder(vc); + final List alleles = Arrays.asList(vc.getReference(), alt); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + calculateChromosomeCounts(builder, true); + biallelics.add(reverseTrimAlleles(builder.make())); + } + + return biallelics; + } + } + /** * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) * diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 6785fa816..c57b2a44d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -782,7 +782,7 @@ public class VariantContextTestProvider { Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); Assert.assertEquals(actual.getID(), expected.getID(), "id"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java index 114104d42..f3daa9e4c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; @@ -39,7 +39,7 @@ import java.io.FileNotFoundException; import java.util.*; public class VariantContextUtilsUnitTest extends BaseTest { - Allele Aref, T, C, Cref, ATC, ATCATC; + Allele Aref, T, C, G, Cref, ATC, ATCATC; private GenomeLocParser genomeLocParser; @BeforeSuite @@ -58,6 +58,7 @@ public class VariantContextUtilsUnitTest extends BaseTest { Cref = Allele.create("C", true); T = Allele.create("T"); C = Allele.create("C"); + G = Allele.create("G"); ATC = Allele.create("ATC"); ATCATC = Allele.create("ATCATC"); } @@ -697,10 +698,120 @@ public class VariantContextUtilsUnitTest extends BaseTest { return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); } - @Test(dataProvider = "ReverseClippingPositionTestProvider") public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { int result = VariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false); Assert.assertEquals(result, cfg.expectedClip); } -} + + // -------------------------------------------------------------------------------- + // + // test splitting into bi-allelics + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "SplitBiallelics") + public Object[][] makeSplitBiallelics() throws CloneNotSupportedException { + List tests = new ArrayList(); + + final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C)); + + // biallelic -> biallelic + tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); + + // monos -> monos + root.alleles(Arrays.asList(Aref)); + tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); + + root.alleles(Arrays.asList(Aref, C, T)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make())}); + + root.alleles(Arrays.asList(Aref, C, T, G)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make(), + root.alleles(Arrays.asList(Aref, G)).make())}); + + final Allele C = Allele.create("C"); + final Allele CA = Allele.create("CA"); + final Allele CAA = Allele.create("CAA"); + final Allele CAAAA = Allele.create("CAAAA"); + final Allele CAAAAA = Allele.create("CAAAAA"); + final Allele Cref = Allele.create("C", true); + final Allele CAref = Allele.create("CA", true); + final Allele CAAref = Allele.create("CAA", true); + final Allele CAAAref = Allele.create("CAAA", true); + + root.alleles(Arrays.asList(Cref, CA, CAA)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CA)).make(), + root.alleles(Arrays.asList(Cref, CAA)).make())}); + + root.alleles(Arrays.asList(CAAref, C, CA)).stop(12); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAref, C)).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAAref, C)).make(), + root.alleles(Arrays.asList(CAAref, C)).stop(12).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(), + root.alleles(Arrays.asList(Cref, CA)).stop(10).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make(), + root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SplitBiallelics") + public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List biallelics = VariantContextUtils.splitVariantContextToBiallelics(vc); + Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext actual = biallelics.get(i); + final VariantContext expected = expectedBiallelics.get(i); + VariantContextTestProvider.assertEquals(actual, expected); + } + } + + @Test(dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") + public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List genotypes = new ArrayList(); + + int sampleI = 0; + for ( final List alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) { + genotypes.add(GenotypeBuilder.create("sample" + sampleI, alleles)); + } + genotypes.add(GenotypeBuilder.createMissing("missing", 2)); + + final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); + + final List biallelics = VariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext actual = biallelics.get(i); + Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples + + for ( final Genotype inputGenotype : genotypes ) { + final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName()); + Assert.assertNotNull(actualGenotype); + if ( ! vc.isVariant() || vc.isBiallelic() ) + Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName())); + else + Assert.assertTrue(actualGenotype.isNoCall()); + } + } + } +} \ No newline at end of file From 2306518ab6be1c323b46d41b33d481cc1bb65197 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Thu, 22 Nov 2012 01:45:18 -0500 Subject: [PATCH 76/90] Fix to deal with 'proper' options of casting --- .../pileup/AbstractReadBackedPileup.java | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index d0ae68912..42938d2a6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -1022,7 +1022,7 @@ public abstract class AbstractReadBackedPileup sortedElements = new TreeSet(new Comparator() { @Override @@ -1031,15 +1031,27 @@ public abstract class AbstractReadBackedPileup tracker = (UnifiedPileupElementTracker) pileupElementTracker; - for (PE pile : tracker) - sortedElements.add(pile); + + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + for (PE pile : perSampleElements) + sortedElements.add(pile); + } + } + else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + for (PE pile : tracker) + sortedElements.add(pile); + } UnifiedPileupElementTracker sortedTracker = new UnifiedPileupElementTracker(); for (PE pile : sortedElements) sortedTracker.add(pile); - return (RBP) createNewPileup(this.getLocation(), sortedTracker); + return (RBP) createNewPileup(loc, sortedTracker); } @Override From 9719ba7adce0574a48281d6baac7db18ff3208fa Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 22 Nov 2012 21:53:42 -0500 Subject: [PATCH 77/90] Remove -number example from the docs since it's no longer supported. --- .../sting/gatk/walkers/variantutils/SelectVariants.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index d28fe34d6..9253446c8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -151,14 +151,6 @@ import java.util.*; * -mvq 50 \ * -o violations.vcf * - * Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ - * -T SelectVariants \ - * --variant input.vcf \ - * -o output.vcf \ - * -number 1000 - * * Creating a set with 50% of the total number of variants in the variant VCF: * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ From d978cfe8350288bd5dd406f7de224ccb2847838a Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sun, 25 Nov 2012 21:55:29 -0500 Subject: [PATCH 78/90] Soft clipped bases shouldn't be counted in the delocalized BQSR. --- .../walkers/bqsr/BQSRIntegrationTest.java | 31 ++++++++++++++----- .../gatk/walkers/bqsr/BaseRecalibrator.java | 2 +- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index b839382dc..de328c825 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -51,16 +51,16 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "4fd3c9ad97e6ac58cba644a76564c9f7")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "2620f734cce20f70ce13afd880e46e5c")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "5eb3b94e767da19a4c037ee132e4b19a")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab261d291b107a3da7897759c0e4fa89")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "292303f649fbb19dc05d4a0197a49eeb")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "8ced9d1094493f17fb1876b818a64541")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "abb838131e403d39820dbd66932d1ed0")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "8f62aa0e75770204c98d8299793cc53c")}, {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")}, {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")}, @@ -90,6 +90,21 @@ public class BQSRIntegrationTest extends WalkerTest { executeTest("testBQSRFailWithoutDBSNP", spec); } + @Test + public void testBQSRCSV() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + " -T BaseRecalibrator" + + " -R " + b36KGReference + + " -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" + + " -knownSites " + b36dbSNP129 + + " -L 1:10,000,000-10,200,000" + + " -o /dev/null" + + " --plot_pdf_file /dev/null" + + " --intermediate_csv_file %s", + Arrays.asList("d1c38a3418979400630e2bca1140689c")); + executeTest("testBQSR-CSVfile", spec); + } + @Test public void testBQSRFailWithSolidNoCall() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index b415bb1f5..7ce98cf1d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -227,7 +227,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche */ public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) { - final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead); + final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) ); if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it RecalUtils.parsePlatformForRead(read, RAC); From a3f59325016b9ffb78f71941aa9fd54bb9e4b3d6 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 26 Nov 2012 11:12:27 -0500 Subject: [PATCH 81/90] Fixed null pointer exception in Integration Tests When running Utils.setupWriter with NO_PG_TAG set, the writer was attempting to create a program record with the null pointer. Fixed. --- public/java/src/org/broadinstitute/sting/utils/Utils.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b780d0966..544030f73 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -701,11 +701,13 @@ public class Utils { List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS ) newRecords.add(record); - newRecords.add(programRecord); - header.setProgramRecords(newRecords); + if (programRecord != null) { + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } return header; } From 4f7fa3009a4adb8617501f35d61730f62b6ca0b1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 26 Nov 2012 11:34:59 -0500 Subject: [PATCH 82/90] I forget why I thought that the VariantAnnotator couldn't run multi-threaded because it works just fine. Now you can specify -nt with VA. --- .../walkers/annotator/VariantAnnotator.java | 28 +++++++------------ .../VariantAnnotatorIntegrationTest.java | 8 +++++- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index c4de9ed45..92060b4a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -82,7 +82,7 @@ import java.util.*; @Allows(value={DataSource.READS, DataSource.REFERENCE}) @Reference(window=@Window(start=-50,stop=50)) @By(DataSource.REFERENCE) -public class VariantAnnotator extends RodWalker implements AnnotatorCompatible { +public class VariantAnnotator extends RodWalker implements AnnotatorCompatible, TreeReducible { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -275,14 +275,6 @@ public class VariantAnnotator extends RodWalker implements Ann return true; } - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - /** * We want reads that span deletions * @@ -323,15 +315,15 @@ public class VariantAnnotator extends RodWalker implements Ann return 1; } - /** - * Increment the number of loci processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of loci processed. - */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(Integer value, Integer sum) { return value + sum; } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 01dff0089..b097e3d34 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -151,7 +151,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { } @Test - public void testTabixAnnotations() { + public void testTabixAnnotationsAndParallelism() { final String MD5 = "99938d1e197b8f10c408cac490a00a62"; for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { WalkerTestSpec spec = new WalkerTestSpec( @@ -159,6 +159,12 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { Arrays.asList(MD5)); executeTest("Testing lookup vcf tabix vs. vcf tribble", spec); } + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -A HomopolymerRun -nt 2 --variant:vcf " + validationDataLocation + "CEU.exon.2010_03.sites.vcf -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, + Arrays.asList(MD5)); + + executeTest("Testing lookup vcf tabix vs. vcf tribble plus parallelism", spec); } @Test From c3b7dd1374ece9cb8bffedd6518f5d6d9582eec0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 26 Nov 2012 12:19:11 -0500 Subject: [PATCH 83/90] Misc cleanup in the HaplotypeCaller. Cleaning up unused arguments after recent changes to HC-GenotypingEngine --- .../haplotypecaller/GenotypingEngine.java | 4 +- .../haplotypecaller/HaplotypeCaller.java | 66 ++----------------- .../LikelihoodCalculationEngine.java | 1 - .../HaplotypeCallerIntegrationTest.java | 17 ++--- .../annotator/MappingQualityRankSumTest.java | 5 +- .../gatk/walkers/annotator/RankSumTest.java | 2 +- .../walkers/annotator/ReadPosRankSumTest.java | 2 +- 7 files changed, 21 insertions(+), 76 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index beec8a92e..4fc2dc8f7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -41,13 +41,11 @@ import java.util.*; public class GenotypingEngine { private final boolean DEBUG; - private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + public GenotypingEngine( final boolean DEBUG ) { this.DEBUG = DEBUG; - this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE; noCall.add(Allele.NO_CALL); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 2b739a321..24b3309f1 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -131,14 +131,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; - @Advanced - @Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false) - protected boolean GENOTYPE_FULL_ACTIVE_REGION = false; - - @Advanced - @Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false) - protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false; - @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) protected int gcpHMM = 10; @@ -248,10 +240,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); - simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); @@ -273,15 +266,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_PL_KEY); - // header lines for the experimental HaplotypeCaller-specific annotations - headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant")); - headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region")); - headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region")); - headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region")); - headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles")); - headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL")); - headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX")); - headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant")); // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used @@ -298,7 +282,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); + genotypingEngine = new GenotypingEngine( DEBUG ); } //--------------------------------------------------------------------------------------------------------------- @@ -428,43 +412,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - - if( !GENOTYPE_FULL_ACTIVE_REGION ) { - // add some custom annotations to the calls - - // Calculate the number of variants on the haplotype - int maxNumVar = 0; - for( final Allele allele : callResult.getFirst().getAlleles() ) { - if( !allele.isReference() ) { - for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { - final int numVar = haplotype.getEventMap().size(); - if( numVar > maxNumVar ) { maxNumVar = numVar; } - } - } - } - // Calculate the event length - int maxLength = 0; - for ( final Allele a : annotatedCall.getAlternateAlleles() ) { - final int length = a.length() - annotatedCall.getReference().length(); - if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } - } - - myAttributes.put("NVH", maxNumVar); - myAttributes.put("NumHapEval", bestHaplotypes.size()); - myAttributes.put("NumHapAssembly", haplotypes.size()); - myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); - myAttributes.put("EVENTLENGTH", maxLength); - myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); - myAttributes.put("extType", annotatedCall.getType().toString() ); - - //if( likelihoodCalculationEngine.haplotypeScore != null ) { - // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); - //} - if( annotatedCall.hasAttribute("QD") ) { - myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); - } - } - vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); } @@ -520,6 +467,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage + // BUGBUG: remove when positinal downsampler is hooked up to ART/HC if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { activeRegion.add(clippedRead); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 304f8d5cb..29622ca17 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -169,7 +169,6 @@ public class LikelihoodCalculationEngine { } // compute the diploid haplotype likelihoods - // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a57462d1d..007df3602 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,18 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "56aa4b84606b6b0b7dc78a383974d1b3"); + HCTest(CEUTRIO_BAM, "", "2b39732ff8e0de5bc2ae949aaf7a6f21"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f"); + HCTest(NA12878_BAM, "", "8b217638ff585effb9cc70e9a9aa544f"); } // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "f2d0309fdf50d5827e9c60ed0dd07e3f"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", + "541aa8291f03ba33bd1ad3d731fd5657"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -43,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd7170cbde7df04d4fbe1da7903c31c6"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -54,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "7fbc6b9e27e374f2ffe4be952d88c7c6"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "99456fc7207c1fe9f367a0d0afae87cd"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -65,7 +66,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6c1631785b3f832aecab1a99f0454762"); } @Test @@ -78,7 +79,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("96ab8253d242b851ccfc218759f79784")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("237601bbc39694c7413a332cbb656c8e")); executeTest("HCTestStructuralIndels: ", spec); } @@ -92,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("425f1a0fb00d7145edf1c55e54346fae")); + Arrays.asList("40bf739fb2b1743642498efe79ea6342")); executeTest("HC calling on a ReducedRead BAM", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 82596a501..2679a169b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn final List refQuals, final List altQuals) { if (pileup != null && likelihoodMap == null) { - // no per-read likelihoods available: + // old UG snp-only path through the annotations for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { @@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn } for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + // BUGBUG: There needs to be a comparable isUsableBase check here if (a.isNoCall()) continue; // read is non-informative if (a.isReference()) refQuals.add((double)el.getKey().getMappingQuality()); else if (allAlleles.contains(a)) altQuals.add((double)el.getKey().getMappingQuality()); - - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 0df7aff71..e7c0e6b14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR ReadBackedPileup pileup = null; - if (stratifiedContexts != null) { + if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context != null ) pileup = context.getBasePileup(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index d01233bb2..334b89f01 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio final List refQuals, final List altQuals) { if (alleleLikelihoodMap == null) { - // use fast SNP-based version if we don't have per-read allele likelihoods + // use old UG SNP-based version if we don't have per-read allele likelihoods for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); From 59cef880d195937e2e2eee9344a7bcc0d2ff016b Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 26 Nov 2012 12:20:07 -0500 Subject: [PATCH 84/90] Updating HC integration tests because experimental, HC-specific annotations have been removed. --- .../walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 007df3602..f8ba1f4cc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("788176e1717bd28fc7cbc8e3efbb6100")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec437d2d9f3ae07d155983be0155c8ed")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } From 405f3c675d9daa589942e830db0870931741f113 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 01:07:00 -0500 Subject: [PATCH 85/90] Fix for GSA-649: GenomeLocSortedSet.overlaps is crazy slow. Also improved GenomeLocSortedSet.sizeBeforeLoc. --- .../traversals/TraverseActiveRegions.java | 1 - .../sting/utils/GenomeLocSortedSet.java | 49 +++++++++++++++---- .../utils/GenomeLocSortedSetUnitTest.java | 36 ++++++++++++++ 3 files changed, 76 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index a2c37944a..3f20db0af 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -80,7 +80,6 @@ public class TraverseActiveRegions extends TraversalEngine { // our private storage for the GenomeLoc's private List mArray = new ArrayList(); + // cache this to make overlap checking much more efficient + private int previousOverlapSearchIndex = -1; + /** default constructor */ public GenomeLocSortedSet(GenomeLocParser parser) { this.genomeLocParser = parser; @@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet { * Return the number of bps before loc in the sorted set * * @param loc the location before which we are counting bases - * @return + * @return the number of base pairs over all previous intervals */ public long sizeBeforeLoc(GenomeLoc loc) { long s = 0; @@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( e.isBefore(loc) ) s += e.size(); else if ( e.isPast(loc) ) - ; // don't do anything + break; // we are done else // loc is inside of s s += loc.getStart() - e.getStart(); } @@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet { * Determine if the given loc overlaps any loc in the sorted set * * @param loc the location to test - * @return + * @return trip if the location overlaps any loc */ public boolean overlaps(final GenomeLoc loc) { - for(final GenomeLoc e : mArray) { - if(e.overlapsP(loc)) { - return true; - } + // edge condition + if ( mArray.isEmpty() ) + return false; + + // use the cached version first + if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) ) + return true; + + // update the cached index + previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); + + // if it matches an interval exactly, we are done + if ( previousOverlapSearchIndex > 0 ) + return true; + + // check whether it overlaps the interval before or after the insertion point + previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2); + return overlapsAtOrImmediatelyAfterCachedIndex(loc, false); + } + + private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) { + // check the cached entry + if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) ) + return true; + + // check the entry after the cached entry since we may have moved to it + boolean returnValue = false; + if ( previousOverlapSearchIndex < mArray.size() - 1 ) { + returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc); + if ( updateCachedIndex ) + previousOverlapSearchIndex++; } - return false; + + return returnValue; } /** @@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet { mArray.add(e); return true; } else { - int loc = Collections.binarySearch(mArray,e); + final int loc = Collections.binarySearch(mArray,e); if (loc >= 0) { throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); } else { diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java index 3d21e654f..6138e7396 100755 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeClass; @@ -117,6 +118,41 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(loc.getContigIndex() == 1); } + @Test + public void overlap() { + for ( int i = 1; i < 6; i++ ) { + final int start = i * 10; + mSortedSet.add(genomeLocParser.createGenomeLoc(contigOneName, start, start + 1)); + } + + // test matches in and around interval + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 9, 9))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 10, 10))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 11))); + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 12))); + + // test matches spanning intervals + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 14, 20))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 15))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 30, 40))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53))); + + // test miss + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 19))); + + // test exact match after miss + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 40, 41))); + + // test matches at beginning of intervals + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 5, 6))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 0, 10))); + + // test matches at end of intervals + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53))); + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53))); + } + @Test public void mergingOverlappingAbove() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50); From cc72aaefebfed89723c40403ae0dfd3a932ff78d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 01:11:23 -0500 Subject: [PATCH 86/90] Minor efficiency: use >= instead of > in test --- .../src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index ca1d385a2..394220106 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -149,7 +149,7 @@ public class GenomeLocSortedSet extends AbstractSet { previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); // if it matches an interval exactly, we are done - if ( previousOverlapSearchIndex > 0 ) + if ( previousOverlapSearchIndex >= 0 ) return true; // check whether it overlaps the interval before or after the insertion point From b1969a66bdcf755757517e77582b9d4e55caeb54 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 27 Nov 2012 08:24:41 -0500 Subject: [PATCH 87/90] Update docs --- .../gatk/walkers/fasta/FastaAlternateReferenceMaker.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index 2b9744b89..22c6097cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -47,6 +47,12 @@ import java.util.List; *

* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * + * The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). + * * Several important notes: * 1) if there are multiple variants that start at a site, it chooses one of them randomly. * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. From a6c1fcd151463e96b68aef72d7e8824730706330 Mon Sep 17 00:00:00 2001 From: kshakir Date: Thu, 29 Nov 2012 13:31:08 -0500 Subject: [PATCH 90/90] Removed default use of @Output syntax. If compile completes for QScripts, sending runtime errors during execute. --- .../sting/queue/QCommandLine.scala | 172 +++++++++--------- .../broadinstitute/sting/queue/QScript.scala | 35 +--- .../sting/queue/engine/QStatusMessenger.scala | 2 +- 3 files changed, 97 insertions(+), 112 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 637174557..f899af86d 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -110,95 +110,103 @@ class QCommandLine extends CommandLineProgram with Logging { * functions, and then builds and runs a QGraph based on the dependencies. */ def execute = { - ClassFieldCache.parsingEngine = this.parser + var success = false + var result = 1 + try { + ClassFieldCache.parsingEngine = this.parser - if (settings.qSettings.runName == null) - settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) - if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory)) - settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp") - qGraph.initializeWithSettings(settings) + if (settings.qSettings.runName == null) + settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) + if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory)) + settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp") + qGraph.initializeWithSettings(settings) - for (commandPlugin <- allCommandPlugins) { - loadArgumentsIntoObject(commandPlugin) - } - - for (commandPlugin <- allCommandPlugins) { - if (commandPlugin.statusMessenger != null) - commandPlugin.statusMessenger.started() - } - - qGraph.messengers = allCommandPlugins.filter(_.statusMessenger != null).map(_.statusMessenger).toSeq - - // TODO: Default command plugin argument? - val remoteFileConverter = ( - for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null)) - yield commandPlugin.remoteFileConverter - ).headOption.getOrElse(null) - - if (remoteFileConverter != null) - loadArgumentsIntoObject(remoteFileConverter) - - val allQScripts = qScriptPluginManager.createAllTypes() - for (script <- allQScripts) { - logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) - loadArgumentsIntoObject(script) - allCommandPlugins.foreach(_.initScript(script)) - // TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now. - //if (settings.run) - script.pullInputs() - script.qSettings = settings.qSettings - try { - script.script() - } catch { - case e: Exception => - throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e) + for (commandPlugin <- allCommandPlugins) { + loadArgumentsIntoObject(commandPlugin) } - if (remoteFileConverter != null) { - if (remoteFileConverter.convertToRemoteEnabled) - script.mkRemoteOutputs(remoteFileConverter) - } - - script.functions.foreach(qGraph.add(_)) - logger.info("Added " + script.functions.size + " functions") - } - // Execute the job graph - qGraph.run() - - val functionsAndStatus = qGraph.getFunctionsAndStatus - val success = qGraph.success - - // walk over each script, calling onExecutionDone - for (script <- allQScripts) { - val scriptFunctions = functionsAndStatus.filterKeys(f => script.functions.contains(f)) - script.onExecutionDone(scriptFunctions, success) - } - - logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size)) - - // write the final complete job report - logger.info("Writing final jobs report...") - qGraph.writeJobsReport() - - if (!success) { - logger.info("Done with errors") - qGraph.logFailed() - for (commandPlugin <- allCommandPlugins) + for (commandPlugin <- allCommandPlugins) { if (commandPlugin.statusMessenger != null) - commandPlugin.statusMessenger.exit("Done with errors: %s".format(qGraph.formattedStatusCounts)) - 1 - } else { - if (settings.run) { - allQScripts.foreach(_.pushOutputs()) - for (commandPlugin <- allCommandPlugins) - if (commandPlugin.statusMessenger != null) { - val allInputs = allQScripts.map(_.remoteInputs) - val allOutputs = allQScripts.map(_.remoteOutputs) - commandPlugin.statusMessenger.done(allInputs, allOutputs) - } + commandPlugin.statusMessenger.started() + } + + qGraph.messengers = allCommandPlugins.filter(_.statusMessenger != null).map(_.statusMessenger).toSeq + + // TODO: Default command plugin argument? + val remoteFileConverter = ( + for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null)) + yield commandPlugin.remoteFileConverter + ).headOption.getOrElse(null) + + if (remoteFileConverter != null) + loadArgumentsIntoObject(remoteFileConverter) + + val allQScripts = qScriptPluginManager.createAllTypes() + for (script <- allQScripts) { + logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) + loadArgumentsIntoObject(script) + allCommandPlugins.foreach(_.initScript(script)) + // TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now. + //if (settings.run) + script.pullInputs() + script.qSettings = settings.qSettings + try { + script.script() + } catch { + case e: Exception => + throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e) + } + + if (remoteFileConverter != null) { + if (remoteFileConverter.convertToRemoteEnabled) + script.mkRemoteOutputs(remoteFileConverter) + } + + script.functions.foreach(qGraph.add(_)) + logger.info("Added " + script.functions.size + " functions") + } + // Execute the job graph + qGraph.run() + + val functionsAndStatus = qGraph.getFunctionsAndStatus + + // walk over each script, calling onExecutionDone + for (script <- allQScripts) { + val scriptFunctions = functionsAndStatus.filterKeys(f => script.functions.contains(f)) + script.onExecutionDone(scriptFunctions, success) + } + + logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size)) + + // write the final complete job report + logger.info("Writing final jobs report...") + qGraph.writeJobsReport() + + if (qGraph.success) { + if (settings.run) { + allQScripts.foreach(_.pushOutputs()) + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) { + val allInputs = allQScripts.map(_.remoteInputs) + val allOutputs = allQScripts.map(_.remoteOutputs) + commandPlugin.statusMessenger.done(allInputs, allOutputs) + } + } + success = true + result = 0 + } + } finally { + if (!success) { + logger.info("Done with errors") + qGraph.logFailed() + if (settings.run) { + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.exit("Done with errors: %s".format(qGraph.formattedStatusCounts)) + } } - 0 } + result } /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index eb8be183a..5b67ae913 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -124,49 +124,26 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon } /** - * Pull all remote files to the local disk. + * Pull all remote files to the local disk */ def pullInputs() { - val inputs = ClassFieldCache.getFieldFiles(this, inputFields) - for (remoteFile <- filterRemoteFiles(inputs)) { - logger.info("Pulling %s from %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) - remoteFile.pullToLocal() - } } /** - * Push all remote files from the local disk. + * Push all remote files from the local disk */ def pushOutputs() { - val outputs = ClassFieldCache.getFieldFiles(this, outputFields) - for (remoteFile <- filterRemoteFiles(outputs)) { - logger.info("Pushing %s to %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) - remoteFile.pushToRemote() - } } /** - * List out the remote outputs - * @return the RemoteFile outputs by argument source + * @return the inputs or null if there are no inputs */ - def remoteInputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(inputFields)) + def remoteInputs: AnyRef = null /** - * List out the remote outputs - * @return the RemoteFile outputs by argument source + * @return the outputs or null if there are no outputs */ - def remoteOutputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(outputFields)) - - private def tagMap(remoteFieldMap: Map[ArgumentSource, Seq[RemoteFile]]): Map[String, Seq[RemoteFile]] = { - remoteFieldMap.collect{ case (k, v) => ClassFieldCache.fullName(k) -> v }.toMap - } - - private def remoteFieldMap(fields: Seq[ArgumentSource]): Map[ArgumentSource, Seq[RemoteFile]] = { - fields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap - } - - private def filterRemoteFiles(fields: Seq[File]): Seq[RemoteFile] = - fields.filter(field => field != null && field.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile]) + def remoteOutputs: AnyRef = null /** The complete list of fields. */ def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.getClass) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala index a1133b944..a69f68b8e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala @@ -7,7 +7,7 @@ import org.broadinstitute.sting.queue.util.RemoteFile */ trait QStatusMessenger { def started() - def done(inputs: Seq[Map[String, Seq[RemoteFile]]], outputs: Seq[Map[String, Seq[RemoteFile]]]) + def done(inputs: Seq[_], outputs: Seq[_]) def exit(message: String) def started(job: String)