From cf7afc1ad4476d069744858b2bbbcf9bb6a650e7 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 20 Apr 2013 00:28:35 -0400 Subject: [PATCH] Fixed "skipped intervals" bug on DiagnoseTargets Problem ------- Diagnose targets was skipping intervals when they were not covered by any reads. Solution -------- Rework the interval iteration logic to output all intervals as they're skipped over by the traversal, as well as adding a loop on traversal done to finish outputting intervals past the coverage of teh BAM file. Summarized Changes ------------------ * Outputs all intervals it iterates over, even if uncovered * Outputs leftover intervals in the end of the traversal * Updated integration tests [fixes #47813825] --- .../diagnostics/targets/DiagnoseTargets.java | 104 ++++++++---------- .../DiagnoseTargetsIntegrationTest.java | 4 +- 2 files changed, 49 insertions(+), 59 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index b302a967c..33c7e7a1b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -152,11 +152,14 @@ public class DiagnoseTargets extends LocusWalker { @Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false) private boolean debug = false; - private HashMap intervalMap = null; // maps each interval => statistics - private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome - private Set samples = null; // all the samples being processed - private final Allele SYMBOLIC_ALLELE = Allele.create("
", false); // avoid creating the symbolic allele multiple times - private ThresHolder thresholds = null; + private Map intervalMap = null; // maps each interval => statistics + private PeekableIterator intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome + private Set samples = null; // all the samples being processed + private static final Allele SYMBOLIC_ALLELE = Allele.create("
", false); // avoid creating the symbolic allele multiple times + private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times + private ThresHolder thresholds = null; // object that holds all the thresholds for Diagnose Targets (todo -- should become a plugin based system) + + private static final int INITIAL_HASH_SIZE = 500000; @Override public void initialize() { @@ -165,24 +168,32 @@ public class DiagnoseTargets extends LocusWalker { if (getToolkit().getIntervals() == null) throw new UserException("This tool only works if you provide one or more intervals. ( Use the -L argument )"); - thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold); + thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, + minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, + badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, + qualityStatusThreshold); - intervalMap = new HashMap(); + intervalMap = new HashMap(INITIAL_HASH_SIZE); intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header - vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); // initialize the VCF header + // get all of the unique sample names for the VCF Header + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); } @Override public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { GenomeLoc refLocus = ref.getLocus(); - removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore - addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus + // process and remove any intervals in the map that are don't overlap the current locus anymore + // and add all new intervals that may overlap this reference locus + outputFinishedIntervals(refLocus, ref.getBase()); + addNewOverlappingIntervals(refLocus); + // at this point, all intervals in intervalMap overlap with this locus, so update all of them for (IntervalStatistics intervalStatistics : intervalMap.values()) - intervalStatistics.addLocus(context, ref, thresholds); // Add current locus to stats + intervalStatistics.addLocus(context, ref, thresholds); + return 1L; } @@ -212,53 +223,40 @@ public class DiagnoseTargets extends LocusWalker { @Override public void onTraversalDone(Long result) { for (GenomeLoc interval : intervalMap.keySet()) - outputStatsToVCF(intervalMap.get(interval), Allele.create("A", true)); - } + outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE); - private GenomeLoc getIntervalMapSpan() { - GenomeLoc loc = null; - for (GenomeLoc interval : intervalMap.keySet()) { - if (loc == null) { - loc = interval; - } else - loc = interval.union(loc); + GenomeLoc interval = intervalListIterator.peek(); + while (interval != null) { + outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE); + intervalListIterator.next(); + interval = intervalListIterator.peek(); } - - return loc; - } - - private GenomeLoc getFinishedIntervalSpan(GenomeLoc pos) { - GenomeLoc loc = null; - for (GenomeLoc interval : intervalMap.keySet()) { - if (interval.isBefore(pos)) { - if (loc == null) - loc = interval; - else - loc = interval.union(loc); - } - } - - return loc; } /** - * Removes all intervals that are behind the current reference locus from the intervalMap + * Outputs all intervals that are behind the current reference locus * * @param refLocus the current reference locus * @param refBase the reference allele */ - private void removePastIntervals(GenomeLoc refLocus, byte refBase) { - // if there are statistics to output/ check to see that we can output them in order - if (getFinishedIntervalSpan(refLocus) != null && - getIntervalMapSpan().getStart() == getFinishedIntervalSpan(refLocus).getStart()) { + private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) { + GenomeLoc interval = intervalListIterator.peek(); - for (GenomeLoc interval : intervalMap.keySet()) { - if (interval.isBefore(refLocus)) { - outputStatsToVCF(intervalMap.get(interval), Allele.create(refBase, true)); - intervalMap.remove(interval); - } + // output empty statistics for uncovered intervals + while (interval != null && interval.isBefore(refLocus)) { + final IntervalStatistics stats = intervalMap.get(interval); + outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE); + if (stats != null) intervalMap.remove(interval); + intervalListIterator.next(); + interval = intervalListIterator.peek(); + } + + // remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals) + for (GenomeLoc key : intervalMap.keySet()) { + if (key.isBefore(refLocus)) { + outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true)); + intervalMap.remove(key); } - } } @@ -269,17 +267,9 @@ public class DiagnoseTargets extends LocusWalker { */ private void addNewOverlappingIntervals(GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); - - // skip any intervals with no coverage that we have passed - while (interval != null && interval.isBefore(refLocus)) { - intervalListIterator.next(); // discard the interval (we've already added it to the map) - interval = intervalListIterator.peek(); - } - - // add any intervals that overlap this one while (interval != null && !interval.isPast(refLocus)) { intervalMap.put(interval, createIntervalStatistic(interval)); - intervalListIterator.next(); // discard the interval (we've already added it to the map) + intervalListIterator.next(); interval = intervalListIterator.peek(); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java index 6a52a42e5..2875e10d7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargetsIntegrationTest.java @@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9954b21163d3e66db232938ec509067f"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9b51561bcf248da70a4d711380b04f7b"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "7c5277261e8e9dd74666f04843ffb09c"); + DTTest("testMultiSample ", "-I " + multiSample, "925f88f0c41c6a9ac479be34e052dc5d"); } }