Fixed "skipped intervals" bug on DiagnoseTargets

Problem
-------
Diagnose targets was skipping intervals when they were not covered by any reads.

Solution
--------
Rework the interval iteration logic to output all intervals as they're skipped over by the traversal, as well as adding a loop on traversal done to finish outputting intervals past the coverage of teh BAM file.

Summarized Changes
------------------
   * Outputs all intervals it iterates over, even if uncovered
   * Outputs leftover intervals in the end of the traversal
   * Updated integration tests

[fixes #47813825]
This commit is contained in:
Mauricio Carneiro 2013-04-20 00:28:35 -04:00
parent ff430c821e
commit cf7afc1ad4
2 changed files with 49 additions and 59 deletions

View File

@ -152,11 +152,14 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
@Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false) @Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false)
private boolean debug = false; private boolean debug = false;
private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // maps each interval => statistics private Map<GenomeLoc, IntervalStatistics> intervalMap = null; // maps each interval => statistics
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
private Set<String> samples = null; // all the samples being processed private Set<String> samples = null; // all the samples being processed
private final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
private ThresHolder thresholds = null; private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
private ThresHolder thresholds = null; // object that holds all the thresholds for Diagnose Targets (todo -- should become a plugin based system)
private static final int INITIAL_HASH_SIZE = 500000;
@Override @Override
public void initialize() { public void initialize() {
@ -165,24 +168,32 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
if (getToolkit().getIntervals() == null) if (getToolkit().getIntervals() == null)
throw new UserException("This tool only works if you provide one or more intervals. ( Use the -L argument )"); throw new UserException("This tool only works if you provide one or more intervals. ( Use the -L argument )");
thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold); thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage,
minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage,
badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold,
qualityStatusThreshold);
intervalMap = new HashMap<GenomeLoc, IntervalStatistics>(); intervalMap = new HashMap<GenomeLoc, IntervalStatistics>(INITIAL_HASH_SIZE);
intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator()); intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header // get all of the unique sample names for the VCF Header
vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); // initialize the VCF header samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples));
} }
@Override @Override
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
GenomeLoc refLocus = ref.getLocus(); GenomeLoc refLocus = ref.getLocus();
removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore // process and remove any intervals in the map that are don't overlap the current locus anymore
addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus // and add all new intervals that may overlap this reference locus
outputFinishedIntervals(refLocus, ref.getBase());
addNewOverlappingIntervals(refLocus);
// at this point, all intervals in intervalMap overlap with this locus, so update all of them
for (IntervalStatistics intervalStatistics : intervalMap.values()) for (IntervalStatistics intervalStatistics : intervalMap.values())
intervalStatistics.addLocus(context, ref, thresholds); // Add current locus to stats intervalStatistics.addLocus(context, ref, thresholds);
return 1L; return 1L;
} }
@ -212,53 +223,40 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
@Override @Override
public void onTraversalDone(Long result) { public void onTraversalDone(Long result) {
for (GenomeLoc interval : intervalMap.keySet()) for (GenomeLoc interval : intervalMap.keySet())
outputStatsToVCF(intervalMap.get(interval), Allele.create("A", true)); outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
}
private GenomeLoc getIntervalMapSpan() { GenomeLoc interval = intervalListIterator.peek();
GenomeLoc loc = null; while (interval != null) {
for (GenomeLoc interval : intervalMap.keySet()) { outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE);
if (loc == null) { intervalListIterator.next();
loc = interval; interval = intervalListIterator.peek();
} else
loc = interval.union(loc);
} }
return loc;
}
private GenomeLoc getFinishedIntervalSpan(GenomeLoc pos) {
GenomeLoc loc = null;
for (GenomeLoc interval : intervalMap.keySet()) {
if (interval.isBefore(pos)) {
if (loc == null)
loc = interval;
else
loc = interval.union(loc);
}
}
return loc;
} }
/** /**
* Removes all intervals that are behind the current reference locus from the intervalMap * Outputs all intervals that are behind the current reference locus
* *
* @param refLocus the current reference locus * @param refLocus the current reference locus
* @param refBase the reference allele * @param refBase the reference allele
*/ */
private void removePastIntervals(GenomeLoc refLocus, byte refBase) { private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
// if there are statistics to output/ check to see that we can output them in order GenomeLoc interval = intervalListIterator.peek();
if (getFinishedIntervalSpan(refLocus) != null &&
getIntervalMapSpan().getStart() == getFinishedIntervalSpan(refLocus).getStart()) {
for (GenomeLoc interval : intervalMap.keySet()) { // output empty statistics for uncovered intervals
if (interval.isBefore(refLocus)) { while (interval != null && interval.isBefore(refLocus)) {
outputStatsToVCF(intervalMap.get(interval), Allele.create(refBase, true)); final IntervalStatistics stats = intervalMap.get(interval);
intervalMap.remove(interval); outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE);
} if (stats != null) intervalMap.remove(interval);
intervalListIterator.next();
interval = intervalListIterator.peek();
}
// remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals)
for (GenomeLoc key : intervalMap.keySet()) {
if (key.isBefore(refLocus)) {
outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true));
intervalMap.remove(key);
} }
} }
} }
@ -269,17 +267,9 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
*/ */
private void addNewOverlappingIntervals(GenomeLoc refLocus) { private void addNewOverlappingIntervals(GenomeLoc refLocus) {
GenomeLoc interval = intervalListIterator.peek(); GenomeLoc interval = intervalListIterator.peek();
// skip any intervals with no coverage that we have passed
while (interval != null && interval.isBefore(refLocus)) {
intervalListIterator.next(); // discard the interval (we've already added it to the map)
interval = intervalListIterator.peek();
}
// add any intervals that overlap this one
while (interval != null && !interval.isPast(refLocus)) { while (interval != null && !interval.isPast(refLocus)) {
intervalMap.put(interval, createIntervalStatistic(interval)); intervalMap.put(interval, createIntervalStatistic(interval));
intervalListIterator.next(); // discard the interval (we've already added it to the map) intervalListIterator.next();
interval = intervalListIterator.peek(); interval = intervalListIterator.peek();
} }
} }

View File

@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest {
@Test(enabled = true) @Test(enabled = true)
public void testSingleSample() { public void testSingleSample() {
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9954b21163d3e66db232938ec509067f"); DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9b51561bcf248da70a4d711380b04f7b");
} }
@Test(enabled = true) @Test(enabled = true)
public void testMultiSample() { public void testMultiSample() {
DTTest("testMultiSample ", "-I " + multiSample, "7c5277261e8e9dd74666f04843ffb09c"); DTTest("testMultiSample ", "-I " + multiSample, "925f88f0c41c6a9ac479be34e052dc5d");
} }
} }