Fixed "skipped intervals" bug on DiagnoseTargets
Problem ------- Diagnose targets was skipping intervals when they were not covered by any reads. Solution -------- Rework the interval iteration logic to output all intervals as they're skipped over by the traversal, as well as adding a loop on traversal done to finish outputting intervals past the coverage of teh BAM file. Summarized Changes ------------------ * Outputs all intervals it iterates over, even if uncovered * Outputs leftover intervals in the end of the traversal * Updated integration tests [fixes #47813825]
This commit is contained in:
parent
ff430c821e
commit
cf7afc1ad4
|
|
@ -152,11 +152,14 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
@Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false)
|
||||
private boolean debug = false;
|
||||
|
||||
private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // maps each interval => statistics
|
||||
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
|
||||
private Set<String> samples = null; // all the samples being processed
|
||||
private final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
|
||||
private ThresHolder thresholds = null;
|
||||
private Map<GenomeLoc, IntervalStatistics> intervalMap = null; // maps each interval => statistics
|
||||
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
|
||||
private Set<String> samples = null; // all the samples being processed
|
||||
private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
|
||||
private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
|
||||
private ThresHolder thresholds = null; // object that holds all the thresholds for Diagnose Targets (todo -- should become a plugin based system)
|
||||
|
||||
private static final int INITIAL_HASH_SIZE = 500000;
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
|
|
@ -165,24 +168,32 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
if (getToolkit().getIntervals() == null)
|
||||
throw new UserException("This tool only works if you provide one or more intervals. ( Use the -L argument )");
|
||||
|
||||
thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold);
|
||||
thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage,
|
||||
minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage,
|
||||
badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold,
|
||||
qualityStatusThreshold);
|
||||
|
||||
intervalMap = new HashMap<GenomeLoc, IntervalStatistics>();
|
||||
intervalMap = new HashMap<GenomeLoc, IntervalStatistics>(INITIAL_HASH_SIZE);
|
||||
intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
|
||||
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header
|
||||
vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); // initialize the VCF header
|
||||
// get all of the unique sample names for the VCF Header
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
GenomeLoc refLocus = ref.getLocus();
|
||||
|
||||
removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore
|
||||
addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus
|
||||
// process and remove any intervals in the map that are don't overlap the current locus anymore
|
||||
// and add all new intervals that may overlap this reference locus
|
||||
outputFinishedIntervals(refLocus, ref.getBase());
|
||||
addNewOverlappingIntervals(refLocus);
|
||||
|
||||
// at this point, all intervals in intervalMap overlap with this locus, so update all of them
|
||||
for (IntervalStatistics intervalStatistics : intervalMap.values())
|
||||
intervalStatistics.addLocus(context, ref, thresholds); // Add current locus to stats
|
||||
intervalStatistics.addLocus(context, ref, thresholds);
|
||||
|
||||
|
||||
return 1L;
|
||||
}
|
||||
|
|
@ -212,53 +223,40 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
@Override
|
||||
public void onTraversalDone(Long result) {
|
||||
for (GenomeLoc interval : intervalMap.keySet())
|
||||
outputStatsToVCF(intervalMap.get(interval), Allele.create("A", true));
|
||||
}
|
||||
outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
|
||||
|
||||
private GenomeLoc getIntervalMapSpan() {
|
||||
GenomeLoc loc = null;
|
||||
for (GenomeLoc interval : intervalMap.keySet()) {
|
||||
if (loc == null) {
|
||||
loc = interval;
|
||||
} else
|
||||
loc = interval.union(loc);
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
while (interval != null) {
|
||||
outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE);
|
||||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
return loc;
|
||||
}
|
||||
|
||||
private GenomeLoc getFinishedIntervalSpan(GenomeLoc pos) {
|
||||
GenomeLoc loc = null;
|
||||
for (GenomeLoc interval : intervalMap.keySet()) {
|
||||
if (interval.isBefore(pos)) {
|
||||
if (loc == null)
|
||||
loc = interval;
|
||||
else
|
||||
loc = interval.union(loc);
|
||||
}
|
||||
}
|
||||
|
||||
return loc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all intervals that are behind the current reference locus from the intervalMap
|
||||
* Outputs all intervals that are behind the current reference locus
|
||||
*
|
||||
* @param refLocus the current reference locus
|
||||
* @param refBase the reference allele
|
||||
*/
|
||||
private void removePastIntervals(GenomeLoc refLocus, byte refBase) {
|
||||
// if there are statistics to output/ check to see that we can output them in order
|
||||
if (getFinishedIntervalSpan(refLocus) != null &&
|
||||
getIntervalMapSpan().getStart() == getFinishedIntervalSpan(refLocus).getStart()) {
|
||||
private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
|
||||
for (GenomeLoc interval : intervalMap.keySet()) {
|
||||
if (interval.isBefore(refLocus)) {
|
||||
outputStatsToVCF(intervalMap.get(interval), Allele.create(refBase, true));
|
||||
intervalMap.remove(interval);
|
||||
}
|
||||
// output empty statistics for uncovered intervals
|
||||
while (interval != null && interval.isBefore(refLocus)) {
|
||||
final IntervalStatistics stats = intervalMap.get(interval);
|
||||
outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE);
|
||||
if (stats != null) intervalMap.remove(interval);
|
||||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
// remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals)
|
||||
for (GenomeLoc key : intervalMap.keySet()) {
|
||||
if (key.isBefore(refLocus)) {
|
||||
outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true));
|
||||
intervalMap.remove(key);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -269,17 +267,9 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
*/
|
||||
private void addNewOverlappingIntervals(GenomeLoc refLocus) {
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
|
||||
// skip any intervals with no coverage that we have passed
|
||||
while (interval != null && interval.isBefore(refLocus)) {
|
||||
intervalListIterator.next(); // discard the interval (we've already added it to the map)
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
// add any intervals that overlap this one
|
||||
while (interval != null && !interval.isPast(refLocus)) {
|
||||
intervalMap.put(interval, createIntervalStatistic(interval));
|
||||
intervalListIterator.next(); // discard the interval (we've already added it to the map)
|
||||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testSingleSample() {
|
||||
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9954b21163d3e66db232938ec509067f");
|
||||
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "9b51561bcf248da70a4d711380b04f7b");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultiSample() {
|
||||
DTTest("testMultiSample ", "-I " + multiSample, "7c5277261e8e9dd74666f04843ffb09c");
|
||||
DTTest("testMultiSample ", "-I " + multiSample, "925f88f0c41c6a9ac479be34e052dc5d");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue