Critical bug fix for the case of duplicate map calls in ActiveRegionWalkers with exome interval lists.
-- When consecutive intervals were within the bandpass filter size, the ActiveRegion traversal engine would create duplicate active regions. -- Now, when flushing the activity profile after jumping to a new interval, we remove the extra states that fall outside of the current interval. -- Added an integration test which ensures that the output VCF contains no duplicate records; this test was failing before this commit.
This commit is contained in:
parent
09edee2c97
commit
8a93bb687b
|
|
@ -794,7 +794,7 @@ public class GenotypingEngine {
|
|||
return vcs;
|
||||
}
|
||||
|
||||
private static class Event {
|
||||
protected static class Event {
|
||||
public VariantContext vc;
|
||||
|
||||
public Event( final VariantContext vc ) {
|
||||
|
|
|
|||
|
|
@ -46,11 +46,22 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broad.tribble.TribbleIndexedFeatureReader;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
|
|
@ -88,6 +99,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
"70bd5d0805bf6f51e5f61b377526c979");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerInsertionOnEdgeOfContig() {
|
||||
HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae");
|
||||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
|
|
@ -99,9 +115,41 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4141b4c24a136a3fe4c0b0a4c231cdfa");
|
||||
}
|
||||
|
||||
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
|
||||
try {
|
||||
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
|
||||
final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary());
|
||||
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) {
|
||||
if( containsDuplicateRecord(vcf, parser) ) {
|
||||
throw new IllegalStateException("Duplicate records detected but there should be none.");
|
||||
}
|
||||
}
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new IllegalStateException("Could not find the b37 reference file.");
|
||||
}
|
||||
}
|
||||
|
||||
private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) {
|
||||
final List<Pair<GenomeLoc, GenotypingEngine.Event>> VCs = new ArrayList<Pair<GenomeLoc, GenotypingEngine.Event>>();
|
||||
try {
|
||||
for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) {
|
||||
VCs.add(new Pair<GenomeLoc, GenotypingEngine.Event>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc)));
|
||||
}
|
||||
} catch( IOException e ) {
|
||||
throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read.");
|
||||
}
|
||||
|
||||
final Set<Pair<GenomeLoc, GenotypingEngine.Event>> VCsAsSet = new HashSet<Pair<GenomeLoc, GenotypingEngine.Event>>(VCs);
|
||||
return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events.
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerInsertionOnEdgeOfContig() {
|
||||
HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae");
|
||||
public void testHaplotypeCallerNearbySmallIntervals() {
|
||||
HCTestNearbySmallIntervals(NA12878_BAM, "", "b9d614efdaf38b87b459df421aab93a7");
|
||||
}
|
||||
|
||||
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper
|
||||
|
|
|
|||
|
|
@ -70,7 +70,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
protected final static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
protected final static boolean LOG_READ_CARRYING = false;
|
||||
|
||||
// set by the tranversal
|
||||
// set by the traversal
|
||||
private boolean walkerHasPresetRegions = false;
|
||||
private int activeRegionExtension = -1;
|
||||
private int maxRegionSize = -1;
|
||||
|
|
@ -112,7 +112,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma);
|
||||
|
||||
if ( walkerHasPresetRegions ) {
|
||||
// we load all of the preset locations into the
|
||||
// we load all of the preset locations into the workQueue
|
||||
for ( final GenomeLoc loc : this.walker.getPresetActiveRegions()) {
|
||||
workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension()));
|
||||
}
|
||||
|
|
@ -190,6 +190,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
* @param read the read we want to test if it's already been seen in the last shard
|
||||
* @return true if read would have appeared in the last shard, false otherwise
|
||||
*/
|
||||
@Requires({"read != null"})
|
||||
private boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) {
|
||||
if ( locOfLastReadAtTraversalStart == null )
|
||||
// we're in the first shard, so obviously the answer is no
|
||||
|
|
@ -285,6 +286,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
*
|
||||
* @param read the last read we've seen during the traversal
|
||||
*/
|
||||
@Requires({"read != null"})
|
||||
protected void rememberLastReadLocation(final GATKSAMRecord read) {
|
||||
final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read);
|
||||
if ( spanOfLastReadSeen == null )
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ import java.util.*;
|
|||
* @since Date created
|
||||
*/
|
||||
public class ActivityProfile {
|
||||
private final static int MAX_PROB_PROPOGATION_DISTANCE = 50;
|
||||
private final static int MAX_PROB_PROPAGATION_DISTANCE = 50;
|
||||
protected final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author
|
||||
|
||||
protected final List<ActivityProfileState> stateList;
|
||||
|
|
@ -98,7 +98,7 @@ public class ActivityProfile {
|
|||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int getMaxProbPropagationDistance() {
|
||||
return MAX_PROB_PROPOGATION_DISTANCE;
|
||||
return MAX_PROB_PROPAGATION_DISTANCE;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -210,7 +210,7 @@ public class ActivityProfile {
|
|||
contigLength = parser.getContigInfo(regionStartLoc.getContig()).getSequenceLength();
|
||||
} else {
|
||||
if ( regionStopLoc.getStart() != loc.getStart() - 1 )
|
||||
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc );
|
||||
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediately after last loc " + regionStopLoc );
|
||||
regionStopLoc = loc;
|
||||
}
|
||||
|
||||
|
|
@ -239,7 +239,7 @@ public class ActivityProfile {
|
|||
throw new IllegalArgumentException("Must add state contiguous to existing states: adding " + stateToAdd);
|
||||
|
||||
if ( position >= 0 ) {
|
||||
// ignore states starting before this regions start
|
||||
// ignore states starting before this region's start
|
||||
if ( position < size() ) {
|
||||
stateList.get(position).isActiveProb += stateToAdd.isActiveProb;
|
||||
} else {
|
||||
|
|
@ -352,6 +352,12 @@ public class ActivityProfile {
|
|||
if ( stateList.isEmpty() )
|
||||
return null;
|
||||
|
||||
// If we are flushing the activity profile we need to trim off the excess states so that we don't create regions outside of our current processing interval
|
||||
if( forceConversion ) {
|
||||
final List<ActivityProfileState> statesToTrimAway = new ArrayList<ActivityProfileState>(stateList.subList(getSpan().size(), stateList.size()));
|
||||
stateList.removeAll(statesToTrimAway);
|
||||
}
|
||||
|
||||
final ActivityProfileState first = stateList.get(0);
|
||||
final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD;
|
||||
final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, minRegionSize, maxRegionSize, forceConversion);
|
||||
|
|
|
|||
Loading…
Reference in New Issue