Critical bug fix for the case of duplicate map calls in ActiveRegionWalkers with exome interval lists.

-- When consecutive intervals were within the bandpass filter size the ActiveRegion traversal engine would create
duplicate active regions.
-- Now when flushing the activity profile after we jump to a new interval we remove the extra states which are outside
of the current interval.
-- Added integration test which ensures that the output VCF contains no duplicate records. Was failing test before this commit.
This commit is contained in:
Ryan Poplin 2013-04-01 14:28:20 -04:00
parent 09edee2c97
commit 8a93bb687b
4 changed files with 67 additions and 11 deletions

View File

@ -794,7 +794,7 @@ public class GenotypingEngine {
return vcs;
}
private static class Event {
protected static class Event {
public VariantContext vc;
public Event( final VariantContext vc ) {

View File

@ -46,11 +46,22 @@
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broad.tribble.TribbleIndexedFeatureReader;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFCodec;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.Collections;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String REF = b37KGReference;
@ -88,6 +99,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
"70bd5d0805bf6f51e5f61b377526c979");
}
@Test
public void testHaplotypeCallerInsertionOnEdgeOfContig() {
HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
@ -99,9 +115,41 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "4141b4c24a136a3fe4c0b0a4c231cdfa");
}
private void HCTestNearbySmallIntervals(String bam, String args, String md5) {
try {
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary());
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) {
if( containsDuplicateRecord(vcf, parser) ) {
throw new IllegalStateException("Duplicate records detected but there should be none.");
}
}
} catch( FileNotFoundException e ) {
throw new IllegalStateException("Could not find the b37 reference file.");
}
}
private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) {
final List<Pair<GenomeLoc, GenotypingEngine.Event>> VCs = new ArrayList<Pair<GenomeLoc, GenotypingEngine.Event>>();
try {
for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) {
VCs.add(new Pair<GenomeLoc, GenotypingEngine.Event>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc)));
}
} catch( IOException e ) {
throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read.");
}
final Set<Pair<GenomeLoc, GenotypingEngine.Event>> VCsAsSet = new HashSet<Pair<GenomeLoc, GenotypingEngine.Event>>(VCs);
return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events.
}
@Test
public void testHaplotypeCallerInsertionOnEdgeOfContig() {
HCTest(CEUTRIO_MT_TEST_BAM, "-dcov 90 -L MT:1-10", "7f1fb8f9587f64643f6612ef1dd6d4ae");
public void testHaplotypeCallerNearbySmallIntervals() {
HCTestNearbySmallIntervals(NA12878_BAM, "", "b9d614efdaf38b87b459df421aab93a7");
}
// This problem bam came from a user on the forum and it spotted a problem where the ReadClipper

View File

@ -70,7 +70,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
protected final static Logger logger = Logger.getLogger(TraversalEngine.class);
protected final static boolean LOG_READ_CARRYING = false;
// set by the tranversal
// set by the traversal
private boolean walkerHasPresetRegions = false;
private int activeRegionExtension = -1;
private int maxRegionSize = -1;
@ -112,7 +112,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma);
if ( walkerHasPresetRegions ) {
// we load all of the preset locations into the
// we load all of the preset locations into the workQueue
for ( final GenomeLoc loc : this.walker.getPresetActiveRegions()) {
workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension()));
}
@ -190,6 +190,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
* @param read the read we want to test if it's already been seen in the last shard
* @return true if read would have appeared in the last shard, false otherwise
*/
@Requires({"read != null"})
private boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) {
if ( locOfLastReadAtTraversalStart == null )
// we're in the first shard, so obviously the answer is no
@ -285,6 +286,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
*
* @param read the last read we've seen during the traversal
*/
@Requires({"read != null"})
protected void rememberLastReadLocation(final GATKSAMRecord read) {
final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read);
if ( spanOfLastReadSeen == null )

View File

@ -41,7 +41,7 @@ import java.util.*;
* @since Date created
*/
public class ActivityProfile {
private final static int MAX_PROB_PROPOGATION_DISTANCE = 50;
private final static int MAX_PROB_PROPAGATION_DISTANCE = 50;
protected final static double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author
protected final List<ActivityProfileState> stateList;
@ -98,7 +98,7 @@ public class ActivityProfile {
*/
@Ensures("result >= 0")
public int getMaxProbPropagationDistance() {
return MAX_PROB_PROPOGATION_DISTANCE;
return MAX_PROB_PROPAGATION_DISTANCE;
}
/**
@ -210,7 +210,7 @@ public class ActivityProfile {
contigLength = parser.getContigInfo(regionStartLoc.getContig()).getSequenceLength();
} else {
if ( regionStopLoc.getStart() != loc.getStart() - 1 )
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc );
throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediately after last loc " + regionStopLoc );
regionStopLoc = loc;
}
@ -239,7 +239,7 @@ public class ActivityProfile {
throw new IllegalArgumentException("Must add state contiguous to existing states: adding " + stateToAdd);
if ( position >= 0 ) {
// ignore states starting before this regions start
// ignore states starting before this region's start
if ( position < size() ) {
stateList.get(position).isActiveProb += stateToAdd.isActiveProb;
} else {
@ -352,6 +352,12 @@ public class ActivityProfile {
if ( stateList.isEmpty() )
return null;
// If we are flushing the activity profile we need to trim off the excess states so that we don't create regions outside of our current processing interval
if( forceConversion ) {
final List<ActivityProfileState> statesToTrimAway = new ArrayList<ActivityProfileState>(stateList.subList(getSpan().size(), stateList.size()));
stateList.removeAll(statesToTrimAway);
}
final ActivityProfileState first = stateList.get(0);
final boolean isActiveRegion = first.isActiveProb > ACTIVE_PROB_THRESHOLD;
final int offsetOfNextRegionEnd = findEndOfRegion(isActiveRegion, minRegionSize, maxRegionSize, forceConversion);