There are now three triggering options in the HaplotypeCaller: the default (mismatches, insertions, deletions, and high-quality soft clips), an external alleles file (from the UG, for example), or extended triggers, which include low-quality soft clips, bad mates, and unmapped mates. Added a better algorithm for band-pass filtering an ActivityProfile and for breaking active regions apart when they get too big. Greatly increased the specificity of the caller by battening down the hatches on things like base quality and mapping quality thresholds for both the assembler and the likelihood function.

This commit is contained in:
Ryan Poplin 2012-04-10 14:48:23 -04:00
parent 87e6bea6c1
commit a4634624b7
7 changed files with 57 additions and 33 deletions

View File

@ -47,6 +47,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
int minStart = Integer.MAX_VALUE; int minStart = Integer.MAX_VALUE;
@ -108,7 +109,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
// add these blocks of work to the work queue // add these blocks of work to the work queue
// band-pass filter the list of isActive probabilities and turn into active regions // band-pass filter the list of isActive probabilities and turn into active regions
final ActivityProfile bandPassFiltered = profile.bandPassFilter(); final ActivityProfile bandPassFiltered = profile.bandPassFilter();
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension ); final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
// add active regions to queue of regions to process // add active regions to queue of regions to process
workQueue.addAll( activeRegions ); workQueue.addAll( activeRegions );

View File

@ -16,4 +16,5 @@ import java.lang.annotation.RetentionPolicy;
/**
 * Per-walker settings for active-region traversal. The traversal engine reads this
 * annotation from the walker's class and forwards both values to the code that turns an
 * activity profile into active regions.
 */
public @interface ActiveRegionExtension {
    /**
     * Value handed to every ActiveRegion built for the walker.
     * NOTE(review): presumably the number of bases of padding added around each region -- confirm
     * against ActiveRegion's constructor.
     *
     * @return the extension, 0 when not specified
     */
    int extension() default 0;

    /**
     * Size limit for active regions: an active region whose (end - start) reaches this value
     * is split into smaller regions when the activity profile is converted to regions.
     *
     * @return the size limit, 1500 when not specified
     */
    int maxRegion() default 1500;
}

View File

@ -7,10 +7,7 @@ import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
@ -33,8 +30,8 @@ import java.util.List;
@By(DataSource.READS) @By(DataSource.READS)
@Requires({DataSource.READS, DataSource.REFERENCE_BASES}) @Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.READ) @PartitionBy(PartitionType.READ)
@ActiveRegionExtension(extension=50) @ActiveRegionExtension(extension=50,maxRegion=1500)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class})
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> { public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false) @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)

View File

@ -15,7 +15,7 @@ import java.util.ArrayList;
* Date: 1/4/12 * Date: 1/4/12
*/ */
public class ActiveRegion implements HasGenomeLocation { public class ActiveRegion implements HasGenomeLocation, Comparable<ActiveRegion> {
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>(); private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
private final GenomeLoc activeRegionLoc; private final GenomeLoc activeRegionLoc;
@ -73,6 +73,11 @@ public class ActiveRegion implements HasGenomeLocation {
Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases();
} }
/**
 * Orders active regions by their location, delegating entirely to the comparison
 * defined by GenomeLoc. The extended location and the reads are not consulted.
 *
 * @param other the region to compare against
 * @return the result of comparing this region's location with the other region's location
 */
@Override
public int compareTo( final ActiveRegion other ) {
    final GenomeLoc myLocation = getLocation();
    final GenomeLoc otherLocation = other.getLocation();
    return myLocation.compareTo(otherLocation);
}
@Override @Override
public GenomeLoc getLocation() { return activeRegionLoc; } public GenomeLoc getLocation() { return activeRegionLoc; }
public GenomeLoc getExtendedLoc() { return extendedLoc; } public GenomeLoc getExtendedLoc() { return extendedLoc; }

View File

@ -24,8 +24,10 @@
package org.broadinstitute.sting.utils.activeregion; package org.broadinstitute.sting.utils.activeregion;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList; import java.util.ArrayList;
@ -45,8 +47,16 @@ public class ActivityProfile {
final boolean presetRegions; final boolean presetRegions;
GenomeLoc regionStartLoc = null; GenomeLoc regionStartLoc = null;
final List<Double> isActiveList; final List<Double> isActiveList;
private GenomeLoc lastLoc = null; private GenomeLoc lastLoc = null;
// Half-width of the filter: the kernel spans FILTER_SIZE entries on each side of its centre.
private static final int FILTER_SIZE = 65;
// Filter weights, computed once when the class is loaded; holds 2*FILTER_SIZE + 1 entries.
private static final Double[] GaussianKernel;

static {
    final int kernelLength = 2*FILTER_SIZE + 1;
    final Double[] weights = new Double[kernelLength];
    int position = 0;
    while( position < kernelLength ) {
        // NOTE(review): arguments look like (mean, standard deviation, x), i.e. a curve centred
        // on index FILTER_SIZE with a spread of 40 -- confirm against MathUtils.NormalDistribution
        weights[position] = MathUtils.NormalDistribution(FILTER_SIZE, 40.0, position);
        position++;
    }
    GaussianKernel = weights;
}
// todo -- add upfront the start and stop of the intervals // todo -- add upfront the start and stop of the intervals
// todo -- check that no regions are unexpectedly missing // todo -- check that no regions are unexpectedly missing
@ -85,15 +95,13 @@ public class ActivityProfile {
public ActivityProfile bandPassFilter() { public ActivityProfile bandPassFilter() {
final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]); final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]);
final Double[] filteredProbArray = new Double[activeProbArray.length]; final Double[] filteredProbArray = new Double[activeProbArray.length];
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // TODO: needs to be set-able by the walker author if( !presetRegions ) {
for( int iii = 0; iii < activeProbArray.length; iii++ ) { for( int iii = 0; iii < activeProbArray.length; iii++ ) {
double maxVal = 0; final Double[] kernel = (Double[]) ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii));
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) { final Double[] activeProbSubArray = (Double[]) ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1));
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel);
} }
filteredProbArray[iii] = maxVal;
} }
return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc); return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc);
} }
@ -102,9 +110,9 @@ public class ActivityProfile {
* @param activeRegionExtension * @param activeRegionExtension
* @return * @return
*/ */
public List<ActiveRegion> createActiveRegions( final int activeRegionExtension ) { public List<ActiveRegion> createActiveRegions( final int activeRegionExtension, final int maxRegionSize ) {
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // TODO: needs to be set-able by the walker author final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author
final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
if( isActiveList.size() == 0 ) { if( isActiveList.size() == 0 ) {
// no elements in the active list, just return an empty one // no elements in the active list, just return an empty one
@ -112,25 +120,22 @@ public class ActivityProfile {
} else if( isActiveList.size() == 1 ) { } else if( isActiveList.size() == 1 ) {
// there's a single element, it's either active or inactive // there's a single element, it's either active or inactive
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension ); returnList.addAll(createActiveRegion(isActive, 0, 0, activeRegionExtension, maxRegionSize));
return Collections.singletonList(region);
} else { } else {
// there are 2+ elements, divide these up into regions // there are 2+ elements, divide these up into regions
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
int curStart = 0; int curStart = 0;
for(int iii = 1; iii < isActiveList.size(); iii++ ) { for(int iii = 1; iii < isActiveList.size(); iii++ ) {
final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD; final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD;
if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { if( isActive != thisStatus ) {
returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) ); returnList.addAll(createActiveRegion(isActive, curStart, iii - 1, activeRegionExtension, maxRegionSize));
isActive = thisStatus; isActive = thisStatus;
curStart = iii; curStart = iii;
} }
} }
returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); // close out the current active region returnList.addAll(createActiveRegion(isActive, curStart, isActiveList.size() - 1, activeRegionExtension, maxRegionSize)); // close out the current active region
return returnList;
} }
return returnList;
} }
/** /**
@ -141,8 +146,25 @@ public class ActivityProfile {
* @param activeRegionExtension * @param activeRegionExtension
* @return a fully initialized ActiveRegion with the above properties * @return a fully initialized ActiveRegion with the above properties
*/ */
private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) { private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) {
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
return new ActiveRegion( loc, isActive, parser, activeRegionExtension ); }
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List<ActiveRegion> returnList) {
if( !isActive || curEnd - curStart < maxRegionSize ) {
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension));
return returnList;
}
// find the best place to break up the large active region
Double minProb = Double.MAX_VALUE;
int cutPoint = -1;
for( int iii = curStart + 45; iii < curEnd - 45; iii++ ) { // BUGBUG: assumes maxRegionSize >> 45
if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; }
}
final List<ActiveRegion> leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
final List<ActiveRegion> rightList = createActiveRegion(isActive, cutPoint, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
returnList.addAll( leftList );
returnList.addAll( rightList );
return returnList;
} }
} }

View File

@ -32,8 +32,6 @@ public class PileupElement implements Comparable<PileupElement> {
protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base
protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
/** /**
* Creates a new pileup element. * Creates a new pileup element.
* *

View File

@ -38,7 +38,7 @@ public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
"-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s", "-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s",
1, 1,
Arrays.asList("fcd581aa6befe85c7297509fa7b34edf")); Arrays.asList("1e9e8d637d2acde23fa99fe9dc07e3e2"));
executeTest("CountReadsInActiveRegions:", spec); executeTest("CountReadsInActiveRegions:", spec);
} }
} }