There are now three triggering options in the HaplotypeCaller. The default (mismatches, insertions, deletions, high quality soft clips), an external alleles file (from the UG for example), or extended triggers which include low quality soft clips, bad mates and unmapped mates. Added better algorithm for band pass filtering an ActivityProfile and breaking them apart when they get too big. Greatly increased the specificity of the caller by battening down the hatches on things like base quality and mapping quality thresholds for both the assembler and the likelihood function.

This commit is contained in:
Ryan Poplin 2012-04-10 14:48:23 -04:00
parent 87e6bea6c1
commit a4634624b7
7 changed files with 57 additions and 33 deletions

View File

@ -47,6 +47,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
int minStart = Integer.MAX_VALUE;
@ -108,7 +109,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
// add these blocks of work to the work queue
// band-pass filter the list of isActive probabilities and turn into active regions
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension );
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
// add active regions to queue of regions to process
workQueue.addAll( activeRegions );

View File

@ -16,4 +16,5 @@ import java.lang.annotation.RetentionPolicy;
public @interface ActiveRegionExtension {
public int extension() default 0;
public int maxRegion() default 1500;
}

View File

@ -7,10 +7,7 @@ import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -33,8 +30,8 @@ import java.util.List;
@By(DataSource.READS)
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.READ)
@ActiveRegionExtension(extension=50)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
@ActiveRegionExtension(extension=50,maxRegion=1500)
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class})
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)

View File

@ -15,7 +15,7 @@ import java.util.ArrayList;
* Date: 1/4/12
*/
public class ActiveRegion implements HasGenomeLocation {
public class ActiveRegion implements HasGenomeLocation, Comparable<ActiveRegion> {
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
private final GenomeLoc activeRegionLoc;
@ -73,6 +73,11 @@ public class ActiveRegion implements HasGenomeLocation {
Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases();
}
@Override
public int compareTo( final ActiveRegion other ) {
return this.getLocation().compareTo(other.getLocation());
}
@Override
public GenomeLoc getLocation() { return activeRegionLoc; }
public GenomeLoc getExtendedLoc() { return extendedLoc; }

View File

@ -24,8 +24,10 @@
package org.broadinstitute.sting.utils.activeregion;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
@ -45,8 +47,16 @@ public class ActivityProfile {
final boolean presetRegions;
GenomeLoc regionStartLoc = null;
final List<Double> isActiveList;
private GenomeLoc lastLoc = null;
private static final int FILTER_SIZE = 65;
private static final Double[] GaussianKernel;
static {
GaussianKernel = new Double[2*FILTER_SIZE + 1];
for( int iii = 0; iii < 2*FILTER_SIZE + 1; iii++ ) {
GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 40.0, iii);
}
}
// todo -- add upfront the start and stop of the intervals
// todo -- check that no regions are unexpectedly missing
@ -85,15 +95,13 @@ public class ActivityProfile {
public ActivityProfile bandPassFilter() {
final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]);
final Double[] filteredProbArray = new Double[activeProbArray.length];
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // TODO: needs to be set-able by the walker author
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
double maxVal = 0;
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) {
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
if( !presetRegions ) {
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
final Double[] kernel = (Double[]) ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii));
final Double[] activeProbSubArray = (Double[]) ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1));
filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel);
}
filteredProbArray[iii] = maxVal;
}
return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc);
}
@ -102,9 +110,9 @@ public class ActivityProfile {
* @param activeRegionExtension
* @return
*/
public List<ActiveRegion> createActiveRegions( final int activeRegionExtension ) {
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // TODO: needs to be set-able by the walker author
final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author
public List<ActiveRegion> createActiveRegions( final int activeRegionExtension, final int maxRegionSize ) {
final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
if( isActiveList.size() == 0 ) {
// no elements in the active list, just return an empty one
@ -112,25 +120,22 @@ public class ActivityProfile {
} else if( isActiveList.size() == 1 ) {
// there's a single element, it's either active or inactive
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension );
return Collections.singletonList(region);
returnList.addAll(createActiveRegion(isActive, 0, 0, activeRegionExtension, maxRegionSize));
} else {
// there are 2+ elements, divide these up into regions
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
int curStart = 0;
for(int iii = 1; iii < isActiveList.size(); iii++ ) {
final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD;
if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) );
if( isActive != thisStatus ) {
returnList.addAll(createActiveRegion(isActive, curStart, iii - 1, activeRegionExtension, maxRegionSize));
isActive = thisStatus;
curStart = iii;
}
}
returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); // close out the current active region
return returnList;
returnList.addAll(createActiveRegion(isActive, curStart, isActiveList.size() - 1, activeRegionExtension, maxRegionSize)); // close out the current active region
}
return returnList;
}
/**
@ -141,8 +146,25 @@ public class ActivityProfile {
* @param activeRegionExtension
* @return a fully initialized ActiveRegion with the above properties
*/
private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) {
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
return new ActiveRegion( loc, isActive, parser, activeRegionExtension );
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) {
return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
}
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List<ActiveRegion> returnList) {
if( !isActive || curEnd - curStart < maxRegionSize ) {
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension));
return returnList;
}
// find the best place to break up the large active region
Double minProb = Double.MAX_VALUE;
int cutPoint = -1;
for( int iii = curStart + 45; iii < curEnd - 45; iii++ ) { // BUGBUG: assumes maxRegionSize >> 45
if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; }
}
final List<ActiveRegion> leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
final List<ActiveRegion> rightList = createActiveRegion(isActive, cutPoint, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
returnList.addAll( leftList );
returnList.addAll( rightList );
return returnList;
}
}

View File

@ -32,8 +32,6 @@ public class PileupElement implements Comparable<PileupElement> {
protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base
protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
/**
* Creates a new pileup element.
*

View File

@ -38,7 +38,7 @@ public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s",
1,
Arrays.asList("fcd581aa6befe85c7297509fa7b34edf"));
Arrays.asList("1e9e8d637d2acde23fa99fe9dc07e3e2"));
executeTest("CountReadsInActiveRegions:", spec);
}
}