There are now three triggering options in the HaplotypeCaller: the default (mismatches, insertions, deletions, and high-quality soft clips), an external alleles file (from the UG, for example), or extended triggers, which include low-quality soft clips, bad mates, and unmapped mates. Added a better algorithm for band-pass filtering an ActivityProfile and for breaking active regions apart when they get too big. Greatly increased the specificity of the caller by tightening thresholds such as base quality and mapping quality for both the assembler and the likelihood function.
This commit is contained in:
parent
87e6bea6c1
commit
a4634624b7
|
|
@ -47,6 +47,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
|
|
||||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||||
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||||
|
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
|
||||||
|
|
||||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||||
int minStart = Integer.MAX_VALUE;
|
int minStart = Integer.MAX_VALUE;
|
||||||
|
|
@ -108,7 +109,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
||||||
// add these blocks of work to the work queue
|
// add these blocks of work to the work queue
|
||||||
// band-pass filter the list of isActive probabilities and turn into active regions
|
// band-pass filter the list of isActive probabilities and turn into active regions
|
||||||
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
|
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
|
||||||
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension );
|
final List<ActiveRegion> activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize );
|
||||||
|
|
||||||
// add active regions to queue of regions to process
|
// add active regions to queue of regions to process
|
||||||
workQueue.addAll( activeRegions );
|
workQueue.addAll( activeRegions );
|
||||||
|
|
|
||||||
|
|
@ -16,4 +16,5 @@ import java.lang.annotation.RetentionPolicy;
|
||||||
|
|
||||||
public @interface ActiveRegionExtension {
|
public @interface ActiveRegionExtension {
|
||||||
public int extension() default 0;
|
public int extension() default 0;
|
||||||
|
public int maxRegion() default 1500;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,7 @@ import org.broadinstitute.sting.commandline.IntervalBinding;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
|
import org.broadinstitute.sting.gatk.filters.*;
|
||||||
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
|
|
||||||
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
|
|
||||||
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
@ -33,8 +30,8 @@ import java.util.List;
|
||||||
@By(DataSource.READS)
|
@By(DataSource.READS)
|
||||||
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
|
@Requires({DataSource.READS, DataSource.REFERENCE_BASES})
|
||||||
@PartitionBy(PartitionType.READ)
|
@PartitionBy(PartitionType.READ)
|
||||||
@ActiveRegionExtension(extension=50)
|
@ActiveRegionExtension(extension=50,maxRegion=1500)
|
||||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
|
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class})
|
||||||
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
|
||||||
|
|
||||||
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
|
@Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false)
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ import java.util.ArrayList;
|
||||||
* Date: 1/4/12
|
* Date: 1/4/12
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class ActiveRegion implements HasGenomeLocation {
|
public class ActiveRegion implements HasGenomeLocation, Comparable<ActiveRegion> {
|
||||||
|
|
||||||
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
private final ArrayList<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||||
private final GenomeLoc activeRegionLoc;
|
private final GenomeLoc activeRegionLoc;
|
||||||
|
|
@ -73,6 +73,11 @@ public class ActiveRegion implements HasGenomeLocation {
|
||||||
Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases();
|
Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compareTo( final ActiveRegion other ) {
|
||||||
|
return this.getLocation().compareTo(other.getLocation());
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public GenomeLoc getLocation() { return activeRegionLoc; }
|
public GenomeLoc getLocation() { return activeRegionLoc; }
|
||||||
public GenomeLoc getExtendedLoc() { return extendedLoc; }
|
public GenomeLoc getExtendedLoc() { return extendedLoc; }
|
||||||
|
|
|
||||||
|
|
@ -24,8 +24,10 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.activeregion;
|
package org.broadinstitute.sting.utils.activeregion;
|
||||||
|
|
||||||
|
import org.apache.commons.lang.ArrayUtils;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
@ -45,8 +47,16 @@ public class ActivityProfile {
|
||||||
final boolean presetRegions;
|
final boolean presetRegions;
|
||||||
GenomeLoc regionStartLoc = null;
|
GenomeLoc regionStartLoc = null;
|
||||||
final List<Double> isActiveList;
|
final List<Double> isActiveList;
|
||||||
|
|
||||||
private GenomeLoc lastLoc = null;
|
private GenomeLoc lastLoc = null;
|
||||||
|
private static final int FILTER_SIZE = 65;
|
||||||
|
private static final Double[] GaussianKernel;
|
||||||
|
|
||||||
|
static {
|
||||||
|
GaussianKernel = new Double[2*FILTER_SIZE + 1];
|
||||||
|
for( int iii = 0; iii < 2*FILTER_SIZE + 1; iii++ ) {
|
||||||
|
GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 40.0, iii);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// todo -- add upfront the start and stop of the intervals
|
// todo -- add upfront the start and stop of the intervals
|
||||||
// todo -- check that no regions are unexpectedly missing
|
// todo -- check that no regions are unexpectedly missing
|
||||||
|
|
@ -85,15 +95,13 @@ public class ActivityProfile {
|
||||||
public ActivityProfile bandPassFilter() {
|
public ActivityProfile bandPassFilter() {
|
||||||
final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]);
|
final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]);
|
||||||
final Double[] filteredProbArray = new Double[activeProbArray.length];
|
final Double[] filteredProbArray = new Double[activeProbArray.length];
|
||||||
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // TODO: needs to be set-able by the walker author
|
if( !presetRegions ) {
|
||||||
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
|
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
|
||||||
double maxVal = 0;
|
final Double[] kernel = (Double[]) ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii));
|
||||||
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) {
|
final Double[] activeProbSubArray = (Double[]) ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1));
|
||||||
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
|
filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel);
|
||||||
}
|
}
|
||||||
filteredProbArray[iii] = maxVal;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc);
|
return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -102,9 +110,9 @@ public class ActivityProfile {
|
||||||
* @param activeRegionExtension
|
* @param activeRegionExtension
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public List<ActiveRegion> createActiveRegions( final int activeRegionExtension ) {
|
public List<ActiveRegion> createActiveRegions( final int activeRegionExtension, final int maxRegionSize ) {
|
||||||
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // TODO: needs to be set-able by the walker author
|
final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author
|
||||||
final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author
|
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
|
||||||
|
|
||||||
if( isActiveList.size() == 0 ) {
|
if( isActiveList.size() == 0 ) {
|
||||||
// no elements in the active list, just return an empty one
|
// no elements in the active list, just return an empty one
|
||||||
|
|
@ -112,25 +120,22 @@ public class ActivityProfile {
|
||||||
} else if( isActiveList.size() == 1 ) {
|
} else if( isActiveList.size() == 1 ) {
|
||||||
// there's a single element, it's either active or inactive
|
// there's a single element, it's either active or inactive
|
||||||
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
|
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
|
||||||
final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension );
|
returnList.addAll(createActiveRegion(isActive, 0, 0, activeRegionExtension, maxRegionSize));
|
||||||
return Collections.singletonList(region);
|
|
||||||
} else {
|
} else {
|
||||||
// there are 2+ elements, divide these up into regions
|
// there are 2+ elements, divide these up into regions
|
||||||
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
|
|
||||||
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
|
boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
|
||||||
int curStart = 0;
|
int curStart = 0;
|
||||||
for(int iii = 1; iii < isActiveList.size(); iii++ ) {
|
for(int iii = 1; iii < isActiveList.size(); iii++ ) {
|
||||||
final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD;
|
final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD;
|
||||||
if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
|
if( isActive != thisStatus ) {
|
||||||
returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) );
|
returnList.addAll(createActiveRegion(isActive, curStart, iii - 1, activeRegionExtension, maxRegionSize));
|
||||||
isActive = thisStatus;
|
isActive = thisStatus;
|
||||||
curStart = iii;
|
curStart = iii;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); // close out the current active region
|
returnList.addAll(createActiveRegion(isActive, curStart, isActiveList.size() - 1, activeRegionExtension, maxRegionSize)); // close out the current active region
|
||||||
|
|
||||||
return returnList;
|
|
||||||
}
|
}
|
||||||
|
return returnList;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -141,8 +146,25 @@ public class ActivityProfile {
|
||||||
* @param activeRegionExtension
|
* @param activeRegionExtension
|
||||||
* @return a fully initialized ActiveRegion with the above properties
|
* @return a fully initialized ActiveRegion with the above properties
|
||||||
*/
|
*/
|
||||||
private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) {
|
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) {
|
||||||
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
|
return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
|
||||||
return new ActiveRegion( loc, isActive, parser, activeRegionExtension );
|
}
|
||||||
|
private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List<ActiveRegion> returnList) {
|
||||||
|
if( !isActive || curEnd - curStart < maxRegionSize ) {
|
||||||
|
final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
|
||||||
|
returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension));
|
||||||
|
return returnList;
|
||||||
|
}
|
||||||
|
// find the best place to break up the large active region
|
||||||
|
Double minProb = Double.MAX_VALUE;
|
||||||
|
int cutPoint = -1;
|
||||||
|
for( int iii = curStart + 45; iii < curEnd - 45; iii++ ) { // BUGBUG: assumes maxRegionSize >> 45
|
||||||
|
if( isActiveList.get(iii) < minProb ) { minProb = isActiveList.get(iii); cutPoint = iii; }
|
||||||
|
}
|
||||||
|
final List<ActiveRegion> leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
|
||||||
|
final List<ActiveRegion> rightList = createActiveRegion(isActive, cutPoint, curEnd, activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
|
||||||
|
returnList.addAll( leftList );
|
||||||
|
returnList.addAll( rightList );
|
||||||
|
return returnList;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -32,8 +32,6 @@ public class PileupElement implements Comparable<PileupElement> {
|
||||||
protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base
|
protected final int eventLength; // what is the length of the event (insertion or deletion) *after* this base
|
||||||
protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
|
protected final String eventBases; // if it is a deletion, we do not have information about the actual deleted bases in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new pileup element.
|
* Creates a new pileup element.
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -38,7 +38,7 @@ public class CountReadsInActiveRegionsIntegrationTest extends WalkerTest {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
"-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s",
|
"-T CountReadsInActiveRegions -R " + b37KGReference + " -I " + b37GoodNA12878BAM + " -L 20:10,000,000-10,200,000 -o %s",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("fcd581aa6befe85c7297509fa7b34edf"));
|
Arrays.asList("1e9e8d637d2acde23fa99fe9dc07e3e2"));
|
||||||
executeTest("CountReadsInActiveRegions:", spec);
|
executeTest("CountReadsInActiveRegions:", spec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue