Updating ActiveRegionWalker interface to output a probability of active status instead of a boolean. Integrator runs a band-pass filter over this probability to produce actual active regions. First version of HaplotypeCaller which decides for itself where to trigger and assembles those regions.

This commit is contained in:
Ryan Poplin 2012-01-26 11:37:08 -05:00
parent 7a26fcb86f
commit 390d493049
3 changed files with 52 additions and 44 deletions

View File

@ -10,14 +10,12 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.ArrayList; import java.util.*;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.Queue;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
@ -54,7 +52,8 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
int minStart = Integer.MAX_VALUE; int minStart = Integer.MAX_VALUE;
final ArrayList<ActiveRegion> isActiveList = new ArrayList<ActiveRegion>(); final ArrayList<Double> isActiveList = new ArrayList<Double>();
GenomeLoc firstIsActiveStart = null;
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
ReferenceOrderedView referenceOrderedDataView = null; ReferenceOrderedView referenceOrderedDataView = null;
@ -91,11 +90,15 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later // Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals.overlaps(location) ) { if( initialIntervals.overlaps( location ) ) {
final boolean isActive = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus ) : walker.presetActiveRegions.overlaps(location) ); final double isActiveProb = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus )
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) ); : ( walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0 ) );
isActiveList.add( isActiveProb );
if( firstIsActiveStart == null ) {
firstIsActiveStart = locus.getLocation();
}
} }
// Grab all the previously unseen reads from this pileup and add them to the massive read list // Grab all the previously unseen reads from this pileup and add them to the massive read list
for( final PileupElement p : locus.getBasePileup() ) { for( final PileupElement p : locus.getBasePileup() ) {
final SAMRecord read = p.getRead(); final SAMRecord read = p.getRead();
@ -104,15 +107,9 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
} }
} }
// If this is the last pileup for this shard then need to first do a special walker.isActive() call // If this is the last pileup for this shard calculate the minimum alignment start so that we know
// and then calculate the minimum alignment start so that we know which active regions in the work queue are now safe to process // which active regions in the work queue are now safe to process
if( !locusView.hasNext() ) { if( !locusView.hasNext() ) {
// Call the walkers isActive function for this locus and add them to the list to be integrated later
if( initialIntervals.overlaps(location) ) {
final boolean isActive = ( walker.presetActiveRegions == null ? walker.isActive( tracker, refContext, locus ) : walker.presetActiveRegions.overlaps(location) );
isActiveList.add( new ActiveRegion(location, isActive, engine.getGenomeLocParser(), activeRegionExtension ) );
}
for( final PileupElement p : locus.getBasePileup() ) { for( final PileupElement p : locus.getBasePileup() ) {
final SAMRecord read = p.getRead(); final SAMRecord read = p.getRead();
if( !myReads.contains(read) ) { if( !myReads.contains(read) ) {
@ -121,12 +118,12 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); } if( read.getAlignmentStart() < minStart ) { minStart = read.getAlignmentStart(); }
} }
} }
printProgress(dataProvider.getShard(),locus.getLocation()); printProgress(dataProvider.getShard(), locus.getLocation());
} }
// Take the individual isActive calls and integrate them into contiguous active regions and // Take the individual isActive calls and integrate them into contiguous active regions and
// add these blocks of work to the work queue // add these blocks of work to the work queue
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList ); final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension );
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
if( walker.activeRegionOutStream == null ) { if( walker.activeRegionOutStream == null ) {
workQueue.addAll( activeRegions ); workQueue.addAll( activeRegions );
@ -137,14 +134,11 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
} }
} }
} }
// Since we're sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them // Since we're sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
if( !workQueue.isEmpty() ) { while( workQueue.peek() != null && (workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig())) ) {
while( workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig()) ) { final ActiveRegion activeRegion = workQueue.remove();
final ActiveRegion activeRegion = workQueue.remove(); sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker );
}
} }
} }
@ -184,7 +178,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
} }
for( final ActiveRegion otherRegionToTest : workQueue ) { for( final ActiveRegion otherRegionToTest : workQueue ) {
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
activeRegion.add( (GATKSAMRecord) read ); otherRegionToTest.add( (GATKSAMRecord) read );
} }
} }
} }
@ -218,31 +212,43 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
} }
// integrate active regions into contiguous chunks with identical active status // band-pass filter the list of isActive probabilities and turn into active regions
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<ActiveRegion> activeList ) { private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<Double> activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension ) {
final double ACTIVE_PROB_THRESHOLD = 0.2;
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>(); final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
if( activeList.size() == 0 ) { if( activeList.size() == 0 ) {
return returnList; return returnList;
} else if( activeList.size() == 1 ) { } else if( activeList.size() == 1 ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(activeList.get(0).getLocation().getContig(), activeList.get(0).getLocation().getStart(), activeList.get(0).getLocation().getStart()), returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()),
activeList.get(0).isActive, engine.getGenomeLocParser(), activeList.get(0).getExtension() ) ); activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) );
return returnList; return returnList;
} else { } else {
ActiveRegion prevLocus = activeList.get(0); final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]);
ActiveRegion startLocus = prevLocus; final double[] filteredProbArray = new double[activeProbArray.length];
for( final ActiveRegion thisLocus : activeList ) { final int FILTER_SIZE = 10;
if( prevLocus.isActive != thisLocus.isActive || !prevLocus.getLocation().contiguousP( thisLocus.getLocation() ) ) { final int MAX_ACTIVE_REGION = 200;
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), for( int iii = 0; iii < activeProbArray.length; iii++ ) {
prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); double maxVal = 0;
startLocus = thisLocus; for( int jjj = Math.max( 0, iii-FILTER_SIZE); jjj < Math.min( activeList.size(), iii+FILTER_SIZE); jjj++ ) {
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
} }
prevLocus = thisLocus; filteredProbArray[iii] = maxVal;
} }
// output the last region if necessary
if( startLocus != prevLocus ) { boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD;
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), int curStart = 0;
prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); for(int iii = 1; iii < filteredProbArray.length; iii++ ) {
final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD;
if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) {
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)),
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
curStatus = thisStatus;
curStart = iii;
}
} }
returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)),
curStatus, engine.getGenomeLocParser(), activeRegionExtension ) );
return returnList; return returnList;
} }
} }

View File

@ -73,8 +73,8 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
return false; return false;
} }
// Determine active status over the AlignmentContext // Determine probability of active status over the AlignmentContext
public abstract boolean isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); public abstract double isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context);
// Map over the ActiveRegion // Map over the ActiveRegion
public abstract MapType map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker); public abstract MapType map(final ActiveRegion activeRegion, final ReadMetaDataTracker metaDataTracker);

View File

@ -169,9 +169,11 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
private VariantAnnotatorEngine annotationEngine; private VariantAnnotatorEngine annotationEngine;
// enable deletions in the pileup // enable deletions in the pileup
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; } public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable extended events for indels // enable extended events for indels
@Override
public boolean generateExtendedEvents() { public boolean generateExtendedEvents() {
return (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES); return (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES);
} }