diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 3f24e6585..c0fc78e3c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.traversals; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -42,38 +44,31 @@ public class TraverseActiveRegions extends TraversalEngine isActiveList = new ArrayList(); - GenomeLoc firstIsActiveStart = null; + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView referenceOrderedDataView = null; - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); // We keep processing while the next reference location is within the interval GenomeLoc prevLoc = null; while( locusView.hasNext() ) { final AlignmentContext locus = locusView.next(); GenomeLoc location = locus.getLocation(); + if(prevLoc != null) { - for(int iii = prevLoc.getStart() + 1; iii < location.getStart(); iii++ ) { + // fill in the active / inactive labels from the stop of the previous location to the start of this location + // TODO refactor to separate function + for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) { final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii); if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) { - final double isActiveProb = ( walker.presetActiveRegions == null ? 0.0 : ( walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ) ); - isActiveList.add( isActiveProb ); - if( firstIsActiveStart == null ) { - firstIsActiveStart = fakeLoc; - } + final double isActiveProb = ( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 ); + profile.add(fakeLoc, isActiveProb); } } } @@ -89,12 +84,8 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension, walker.presetActiveRegions != null ); - logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); - if( walker.activeRegionOutStream == null ) { - workQueue.addAll( activeRegions ); + // band-pass filter the list of isActive probabilities and turn into active regions + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension ); + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // add to work queue + if( walker.activeRegionOutStream == null ) { + workQueue.addAll( activeRegions ); } else { // Just want to output the active regions to a file, not actually process them for( final ActiveRegion activeRegion : activeRegions ) { if( activeRegion.isActive ) { @@ -134,21 +130,55 @@ public class TraverseActiveRegions extends TraversalEngine walker, T sum) { + + // -------------------------------------------------------------------------------- + // + // simple utility functions + // + // -------------------------------------------------------------------------------- + + private final double walkerActiveProb(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus, final GenomeLoc location) { + if ( walker.hasPresetActiveRegions() ) { + return walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0; + } else { + return walker.isActive( tracker, refContext, locus ); + } + } + + private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + final LocusView locusView) { + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + return new ManagingReferenceOrderedView( dataProvider ); + else + return (RodLocusView)locusView; + } + + // -------------------------------------------------------------------------------- + // + // code to handle processing active regions + // + // -------------------------------------------------------------------------------- + + private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them while( workQueue.peek() != null ) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, myReads, workQueue, sum, (ActiveRegionWalker) walker ); + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + } else { + break; + } } return sum; @@ -193,6 +223,12 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine integrateActiveList( final ArrayList activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension, final boolean presetRegions ) { - - final double ACTIVE_PROB_THRESHOLD = 0.2; // BUGBUG: needs to be set-able by the walker author - final ArrayList returnList = new ArrayList(); - if( activeList.size() == 0 ) { - return returnList; - } else if( activeList.size() == 1 ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()), - activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) ); - return returnList; - } else { - final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]); - final double[] filteredProbArray = new double[activeProbArray.length]; - final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // BUGBUG: needs to be set-able by the walker author - final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // BUGBUG: needs to be set-able by the walker author - for( int iii = 0; iii < activeProbArray.length; iii++ ) { - double maxVal = 0; - for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE+1); jjj++ ) { - if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } - } - filteredProbArray[iii] = maxVal; - } - - boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD; - int curStart = 0; - for(int iii = 1; iii < filteredProbArray.length; iii++ ) { - final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD; - if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { - returnList.add( new ActiveRegion( - engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)), - curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); - curStatus = thisStatus; - curStart = iii; - } - } - if( curStart != filteredProbArray.length-1 ) { - returnList.add( new ActiveRegion( - engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)), - curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); - } - return returnList; - } + /** + * Special function called in LinearMicroScheduler to empty out the work queue. + * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal( final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 6403f15a2..8ff4b2f6f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -45,6 +45,10 @@ public abstract class ActiveRegionWalker extends Walker isActiveList; + + // todo -- add upfront the start and stop of the intervals + // todo -- check that no regions are unexpectedly missing + // todo -- add unit tests + // TODO -- own preset regions + public ActivityProfile(final GenomeLocParser parser, final boolean presetRegions) { + this(parser, presetRegions, new ArrayList(), null); + } + + protected ActivityProfile(final GenomeLocParser parser, final boolean presetRegions, final List isActiveList, final GenomeLoc regionStartLoc) { + this.parser = parser; + this.presetRegions = presetRegions; + this.isActiveList = isActiveList; + this.regionStartLoc = regionStartLoc; + } + + public void add(final GenomeLoc loc, final double score) { + // todo -- test for validity + isActiveList.add(score); + if( regionStartLoc == null ) { + regionStartLoc = loc; + } + } + + public int size() { + return isActiveList.size(); + } + + /** + * Band pass this ActivityProfile, producing a new profile that's band pass filtered + * @return a new ActivityProfile that's the band-pass filtered version of this profile + */ + public ActivityProfile bandPassFilter() { + final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]); + final Double[] filteredProbArray = new Double[activeProbArray.length]; + final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // TODO: needs to be set-able by the walker author + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + double maxVal = 0; + for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(isActiveList.size(), iii+FILTER_SIZE+1); jjj++ ) { + if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } + } + filteredProbArray[iii] = maxVal; + } + + return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc); + } + + /** + * Partition this profile into active regions + * @param activeRegionExtension + * @return + */ + public List createActiveRegions( final int activeRegionExtension ) { + final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // TODO: needs to be set-able by the walker author + final double ACTIVE_PROB_THRESHOLD = 0.2; // TODO: needs to be set-able by the walker author + + if( isActiveList.size() == 0 ) { + // no elements in the active list, just return an empty one + return Collections.emptyList(); + } else if( isActiveList.size() == 1 ) { + // there's a single element, it's either active or inactive + boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; + final ActiveRegion region = createActiveRegion(isActive, 0, 0, activeRegionExtension ); + return Collections.singletonList(region); + } else { + // there are 2+ elements, divide these up into regions + final ArrayList returnList = new ArrayList(); + boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD; + int curStart = 0; + for(int iii = 1; iii < isActiveList.size(); iii++ ) { + final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD; + if( isActive != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { + returnList.add( createActiveRegion(isActive, curStart, iii-1, activeRegionExtension) ); + isActive = thisStatus; + curStart = iii; + } + } + + if( curStart != isActiveList.size()-1 ) { + returnList.add( createActiveRegion(isActive, curStart, isActiveList.size()-1, activeRegionExtension) ); + } + return returnList; + } + } + + /** + * Helper routine to create an active region based on our current start and end offsets + * @param isActive should the region be active? + * @param curStart offset (0-based) from the start of this region + * @param curEnd offset (0-based) from the start of this region + * @param activeRegionExtension + * @return a fully initialized ActiveRegion with the above properties + */ + private final ActiveRegion createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension) { + final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); + return new ActiveRegion( loc, isActive, parser, activeRegionExtension ); + } +}