Working version of incremental active region traversals
-- The incremental version now processes active regions as soon as they are ready to be processed, instead of waiting until the end of the shard as in the previous version. This means that ART walkers will now take much less memory than previously. On chr20 of NA12878 the majority of regions are processed with as few as 500 reads in memory. Over the whole chr20 only 5K reads were ever held in ART at one time. -- Fixed bug in the way active regions worked with shard boundaries. The new implementation no longer see shard boundaries in any meaningful way, and that uncovered a problem that active regions were always being closed across shard boundaries. This behavior was actually encoded in the unit tests, so those needed to be updated as well. -- Changed the way that preset regions work in ART. The new contract ensures that you get exactly the regions you requested. the isActive function is still called, but its result has no impact on the regions. With this functionality is should be possible to use the HC as a generic assembly by forcing it to operate over very large regions -- Added a few misc. useful functions to IncrementalActivityProfile
This commit is contained in:
parent
ce160931d5
commit
eb60235dcd
|
|
@ -32,17 +32,13 @@ import org.broadinstitute.sting.gatk.WalkerManager;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileState;
|
||||
import org.broadinstitute.sting.utils.activeregion.BandPassActivityProfile;
|
||||
import org.broadinstitute.sting.utils.activeregion.*;
|
||||
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
|
@ -70,6 +66,7 @@ import java.util.*;
|
|||
public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
|
||||
protected final static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
protected final static boolean DEBUG = false;
|
||||
protected final static boolean LOG_READ_CARRYING = false;
|
||||
|
||||
// set by the tranversal
|
||||
private boolean walkerHasPresetRegions = false;
|
||||
|
|
@ -80,6 +77,8 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
|
||||
private LinkedList<GATKSAMRecord> myReads = new LinkedList<GATKSAMRecord>();
|
||||
private GenomeLoc spanOfLastReadSeen = null;
|
||||
private IncrementalActivityProfile activityProfile = null;
|
||||
int maxReadsInMemory = 0;
|
||||
|
||||
@Override
|
||||
public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) {
|
||||
|
|
@ -94,6 +93,14 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
|
||||
maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
|
||||
walkerHasPresetRegions = arWalker.hasPresetActiveRegions();
|
||||
|
||||
activityProfile = new IncrementalActivityProfile(engine.getGenomeLocParser());
|
||||
if ( walkerHasPresetRegions ) {
|
||||
// we load all of the preset locations into the
|
||||
for ( final GenomeLoc loc : arWalker.getPresetActiveRegions()) {
|
||||
workQueue.add(new ActiveRegion(loc, true, engine.getGenomeLocParser(), getActiveRegionExtension()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
|
@ -139,62 +146,6 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Working with ActivityProfiles and Active Regions
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
* add these blocks of work to the work queue
|
||||
* band-pass filter the list of isActive probabilities and turn into active regions
|
||||
*
|
||||
* @param profile
|
||||
* @param activeRegions
|
||||
* @return
|
||||
*/
|
||||
protected ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
|
||||
final List<ActiveRegion> activeRegions) {
|
||||
if ( profile.isEmpty() )
|
||||
throw new IllegalStateException("trying to incorporate an empty active profile " + profile);
|
||||
|
||||
final ActivityProfile finalizedProfile = profile.finalizeProfile();
|
||||
activeRegions.addAll(finalizedProfile.createActiveRegions(getActiveRegionExtension(), getMaxRegionSize()));
|
||||
return makeNewActivityProfile();
|
||||
}
|
||||
|
||||
protected final ActivityProfileState walkerActiveProb(final ActiveRegionWalker<M, T> walker,
|
||||
final RefMetaDataTracker tracker, final ReferenceContext refContext,
|
||||
final AlignmentContext locus, final GenomeLoc location) {
|
||||
if ( walkerHasPresetRegions ) {
|
||||
return new ActivityProfileState(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0);
|
||||
} else {
|
||||
return walker.isActive( tracker, refContext, locus );
|
||||
}
|
||||
}
|
||||
|
||||
private ActivityProfile makeNewActivityProfile() {
|
||||
if ( walkerHasPresetRegions )
|
||||
return new ActivityProfile(engine.getGenomeLocParser());
|
||||
else
|
||||
return new BandPassActivityProfile(engine.getGenomeLocParser());
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out each active region to the walker activeRegionOutStream
|
||||
*
|
||||
* @param walker
|
||||
*/
|
||||
protected void writeActiveRegionsToStream( final ActiveRegionWalker<M, T> walker ) {
|
||||
// Just want to output the active regions to a file, not actually process them
|
||||
for( final ActiveRegion activeRegion : workQueue ) {
|
||||
if( activeRegion.isActive ) {
|
||||
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Actual traverse function
|
||||
|
|
@ -219,7 +170,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
* @param read the read we want to test if it's already been seen in the last shard
|
||||
* @return true if read would have appeared in the last shard, false otherwise
|
||||
*/
|
||||
protected boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) {
|
||||
private boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) {
|
||||
if ( locOfLastReadAtTraversalStart == null )
|
||||
// we're in the first shard, so obviously the answer is no
|
||||
return false;
|
||||
|
|
@ -232,39 +183,20 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the current shard on a new contig w.r.t. the previous shard?
|
||||
* @param currentShard the current shard we are processing
|
||||
* @return true if the last shard was on a different contig than the current shard
|
||||
*/
|
||||
private boolean onNewContig(final Shard currentShard) {
|
||||
return spanOfLastSeenRead() != null
|
||||
&& spanOfLastSeenRead().getContigIndex() != currentShard.getLocation().getContigIndex();
|
||||
}
|
||||
|
||||
@Override
|
||||
public T traverse( final ActiveRegionWalker<M,T> walker,
|
||||
final LocusShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
|
||||
if ( LOG_READ_CARRYING || logger.isDebugEnabled() )
|
||||
logger.info(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
|
||||
|
||||
final LocusView locusView = new AllLocusView(dataProvider);
|
||||
|
||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
|
||||
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
|
||||
ActivityProfile profile = makeNewActivityProfile();
|
||||
|
||||
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
|
||||
final ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
final GenomeLoc locOfLastReadAtTraversalStart = spanOfLastSeenRead();
|
||||
|
||||
// if we've moved onto a new contig, process all of the active regions
|
||||
if ( onNewContig(dataProvider.getShard()) )
|
||||
sum = processActiveRegions(walker, sum, true);
|
||||
|
||||
GenomeLoc prevLoc = null;
|
||||
while( locusView.hasNext() ) {
|
||||
final AlignmentContext locus = locusView.next();
|
||||
final GenomeLoc location = locus.getLocation();
|
||||
|
|
@ -273,9 +205,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
// provided we haven't seen them before
|
||||
final Collection<GATKSAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
if ( appearedInLastShard(locOfLastReadAtTraversalStart, read) ) {
|
||||
if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName());
|
||||
} else {
|
||||
if ( ! appearedInLastShard(locOfLastReadAtTraversalStart, read) ) {
|
||||
if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider);
|
||||
rememberLastReadLocation(read);
|
||||
myReads.add(read);
|
||||
|
|
@ -286,10 +216,11 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
if ( outsideEngineIntervals(location) )
|
||||
continue;
|
||||
|
||||
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
|
||||
// we've move across some interval boundary, restart profile
|
||||
profile = incorporateActiveRegions(profile, activeRegions);
|
||||
}
|
||||
// we've move across some interval boundary, restart profile
|
||||
final boolean flushProfile = ! activityProfile.isEmpty()
|
||||
&& ( activityProfile.getContigIndex() != location.getContigIndex()
|
||||
|| location.getStart() != activityProfile.getStop() + 1);
|
||||
sum = processActiveRegions(walker, sum, flushProfile, false);
|
||||
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
|
|
@ -301,38 +232,14 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
||||
profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
|
||||
|
||||
prevLoc = location;
|
||||
addIsActiveResult(walker, tracker, refContext, locus);
|
||||
|
||||
maxReadsInMemory = Math.max(myReads.size(), maxReadsInMemory);
|
||||
printProgress(locus.getLocation());
|
||||
}
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
|
||||
if ( ! profile.isEmpty() )
|
||||
incorporateActiveRegions(profile, activeRegions);
|
||||
|
||||
// add active regions to queue of regions to process
|
||||
// first check if can merge active regions over shard boundaries
|
||||
if( !activeRegions.isEmpty() ) {
|
||||
if( !workQueue.isEmpty() ) {
|
||||
final ActiveRegion last = workQueue.getLast();
|
||||
final ActiveRegion first = activeRegions.get(0);
|
||||
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) {
|
||||
workQueue.removeLast();
|
||||
activeRegions.remove(first);
|
||||
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) );
|
||||
}
|
||||
}
|
||||
workQueue.addAll( activeRegions );
|
||||
}
|
||||
|
||||
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
|
||||
// now go and process all of the active regions
|
||||
sum = processActiveRegions(walker, sum, false);
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
|
@ -341,7 +248,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
* Ugly for now but will be cleaned up when we push this functionality more into the engine
|
||||
*/
|
||||
public T endTraversal(final Walker<M, T> walker, T sum) {
|
||||
return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true);
|
||||
return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true, true);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
|
@ -383,8 +290,15 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
* @return true if the extended location of region is completely within the current dead zone, false otherwise
|
||||
*/
|
||||
protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) {
|
||||
return region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart()
|
||||
|| ! region.getExtendedLoc().onSameContig(spanOfLastSeenRead());
|
||||
if ( spanOfLastSeenRead() == null )
|
||||
return false;
|
||||
|
||||
final int contigCmp = region.getExtendedLoc().compareContigs(spanOfLastSeenRead());
|
||||
if ( contigCmp > 0 )
|
||||
throw new IllegalStateException("Active region " + region + " on a contig after last seen read " + spanOfLastSeenRead());
|
||||
else {
|
||||
return contigCmp < 0 || region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -408,7 +322,9 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
*/
|
||||
@Requires({"read != null", "activeRegion != null"})
|
||||
private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) {
|
||||
return read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop();
|
||||
return read.getReferenceIndex() < activeRegion.getLocation().getContigIndex() ||
|
||||
( read.getReferenceIndex() == activeRegion.getLocation().getContigIndex()
|
||||
&& read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop() );
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
|
@ -417,32 +333,71 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
private T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
|
||||
if( walker.activeRegionOutStream != null ) {
|
||||
/**
|
||||
* Invoke the walker isActive function, and incorporate its result into the activity profile
|
||||
*
|
||||
* @param walker the walker we're running
|
||||
* @param tracker the ref meta data tracker to pass on to the isActive function of walker
|
||||
* @param refContext the refContext to pass on to the isActive function of walker
|
||||
* @param locus the AlignmentContext to pass on to the isActive function of walker
|
||||
*/
|
||||
private void addIsActiveResult(final ActiveRegionWalker<M, T> walker,
|
||||
final RefMetaDataTracker tracker, final ReferenceContext refContext,
|
||||
final AlignmentContext locus) {
|
||||
// must be called, even if we won't use the result, to satisfy walker contract
|
||||
final ActivityProfileState state = walker.isActive( tracker, refContext, locus );
|
||||
if ( ! walkerHasPresetRegions ) {
|
||||
activityProfile.add(state);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out each active region to the walker activeRegionOutStream
|
||||
*
|
||||
* @param walker
|
||||
*/
|
||||
private void writeActiveRegionsToStream( final ActiveRegionWalker<M, T> walker ) {
|
||||
// Just want to output the active regions to a file, not actually process them
|
||||
for( final ActiveRegion activeRegion : workQueue ) {
|
||||
if( activeRegion.isActive ) {
|
||||
walker.activeRegionOutStream.println( activeRegion.getLocation() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
* add these blocks of work to the work queue
|
||||
* band-pass filter the list of isActive probabilities and turn into active regions
|
||||
*/
|
||||
private T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean flushActivityProfile, final boolean forceAllRegionsToBeActive) {
|
||||
if ( ! walkerHasPresetRegions ) {
|
||||
// We don't have preset regions, so we get our regions from the activity profile
|
||||
final Collection<ActiveRegion> activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMaxRegionSize(), flushActivityProfile);
|
||||
workQueue.addAll(activeRegions);
|
||||
if ( logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
}
|
||||
|
||||
if ( walker.activeRegionOutStream != null ) {
|
||||
writeActiveRegionsToStream(walker);
|
||||
return sum;
|
||||
} else {
|
||||
return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive);
|
||||
}
|
||||
}
|
||||
|
||||
private T callWalkerMapOnActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
|
||||
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
||||
// TODO can implement parallel traversal here
|
||||
while( workQueue.peek() != null ) {
|
||||
final ActiveRegion activeRegion = workQueue.peek();
|
||||
if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) {
|
||||
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + spanOfLastSeenRead());
|
||||
sum = processActiveRegion( workQueue.remove(), sum, walker );
|
||||
} else {
|
||||
break;
|
||||
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
||||
while( workQueue.peek() != null ) {
|
||||
final ActiveRegion activeRegion = workQueue.peek();
|
||||
if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) {
|
||||
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + spanOfLastSeenRead());
|
||||
sum = processActiveRegion( workQueue.remove(), sum, walker );
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
||||
protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
|
||||
private T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
|
||||
final Iterator<GATKSAMRecord> liveReads = myReads.iterator();
|
||||
while ( liveReads.hasNext() ) {
|
||||
boolean killed = false;
|
||||
|
|
@ -468,6 +423,11 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
|
|||
}
|
||||
|
||||
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
|
||||
|
||||
if ( LOG_READ_CARRYING )
|
||||
logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s",
|
||||
activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive, activeRegion.size(), maxReadsInMemory));
|
||||
|
||||
final M x = walker.map(activeRegion, null);
|
||||
return walker.reduce( x, sum );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,11 +68,7 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
|
|||
@Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false)
|
||||
protected List<IntervalBinding<Feature>> activeRegionBindings = null;
|
||||
|
||||
public GenomeLocSortedSet presetActiveRegions = null;
|
||||
|
||||
public boolean hasPresetActiveRegions() {
|
||||
return presetActiveRegions != null;
|
||||
}
|
||||
private GenomeLocSortedSet presetActiveRegions = null;
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
|
|
@ -91,6 +87,22 @@ public abstract class ActiveRegionWalker<MapType, ReduceType> extends Walker<Map
|
|||
presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this walker want us to use a set of preset action regions instead of dynamically using the result of isActive?
|
||||
* @return true if yes, false if no
|
||||
*/
|
||||
public boolean hasPresetActiveRegions() {
|
||||
return presetActiveRegions != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of preset active regions, or null if none were provided
|
||||
* @return a set of genome locs specifying fixed active regions requested by the walker, or null if none exist
|
||||
*/
|
||||
public GenomeLocSortedSet getPresetActiveRegions() {
|
||||
return presetActiveRegions;
|
||||
}
|
||||
|
||||
// Do we actually want to operate on the context?
|
||||
public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
return true; // We are keeping all the reads
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ public class ActiveRegion implements HasGenomeLocation {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ActiveRegion " + activeRegionLoc.toString();
|
||||
return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive + " nReads=" + reads.size() + " ";
|
||||
}
|
||||
|
||||
// add each read to the bin and extend the reference genome activeRegionLoc if needed
|
||||
|
|
@ -126,19 +126,4 @@ public class ActiveRegion implements HasGenomeLocation {
|
|||
if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* A comparator class which is used to sort ActiveRegions by their start location
|
||||
*/
|
||||
/*
|
||||
public static class ActiveRegionStartLocationComparator implements Comparator<ActiveRegion> {
|
||||
|
||||
public ActiveRegionStartLocationComparator() {}
|
||||
|
||||
@Override
|
||||
public int compare(final ActiveRegion left, final ActiveRegion right) {
|
||||
return left.getLocation().compareTo(right.getLocation());
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
|
@ -123,6 +123,24 @@ public class IncrementalActivityProfile {
|
|||
return stateList.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the span of this activity profile, which is from the start of the first state to the stop of the last
|
||||
* @return a potentially null GenomeLoc. Will be null if this profile is empty
|
||||
*/
|
||||
public GenomeLoc getSpan() {
|
||||
return isEmpty() ? null : regionStartLoc.endpointSpan(regionStopLoc);
|
||||
}
|
||||
|
||||
@Requires("! isEmpty()")
|
||||
public int getContigIndex() {
|
||||
return regionStartLoc.getContigIndex();
|
||||
}
|
||||
|
||||
@Requires("! isEmpty()")
|
||||
public int getStop() {
|
||||
return regionStopLoc.getStop();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of active profile results in this object
|
||||
* @return a non-null, ordered list of active profile results
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
|
|||
|
||||
protected List<GenomeLoc> isActiveCalls = new ArrayList<GenomeLoc>();
|
||||
protected Map<GenomeLoc, ActiveRegion> mappedActiveRegions = new LinkedHashMap<GenomeLoc, ActiveRegion>();
|
||||
private boolean declareHavingPresetRegions = false;
|
||||
|
||||
public DummyActiveRegionWalker() {
|
||||
this(1.0);
|
||||
|
|
@ -60,20 +61,31 @@ class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
|
|||
this.prob = constProb;
|
||||
}
|
||||
|
||||
public DummyActiveRegionWalker(EnumSet<ActiveRegionReadState> wantStates) {
|
||||
this(1.0);
|
||||
public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, EnumSet<ActiveRegionReadState> wantStates, final boolean declareHavingPresetRegions) {
|
||||
this(activeRegions, declareHavingPresetRegions);
|
||||
this.states = wantStates;
|
||||
}
|
||||
|
||||
public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions) {
|
||||
public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, final boolean declareHavingPresetRegions) {
|
||||
this(1.0);
|
||||
this.activeRegions = activeRegions;
|
||||
this.declareHavingPresetRegions = declareHavingPresetRegions;
|
||||
}
|
||||
|
||||
public void setStates(EnumSet<ActiveRegionReadState> states) {
|
||||
this.states = states;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPresetActiveRegions() {
|
||||
return declareHavingPresetRegions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLocSortedSet getPresetActiveRegions() {
|
||||
return declareHavingPresetRegions ? activeRegions : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public EnumSet<ActiveRegionReadState> desiredReadStates() {
|
||||
return states;
|
||||
|
|
|
|||
|
|
@ -179,7 +179,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
|
||||
@Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
|
||||
public void testActiveRegionCoverage(TraverseActiveRegions t) {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), true);
|
||||
|
||||
Collection<ActiveRegion> activeRegions = getActiveRegions(t, walker, intervals).values();
|
||||
verifyActiveRegionCoverage(intervals, activeRegions);
|
||||
|
|
@ -242,9 +242,11 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "TraversalEngineProvider")
|
||||
@Test(enabled = true && !DEBUG, dataProvider = "TraversalEngineProvider")
|
||||
public void testPrimaryReadMapping(TraverseActiveRegions t) {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals),
|
||||
EnumSet.of(ActiveRegionReadState.PRIMARY),
|
||||
true);
|
||||
|
||||
// Contract: Each read has the Primary state in a single region (or none)
|
||||
// This is the region of maximum overlap for the read (earlier if tied)
|
||||
|
|
@ -275,20 +277,18 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
|
||||
verifyReadMapping(region);
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000));
|
||||
verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
|
||||
verifyReadMapping(region);
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
|
||||
verifyReadMapping(region, "simple20");
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
|
||||
public void testNonPrimaryReadMapping(TraverseActiveRegions t) {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(
|
||||
EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY));
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals),
|
||||
EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY),
|
||||
true);
|
||||
|
||||
// Contract: Each read has the Primary state in a single region (or none)
|
||||
// This is the region of maximum overlap for the read (earlier if tied)
|
||||
|
|
@ -321,10 +321,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
|
||||
verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post");
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
|
||||
verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000));
|
||||
verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
|
||||
|
|
@ -333,8 +330,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
|
||||
@Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
|
||||
public void testExtendedReadMapping(TraverseActiveRegions t) {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(
|
||||
EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED));
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals),
|
||||
EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED),
|
||||
true);
|
||||
|
||||
// Contract: Each read has the Primary state in a single region (or none)
|
||||
// This is the region of maximum overlap for the read (earlier if tied)
|
||||
|
|
@ -368,10 +366,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
|
||||
verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
|
||||
verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000));
|
||||
verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");
|
||||
|
||||
region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
|
||||
|
|
@ -384,6 +379,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
private void verifyReadMapping(ActiveRegion region, String... reads) {
|
||||
Assert.assertNotNull(region, "Region was unexpectedly null");
|
||||
final Set<String> regionReads = new HashSet<String>();
|
||||
for (SAMRecord read : region.getReads()) {
|
||||
Assert.assertFalse(regionReads.contains(read.getReadName()), "Duplicate reads detected in region " + region + " read " + read.getReadName());
|
||||
|
|
@ -530,12 +526,11 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
for ( final int start : starts ) {
|
||||
for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) {
|
||||
for ( final int nLoci : Arrays.asList(1, 1000) ) {
|
||||
final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci);
|
||||
bamBuilder.setReadLength(readLength);
|
||||
bamBuilder.setSkipNLoci(skips);
|
||||
bamBuilder.setAlignmentStart(start);
|
||||
for ( EnumSet<ActiveRegionReadState> readStates : allReadStates ) {
|
||||
final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci);
|
||||
bamBuilder.setReadLength(readLength);
|
||||
bamBuilder.setSkipNLoci(skips);
|
||||
bamBuilder.setAlignmentStart(start);
|
||||
|
||||
for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())) {
|
||||
nTests++;
|
||||
if ( nTests < maxTests ) // && nTests == 1238 )
|
||||
|
|
@ -595,7 +590,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())
|
||||
);
|
||||
|
||||
final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions);
|
||||
final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false);
|
||||
walker.setStates(readStates);
|
||||
|
||||
final TraverseActiveRegions traversal = new TraverseActiveRegions<Integer, Integer>();
|
||||
|
|
@ -619,8 +614,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
alreadySeenReads.add(read.getReadName());
|
||||
}
|
||||
|
||||
Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, "Region " + region +
|
||||
" failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite");
|
||||
String msg = readNamesInRegion.contains(read.getReadName()) == shouldBeInRegion ? "" : "Region " + region +
|
||||
" failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite";
|
||||
Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, msg);
|
||||
|
||||
nReadsExpectedInRegion += shouldBeInRegion ? 1 : 0;
|
||||
}
|
||||
|
|
@ -642,7 +638,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
@Test(enabled = true && ! DEBUG)
|
||||
public void ensureAllInsertionReadsAreInActiveRegions() {
|
||||
|
||||
final int readLength = 10;
|
||||
|
|
@ -667,7 +663,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
|
|||
genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())
|
||||
);
|
||||
|
||||
final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions);
|
||||
final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false);
|
||||
|
||||
final TraverseActiveRegions traversal = new TraverseActiveRegions<Integer, Integer>();
|
||||
final Map<GenomeLoc, ActiveRegion> activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString());
|
||||
|
|
|
|||
Loading…
Reference in New Issue