diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 9aa59459f..c127899f6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -213,7 +213,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { // Now that we have a progress meter, go through and initialize the traversal engines for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) - traversalEngine.initialize(engine, progressMeter); + traversalEngine.initialize(engine, walker, progressMeter); // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. // To get around this limitation and since we have no job identifier at this point, register a simple counter that diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 3dc3e1501..0811e5e70 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -74,7 +74,7 @@ public abstract class TraversalEngine,Provide * @param engine GenomeAnalysisEngine for this traversal * @param progressMeter An optional (null == optional) meter to track our progress */ - public void initialize(final GenomeAnalysisEngine engine, final ProgressMeter progressMeter) { + public void initialize(final GenomeAnalysisEngine engine, final Walker walker, final ProgressMeter progressMeter) { if ( engine == null ) throw new ReviewedStingException("BUG: GenomeAnalysisEngine cannot be null!"); @@ -87,8 +87,8 @@ public abstract class TraversalEngine,Provide * * @param engine */ - protected void initialize(final GenomeAnalysisEngine engine) { - initialize(engine, null); + protected void initialize(final GenomeAnalysisEngine engine, final Walker walker) { + initialize(engine, walker, null); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 3adc5fa12..713f1fd9e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.traversals; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -39,6 +40,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.LinkedList; @@ -52,9 +54,11 @@ import java.util.List; * To change this template use File | Settings | File Templates. */ public abstract class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + protected final static boolean DEBUG = false; + // set by the tranversal - protected int activeRegionExtension = -1; - protected int maxRegionSize = -1; + private int activeRegionExtension = -1; + private int maxRegionSize = -1; /** * our log, which we want to capture anything from this class @@ -64,11 +68,32 @@ public abstract class TraverseActiveRegions extends TraversalEngine walker); + protected int getActiveRegionExtension() { + return activeRegionExtension; + } + + protected int getMaxRegionSize() { + return maxRegionSize; + } + @Override public String getTraversalUnits() { return "active regions"; } + @Override + public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) { + super.initialize(engine, walker, progressMeter); + activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); + maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); + + final ActiveRegionWalker arWalker = (ActiveRegionWalker)walker; + if ( arWalker.wantsExtendedReads() && ! arWalker.wantsNonPrimaryReads() ) { + throw new IllegalArgumentException("Active region walker " + arWalker + " requested extended events but not " + + "non-primary reads, an inconsistent state. Please modify the walker"); + } + } + /** * Is the loc outside of the intervals being requested for processing by the GATK? * @param loc @@ -85,19 +110,15 @@ public abstract class TraverseActiveRegions extends TraversalEngine activeRegions, - final int activeRegionExtension, - final int maxRegionSize) { + final List activeRegions) { if ( profile.isEmpty() ) throw new IllegalStateException("trying to incorporate an empty active profile " + profile); final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); + activeRegions.addAll(bandPassFiltered.createActiveRegions( getActiveRegionExtension(), getMaxRegionSize() )); return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); } @@ -161,7 +182,7 @@ public abstract class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine)walker, sum, true); } + // todo -- remove me protected ActiveRegion getBestRegion(final ActiveRegion activeRegion, final GenomeLoc readLoc) { + long minStart = activeRegion.getLocation().getStart(); ActiveRegion bestRegion = activeRegion; - long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); + for( final ActiveRegion otherRegionToTest : workQueue ) { - if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + if( otherRegionToTest.getLocation().getStart() < minStart ) { + minStart = otherRegionToTest.getLocation().getStart(); bestRegion = otherRegionToTest; } } + return bestRegion; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java index ee93e24b1..a22f257e5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java @@ -29,6 +29,7 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; @@ -47,18 +48,26 @@ import java.util.*; public class TraverseActiveRegionsOptimized extends TraverseActiveRegions { private LinkedList myReads = new LinkedList(); + private Shard lastShard = null; @Override public T traverse( final ActiveRegionWalker walker, final LocusShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); + if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); + + final HashSet maybeDuplicatedReads = new HashSet(); + // TODO -- there's got to be a better way to know this + if ( lastShard != dataProvider.getShard() ) { + maybeDuplicatedReads.addAll(myReads); + logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); + if ( DEBUG ) logger.warn("Clearing myReads"); + } + lastShard = dataProvider.getShard(); final LocusView locusView = new AllLocusView(dataProvider); final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); - maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); final List activeRegions = new LinkedList(); ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); @@ -77,7 +86,15 @@ public class TraverseActiveRegionsOptimized extends TraverseActiveRegions reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); for( final SAMRecord read : reads ) { notifyOfCurrentPosition((GATKSAMRecord)read); - myReads.add((GATKSAMRecord)read); + // most of the time maybeDuplicatedReads is empty + // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the + // TODO -- potential list of duplicates we can clear the hashset + if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { + if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); + } else { + if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); + myReads.add((GATKSAMRecord)read); + } } // skip this location -- it's not part of our engine intervals @@ -86,7 +103,7 @@ public class TraverseActiveRegionsOptimized extends TraverseActiveRegions extends TraverseActiveRegions extends TraverseActiveRegions extends TraverseActiveRegions walker) { final Iterator liveReads = myReads.iterator(); while ( liveReads.hasNext() ) { + boolean killed = false; final GATKSAMRecord read = liveReads.next(); final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); if( activeRegion.getLocation().overlapsP( readLoc ) ) { - // TODO -- this test assumes that we've successfully defined all regions that might be - // TODO -- the primary home for read. Doesn't seem safe to me - // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - final ActiveRegion bestRegion = getBestRegion(activeRegion, readLoc); - addToRegion(bestRegion, read); + activeRegion.add(read); - // The read is also added to all other regions in which it overlaps but marked as non-primary - - if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(activeRegion) ) { - addToRegion(activeRegion, read); - } - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( !bestRegion.equals(otherRegionToTest) ) { - // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { - addToRegion(otherRegionToTest, read); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { - addToRegion(otherRegionToTest, read); - } - } - } + if ( ! walker.wantsNonPrimaryReads() ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); + liveReads.remove(); + killed = true; } - // check for non-primary vs. extended } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { activeRegion.add( read ); } - if ( regionCompletelyWithinDeadZone(readLoc, true) ) { - logger.info("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); + if ( ! killed && readIsDead(read, readLoc, activeRegion) ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); liveReads.remove(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java index 2fc63dae1..6c542f578 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java @@ -40,8 +40,6 @@ public class TraverseActiveRegionsOriginal extends TraverseActiveRegions activeRegions = new LinkedList(); @@ -77,7 +75,7 @@ public class TraverseActiveRegionsOriginal extends TraverseActiveRegions extends TraverseActiveRegions extends TraverseActiveRegions getIsActiveIntervals(final TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { List activeIntervals = new ArrayList(); - for (LocusShardDataProvider dataProvider : createDataProviders(t, intervals, testBAM)) { + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) { t.traverse(walker, dataProvider, 0); activeIntervals.addAll(walker.isActiveCalls); } @@ -308,40 +310,40 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // simple: Primary in 1:1-999 // overlap_equal: Primary in 1:1-999 // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_and_np: Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(t, walker, intervals); ActiveRegion region; region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal"); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_unequal", "extended_and_np", "boundary_1_pre"); + verifyReadMapping(region, "boundary_unequal", "boundary_1_pre", "boundary_equal", "boundary_1_post"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_1_post"); + verifyReadMapping(region); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre"); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal"); + verifyReadMapping(region); region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); verifyReadMapping(region, "simple20"); } - @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + @Test(enabled = true, dataProvider = "TraversalEngineProvider") public void testNonPrimaryReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker( EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); @@ -354,15 +356,15 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // simple: Primary in 1:1-999 // overlap_equal: Primary in 1:1-999 // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_and_np: Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(t, walker, intervals); @@ -387,7 +389,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { verifyReadMapping(region, "simple20"); } - @Test(enabled = true, dataProvider = "TraversalEngineProvider") + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") public void testExtendedReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker( EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)); @@ -457,7 +459,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(t, intervals, testBAM)) + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) t.traverse(walker, dataProvider, 0); t.endTraversal(walker, 0); @@ -521,10 +523,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return record; } - private List createDataProviders(TraverseActiveRegions t, List intervals, String bamFile) { + private List createDataProviders(TraverseActiveRegions t, final Walker walker, List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); - t.initialize(engine); + t.initialize(engine, walker); Collection samFiles = new ArrayList(); SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java index ee6c6d1d4..fd9e46a06 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseDuplicatesUnitTest.java @@ -68,7 +68,7 @@ public class TraverseDuplicatesUnitTest extends BaseTest { engine.setReferenceDataSource(refFile); engine.setGenomeLocParser(genomeLocParser); - obj.initialize(engine); + obj.initialize(engine, null); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 3866990b2..4328e3047 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -132,7 +132,7 @@ public class TraverseReadsUnitTest extends BaseTest { countReadWalker = new CountReads(); traversalEngine = new TraverseReadsNano(1); - traversalEngine.initialize(engine); + traversalEngine.initialize(engine, countReadWalker); } /** Test out that we can shard the file and iterate over every read */