diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 713f1fd9e..45dbb6dc8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -68,6 +68,12 @@ public abstract class TraverseActiveRegions extends TraversalEngine walker); + /** + * Special function called in LinearMicroScheduler to empty out the work queue. + * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public abstract T endTraversal(final Walker walker, T sum); + protected int getActiveRegionExtension() { return activeRegionExtension; } @@ -141,21 +147,12 @@ public abstract class TraverseActiveRegions extends TraversalEngine walker, T sum, final boolean forceRegionsToBeActive) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); - } - } - /** * Write out each active region to the walker activeRegionOutStream * * @param walker */ - private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { + protected void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { // Just want to output the active regions to a file, not actually process them for( final ActiveRegion activeRegion : workQueue ) { if( activeRegion.isActive ) { @@ -163,66 +160,4 @@ public abstract class TraverseActiveRegions extends TraversalEngine walker, T sum, final boolean forceRegionsToBeActive) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) { - final ActiveRegion activeRegion = workQueue.remove(); - if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion()); - sum = processActiveRegion( activeRegion, sum, walker ); - } else { - break; - } - } - - return sum; - } - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public T endTraversal(final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, true); - } - - // todo -- remove me - protected ActiveRegion getBestRegion(final ActiveRegion activeRegion, final GenomeLoc readLoc) { - long minStart = activeRegion.getLocation().getStart(); - ActiveRegion bestRegion = activeRegion; - - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( otherRegionToTest.getLocation().getStart() < minStart ) { - minStart = otherRegionToTest.getLocation().getStart(); - bestRegion = otherRegionToTest; - } - } - - return bestRegion; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java index a22f257e5..461f74c1f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; @@ -151,6 +152,54 @@ public class TraverseActiveRegionsOptimized extends TraverseActiveRegions walker, T sum, final boolean forceRegionsToBeActive) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); + } + } + + private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) { + final ActiveRegion activeRegion = workQueue.remove(); + if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion()); + sum = processActiveRegion( activeRegion, sum, walker ); + } else { + break; + } + } + + return sum; + } + @Override public String toString() { return "TraverseActiveRegionsOptimized"; @@ -190,4 +239,15 @@ public class TraverseActiveRegionsOptimized extends TraverseActiveRegions walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, true); + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java index 6c542f578..72cf23bf4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java @@ -1,14 +1,19 @@ package org.broadinstitute.sting.gatk.traversals; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -23,14 +28,6 @@ import java.util.*; public class TraverseActiveRegionsOriginal extends TraverseActiveRegions { private final LinkedHashSet myReads = new LinkedHashSet(); - protected Collection getReadsInCurrentRegion() { - return myReads; - } - - protected void removeReadsFromCurrentRegion(final List placedReads) { - myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region - } - @Override public T traverse( final ActiveRegionWalker walker, final LocusShardDataProvider dataProvider, @@ -40,6 +37,8 @@ public class TraverseActiveRegionsOriginal extends TraverseActiveRegions activeRegions = new LinkedList(); @@ -75,7 +74,7 @@ public class TraverseActiveRegionsOriginal extends TraverseActiveRegions extends TraverseActiveRegions extends TraverseActiveRegions extends TraverseActiveRegions activeRegions, + final int activeRegionExtension, + final int maxRegionSize) { + if ( profile.isEmpty() ) + throw new IllegalStateException("trying to incorporate an empty active profile " + profile); + + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); + return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); + } + + // -------------------------------------------------------------------------------- + // + // code to handle processing active regions + // + // -------------------------------------------------------------------------------- + + private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + } + } + + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, sum, walker ); + } else { + break; + } + } return sum; } @Override - public String toString() { - return "TraverseActiveRegionsOriginal"; - } - - @Override - protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { + protected T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : getReadsInCurrentRegion() ) { + for( final GATKSAMRecord read : myReads ) { final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - if( activeRegion.getLocation().overlapsP( readLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - final ActiveRegion bestRegion = getBestRegion(activeRegion, readLoc); + long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); + ActiveRegion bestRegion = activeRegion; + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + bestRegion = otherRegionToTest; + } + } bestRegion.add( read ); // The read is also added to all other regions in which it overlaps but marked as non-primary - if( walker.wantsNonPrimaryReads() ) { if( !bestRegion.equals(activeRegion) ) { activeRegion.add( read ); @@ -160,16 +211,27 @@ public class TraverseActiveRegionsOriginal extends TraverseActiveRegions> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map(activeRegion, null); + final M x = walker.map( activeRegion, null ); return walker.reduce( x, sum ); } + + /** + * Special function called in LinearMicroScheduler to empty out the work queue. + * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal( final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java similarity index 98% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java index 466cc65e7..038cd2853 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java @@ -76,9 +76,7 @@ import java.util.*; * Test the Active Region Traversal Contract * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract */ -public class TraverseActiveRegionsUnitTest extends BaseTest { - private final static boolean INCLUDE_OLD = false; - private final static boolean INCLUDE_NEW = true; +public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { private final static boolean ENFORCE_CONTRACTS = false; private final static boolean DEBUG = false; @@ -133,8 +131,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { @DataProvider(name = "TraversalEngineProvider") public Object[][] makeTraversals() { final List traversals = new LinkedList(); - if ( INCLUDE_OLD ) traversals.add(new Object[]{new TraverseActiveRegionsOriginal()}); - if ( INCLUDE_NEW ) traversals.add(new Object[]{new TraverseActiveRegionsOptimized()}); + traversals.add(new Object[]{new TraverseActiveRegionsOptimized()}); return traversals.toArray(new Object[][]{}); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java new file mode 100644 index 000000000..35a0931df --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java @@ -0,0 +1,523 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.traversals; + +import com.google.java.contract.PreconditionError; +import net.sf.samtools.*; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.datasources.reads.*; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.executive.WindowMaker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 1/10/13 + * Time: 8:03 PM + * To change this template use File | Settings | File Templates. + */ +public class TraverseActiveRegionsOriginalUnitTest extends BaseTest { + + private class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + private EnumSet states = super.desiredReadStates(); + + protected List isActiveCalls = new ArrayList(); + protected Map mappedActiveRegions = new HashMap(); + + public DummyActiveRegionWalker() { + this.prob = 1.0; + } + + public DummyActiveRegionWalker(double constProb) { + this.prob = constProb; + } + + public DummyActiveRegionWalker(EnumSet wantStates) { + this.prob = 1.0; + this.states = wantStates; + } + + @Override + public EnumSet desiredReadStates() { + return states; + } + + @Override + public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); + return new ActivityProfileResult(ref.getLocus(), prob); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + private final TraverseActiveRegions t = new TraverseActiveRegionsOriginal(); + + private IndexedFastaSequenceFile reference; + private SAMSequenceDictionary dictionary; + private GenomeLocParser genomeLocParser; + + private List intervals; + + private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; + private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; + + @BeforeClass + private void init() throws FileNotFoundException { + reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); + dictionary = reference.getSequenceDictionary(); + genomeLocParser = new GenomeLocParser(dictionary); + + // TODO: reads with indels + // TODO: reads which span many regions + // TODO: reads which are partially between intervals (in/outside extension) + // TODO: duplicate reads + // TODO: read at the end of a contig + // TODO: reads which are completely outside intervals but within extension + // TODO: test the extension itself + // TODO: unmapped reads + + intervals = new ArrayList(); + intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); + intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); + + List reads = new ArrayList(); + reads.add(buildSAMRecord("simple", "1", 100, 200)); + reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); + reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); + reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); + reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); + reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000)); + reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); + reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); + reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); + reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); + reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); + reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); + + createBAM(reads); + } + + private void createBAM(List reads) { + File outFile = new File(testBAM); + outFile.deleteOnExit(); + File indexFile = new File(testBAI); + indexFile.deleteOnExit(); + + SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile); + for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { + out.addAlignment(read); + } + out.close(); + } + + @Test + public void testAllBasesSeen() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + List activeIntervals = getIsActiveIntervals(walker, intervals); + // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call + verifyEqualIntervals(intervals, activeIntervals); + } + + private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { + List activeIntervals = new ArrayList(); + for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) { + t.traverse(walker, dataProvider, 0); + activeIntervals.addAll(walker.isActiveCalls); + } + + return activeIntervals; + } + + @Test (expectedExceptions = PreconditionError.class) + public void testIsActiveRangeLow () { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); + getActiveRegions(walker, intervals).values(); + } + + @Test (expectedExceptions = PreconditionError.class) + public void testIsActiveRangeHigh () { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); + getActiveRegions(walker, intervals).values(); + } + + @Test + public void testActiveRegionCoverage() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + Collection activeRegions = getActiveRegions(walker, intervals).values(); + verifyActiveRegionCoverage(intervals, activeRegions); + } + + private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { + List intervalStarts = new ArrayList(); + List intervalStops = new ArrayList(); + + for (GenomeLoc interval : intervals) { + intervalStarts.add(interval.getStartLocation()); + intervalStops.add(interval.getStopLocation()); + } + + Map baseRegionMap = new HashMap(); + + for (ActiveRegion activeRegion : activeRegions) { + for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { + // Contract: Regions do not overlap + Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); + baseRegionMap.put(activeLoc, activeRegion); + } + + GenomeLoc start = activeRegion.getLocation().getStartLocation(); + if (intervalStarts.contains(start)) + intervalStarts.remove(start); + + GenomeLoc stop = activeRegion.getLocation().getStopLocation(); + if (intervalStops.contains(stop)) + intervalStops.remove(stop); + } + + for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { + // Contract: Each location in the interval(s) is in exactly one region + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); + baseRegionMap.remove(baseLoc); + } + + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); + + // Contract: All explicit interval boundaries must also be region boundaries + Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); + Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); + } + + @Test + public void testActiveRegionExtensionOnContig() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + Collection activeRegions = getActiveRegions(walker, intervals).values(); + for (ActiveRegion activeRegion : activeRegions) { + GenomeLoc loc = activeRegion.getExtendedLoc(); + + // Contract: active region extensions must stay on the contig + Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); + int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); + Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); + } + } + + @Test + public void testPrimaryReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + verifyReadMapping(region, "boundary_unequal", "extended_and_np", "boundary_1_pre"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + verifyReadMapping(region, "boundary_equal", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + verifyReadMapping(region, "shard_boundary_1_pre"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + verifyReadMapping(region, "simple20"); + } + + @Test + public void testNonPrimaryReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker( + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + verifyReadMapping(region, "simple20"); + } + + @Test + public void testExtendedReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker( + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + verifyReadMapping(region, "simple20"); + } + + @Test + public void testUnmappedReads() { + // TODO + } + + private void verifyReadMapping(ActiveRegion region, String... reads) { + Collection wantReads = new ArrayList(Arrays.asList(reads)); + for (SAMRecord read : region.getReads()) { + String regionReadName = read.getReadName(); + Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " assigned to active region " + region); + wantReads.remove(regionReadName); + } + + Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region); + } + + private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { + for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) + t.traverse(walker, dataProvider, 0); + + t.endTraversal(walker, 0); + + return walker.mappedActiveRegions; + } + + private Collection toSingleBaseLocs(GenomeLoc interval) { + List bases = new ArrayList(); + if (interval.size() == 1) + bases.add(interval); + else { + for (int location = interval.getStart(); location <= interval.getStop(); location++) + bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); + } + + return bases; + } + + private Collection toSingleBaseLocs(List intervals) { + Set bases = new TreeSet(); // for sorting and uniqueness + for (GenomeLoc interval : intervals) + bases.addAll(toSingleBaseLocs(interval)); + + return bases; + } + + private void verifyEqualIntervals(List aIntervals, List bIntervals) { + Collection aBases = toSingleBaseLocs(aIntervals); + Collection bBases = toSingleBaseLocs(bIntervals); + + Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); + + Iterator aIter = aBases.iterator(); + Iterator bIter = bBases.iterator(); + while (aIter.hasNext() && bIter.hasNext()) { + GenomeLoc aLoc = aIter.next(); + GenomeLoc bLoc = bIter.next(); + Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc); + } + } + + // copied from LocusViewTemplate + protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { + SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); + header.setSequenceDictionary(dictionary); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(dictionary.getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + + Cigar cigar = new Cigar(); + int len = alignmentEnd - alignmentStart + 1; + cigar.add(new CigarElement(len, CigarOperator.M)); + record.setCigar(cigar); + record.setReadString(new String(new char[len]).replace("\0", "A")); + record.setBaseQualities(new byte[len]); + + return record; + } + + private List createDataProviders(final Walker walker, List intervals, String bamFile) { + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + t.initialize(engine, walker); + + Collection samFiles = new ArrayList(); + SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); + samFiles.add(readerID); + + SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser); + + List providers = new ArrayList(); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + } + } + + return providers; + } +}