From 2a42b47e4a19c17ae3dad64a980e856229875295 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 16 Jan 2013 15:29:26 -0500 Subject: [PATCH] Massive expansion of ActiveRegionTraversal unit tests, resulting in several bugfixes to ART -- UnitTests now include combinational tiling of reads within and spanning shard boundaries -- ART now properly handles shard transitions, and does so efficiently without requiring hash sets or other collections of reads -- Updating HC and CountReadsInActiveRegions integration tests --- .../HaplotypeCallerIntegrationTest.java | 12 +- .../traversals/TraverseActiveRegions.java | 226 +++++++++++++----- .../sting/utils/sam/ArtificialBAMBuilder.java | 39 ++- .../traversals/DummyActiveRegionWalker.java | 104 ++++++++ .../TraverseActiveRegionsUnitTest.java | 207 +++++++++++----- .../LocusIteratorByStateUnitTest.java | 28 ++- .../sam/ArtificialBAMBuilderUnitTest.java | 6 +- 7 files changed, 482 insertions(+), 140 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6183fc411..e86834a4a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "1e2671557b01ad0497557097282965fc"); + HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "2bd237a7e1e63eebe755dbe7963e430a"); + HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "a938cdd7262968597fc8eb6c1c0a69f1"); + "c679ae7f04bdfda896b5c046d35e043c"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "d590c8d6d5e58d685401b65a23846893"); + "1a034b7eb572e1b6f659d6e5d57b3e76"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "50a26224b9e863ee47a0619eb54a0323"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4439496472eb1e2f5c91b30ba525be37")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 03aaf95f2..a7e4d7649 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.traversals; +import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.WalkerManager; @@ -47,24 +48,36 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; /** - * Created with IntelliJ IDEA. + * Implement active region traversal + * * User: depristo * Date: 1/9/13 * Time: 4:45 PM - * To change this template use File | Settings | File Templates. + * + * Live region: + * + * The ART tracks a thing called the live region. The live region is a position on a specific contig + * of the alignment start of the last read we processed during this traversal. Because the + * read stream is sorted, future reads must occurs in the the live region. Therefore the the dead region + * (everything to the left of the live boundary) cannot have any more read data. The live / dead + * regions are used to decide when we can safely call map on active regions, as only active regions + * contained completely within the dead region (including extensions) have a complete set of read data + * in the collected read list. All of the data related to the live region is captured by the local + * variable spanOfLastReadSeen + * */ public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); protected final static boolean DEBUG = false; // set by the tranversal private int activeRegionExtension = -1; private int maxRegionSize = -1; - /** - * our log, which we want to capture anything from this class - */ - protected final static Logger logger = Logger.getLogger(TraversalEngine.class); - protected final LinkedList workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList(); + + private LinkedList myReads = new LinkedList(); + private GenomeLoc spanOfLastReadSeen = null; protected int getActiveRegionExtension() { return activeRegionExtension; @@ -79,6 +92,11 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine myReads = new LinkedList(); - private Shard lastShard = null; + /** + * Did read appear in the last shard? + * + * When we transition across shard boundaries we see duplicate reads because + * each shard contains the reads that *overlap* the shard. So if we just finished + * shard 1-1000 and are now in 1001-2000 we'll see duplicate reads from 1001 + * that overlapped 1-1000. This function tests read to determine if we would have + * seen it before by asking if read.getAlignmentStart() is less than the + * stop position of the last seen read at the start of the traversal. The reason + * we need to use the location of the last read at the start of the traversal + * is that we update the lastRead during the traversal, and we only want to filter + * out reads whose start is before the last read of the previous shard, not the + * current shard. + * + * @param locOfLastReadAtTraversalStart the location of the last read seen at the start of the traversal + * @param read the read we want to test if it's already been seen in the last shard + * @return true if read would have appeared in the last shard, false otherwise + */ + protected boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) { + if ( locOfLastReadAtTraversalStart == null ) + // we're in the first shard, so obviously the answer is no + return false; + else { + // otherwise check to see if the alignment occurred in the previous shard + return read.getAlignmentStart() <= locOfLastReadAtTraversalStart.getStart() + // we're on the same contig + && read.getReferenceIndex() == locOfLastReadAtTraversalStart.getContigIndex(); + } + + } + + // ------------------------------------------------------------------------------------- + // + // Actual traverse function + // + // ------------------------------------------------------------------------------------- + + /** + * Is the current shard on a new contig w.r.t. the previous shard? + * @param currentShard the current shard we are processing + * @return true if the last shard was on a different contig than the current shard + */ + private boolean onNewContig(final Shard currentShard) { + return spanOfLastSeenRead() != null + && spanOfLastSeenRead().getContigIndex() != currentShard.getLocation().getContigIndex(); + } @Override public T traverse( final ActiveRegionWalker walker, final LocusShardDataProvider dataProvider, T sum) { - if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final HashSet maybeDuplicatedReads = new HashSet(); - // TODO -- there's got to be a better way to know this - if ( lastShard != dataProvider.getShard() ) { - maybeDuplicatedReads.addAll(myReads); - logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); - if ( DEBUG ) logger.warn("Clearing myReads"); - } - lastShard = dataProvider.getShard(); + logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); final LocusView locusView = new AllLocusView(dataProvider); @@ -181,6 +234,12 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); for( final GATKSAMRecord read : reads ) { - notifyOfCurrentPosition(read); - // most of the time maybeDuplicatedReads is empty - // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the - // TODO -- potential list of duplicates we can clear the hashset - if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { + if ( appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); } else { if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); - myReads.add((GATKSAMRecord)read); + rememberLastReadLocation(read); + myReads.add(read); } } @@ -257,28 +313,87 @@ public class TraverseActiveRegions extends TraversalEngine walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, true); } - protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { - if ( startOfLiveRegion == null ) - startOfLiveRegion = currentLocation; - else - startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); + // ------------------------------------------------------------------------------------- + // + // Functions to manage and interact with the live / dead zone + // + // ------------------------------------------------------------------------------------- + + /** + * Update the live region to reflect that the last read we've seen in the traversal is read + * + * Requires that sequential calls always be provided reads in coordinate sorted order + * + * @param read the last read we've seen during the traversal + */ + protected void rememberLastReadLocation(final GATKSAMRecord read) { + final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read); + if ( spanOfLastReadSeen == null ) + spanOfLastReadSeen = currentLocation; + else { + if ( currentLocation.isBefore(spanOfLastReadSeen) ) + throw new IllegalStateException("Updating last read seen in the traversal with read " + read + " with span " + currentLocation + " but this occurs before the previously seen read " + spanOfLastReadSeen); + spanOfLastReadSeen = currentLocation; + } } - protected GenomeLoc getStartOfLiveRegion() { - return startOfLiveRegion; + /** + * Get a GenomeLoc indicating the start (heading to the right) of the live ART region. + * @return the left-most position of the live region on the genome + */ + protected GenomeLoc spanOfLastSeenRead() { + return spanOfLastReadSeen; } - protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { - return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) - || ! region.onSameContig(getStartOfLiveRegion()); + /** + * Is the active region completely within the traversal's dead zone? + * + * @param region the region we want to test + * @return true if the extended location of region is completely within the current dead zone, false otherwise + */ + protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) { + return region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart() + || ! region.getExtendedLoc().onSameContig(spanOfLastSeenRead()); } + /** + * Is the read dead? That is, can it no longer be in any future active region, and therefore can be discarded? + * + * read: start |--------> stop ------ stop + extension + * region: start |-----------------| end + * + * Since the regions are coming in order, read could potentially be contained in a future interval if + * stop + activeRegionExtension >= end. If, on the other hand, stop + extension is < the end + * of this region, then we can discard it, since any future region could only include reads + * up to end + 1 - extension. + * + * Note that this function doesn't care about the dead zone. We're assuming that by + * actually calling this function with an active region that region is already in the dead zone, + * so checking that the read is in the dead zone doesn't make sense. + * + * @param read the read we're testing + * @param activeRegion the current active region + * @return true if the read is dead, false other + */ + @Requires({"read != null", "activeRegion != null"}) + private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) { + return read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop(); + } + + // ------------------------------------------------------------------------------------- + // + // Functions to process active regions that are ready for map / reduce calls + // + // ------------------------------------------------------------------------------------- + private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { if( walker.activeRegionOutStream != null ) { writeActiveRegionsToStream(walker); @@ -292,11 +407,10 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker) { final Iterator liveReads = myReads.iterator(); while ( liveReads.hasNext() ) { @@ -325,7 +430,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, true); - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java index 651d759e0..f5018db8c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilder.java @@ -25,7 +25,9 @@ package org.broadinstitute.sting.utils.sam; +import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.NGSPlatform; import java.io.File; @@ -51,6 +53,9 @@ import java.util.List; public class ArtificialBAMBuilder { public final static int BAM_SHARD_SIZE = 16384; + private final IndexedFastaSequenceFile reference; + private final GenomeLocParser parser; + final int nReadsPerLocus; final int nLoci; @@ -66,14 +71,39 @@ public class ArtificialBAMBuilder { SAMFileHeader header; - public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + public ArtificialBAMBuilder(final IndexedFastaSequenceFile reference, int nReadsPerLocus, int nLoci) { this.nReadsPerLocus = nReadsPerLocus; this.nLoci = nLoci; + + this.reference = reference; + this.parser = new GenomeLocParser(reference); createAndSetHeader(1); } + public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) { + this(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000).getSequenceDictionary(), nReadsPerLocus, nLoci); + } + + public ArtificialBAMBuilder(final SAMSequenceDictionary dict, int nReadsPerLocus, int nLoci) { + this.nReadsPerLocus = nReadsPerLocus; + this.nLoci = nLoci; + this.reference = null; + this.parser = new GenomeLocParser(dict); + createAndSetHeader(1); + } + + public IndexedFastaSequenceFile getReference() { + return reference; + } + + public GenomeLocParser getGenomeLocParser() { + return parser; + } + public ArtificialBAMBuilder createAndSetHeader(final int nSamples) { - this.header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + this.header = new SAMFileHeader(); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + header.setSequenceDictionary(parser.getContigs()); samples.clear(); for ( int i = 0; i < nSamples; i++ ) { @@ -156,6 +186,11 @@ public class ArtificialBAMBuilder { public SAMFileHeader getHeader() { return header; } public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; } + public int getAlignmentEnd() { + return alignmentStart + nLoci * (skipNLoci + 1) + readLength; + } + + public int getNSamples() { return samples.size(); } public int expectedNumberOfReads() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java new file mode 100644 index 000000000..bc1e1d7b0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/DummyActiveRegionWalker.java @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; + +import java.util.*; + +/** + * ActiveRegionWalker for unit testing + * + * User: depristo + * Date: 1/15/13 + * Time: 1:28 PM + */ +class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + private EnumSet states = super.desiredReadStates(); + private GenomeLocSortedSet activeRegions = null; + + protected List isActiveCalls = new ArrayList(); + protected Map mappedActiveRegions = new LinkedHashMap(); + + public DummyActiveRegionWalker() { + this(1.0); + } + + public DummyActiveRegionWalker(double constProb) { + this.prob = constProb; + } + + public DummyActiveRegionWalker(EnumSet wantStates) { + this(1.0); + this.states = wantStates; + } + + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions) { + this(1.0); + this.activeRegions = activeRegions; + } + + public void setStates(EnumSet states) { + this.states = states; + } + + @Override + public EnumSet desiredReadStates() { + return states; + } + + @Override + public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); + final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? prob : 0.0; + return new ActivityProfileResult(ref.getLocus(), p); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index c4dadbcce..15d4eec2d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -30,33 +30,26 @@ import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.*; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -80,54 +73,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private final static boolean ENFORCE_CONTRACTS = false; private final static boolean DEBUG = false; - private class DummyActiveRegionWalker extends ActiveRegionWalker { - private final double prob; - private EnumSet states = super.desiredReadStates(); - - protected List isActiveCalls = new ArrayList(); - protected Map mappedActiveRegions = new HashMap(); - - public DummyActiveRegionWalker() { - this.prob = 1.0; - } - - public DummyActiveRegionWalker(double constProb) { - this.prob = constProb; - } - - public DummyActiveRegionWalker(EnumSet wantStates) { - this.prob = 1.0; - this.states = wantStates; - } - - @Override - public EnumSet desiredReadStates() { - return states; - } - - @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - isActiveCalls.add(ref.getLocus()); - return new ActivityProfileResult(ref.getLocus(), prob); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - @DataProvider(name = "TraversalEngineProvider") public Object[][] makeTraversals() { final List traversals = new LinkedList(); @@ -297,7 +242,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } } - @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + @Test(enabled = true, dataProvider = "TraversalEngineProvider") public void testPrimaryReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); @@ -340,7 +285,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { verifyReadMapping(region, "simple20"); } - @Test(enabled = true, dataProvider = "TraversalEngineProvider") + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") public void testNonPrimaryReadMapping(TraverseActiveRegions t) { DummyActiveRegionWalker walker = new DummyActiveRegionWalker( EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); @@ -456,7 +401,11 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) + return getActiveRegions(t, walker, intervals, testBAM); + } + + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final String bam) { + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) t.traverse(walker, dataProvider, 0); t.endTraversal(walker, 0); @@ -516,14 +465,15 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { record.setCigar(cigar); record.setReadString(new String(new char[len]).replace("\0", "A")); record.setBaseQualities(new byte[len]); + record.setReadGroup(new GATKSAMReadGroupRecord(header.getReadGroup("test"))); return record; } - private List createDataProviders(TraverseActiveRegions t, final Walker walker, List intervals, String bamFile) { + private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); - t.initialize(engine, walker); + traverseActiveRegions.initialize(engine, walker); Collection samFiles = new ArrayList(); SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); @@ -539,13 +489,144 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { new ArrayList(), false, (byte)30, false, true); + final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); + List providers = new ArrayList(); for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) { providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); } } return providers; } + + @DataProvider(name = "CombinatorialARTTilingProvider") + public Object[][] makeCombinatorialARTTilingProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + final List> allReadStates = Arrays.asList( + EnumSet.of(ActiveRegionReadState.PRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED) + ); + + final int maxTests = Integer.MAX_VALUE; + int nTests = 0; + for ( final int readLength : Arrays.asList(10, 100) ) { + for ( final int skips : Arrays.asList(0, 1, 10) ) { + for ( final int start : starts ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) { + for ( final int nLoci : Arrays.asList(1, 1000) ) { + for ( EnumSet readStates : allReadStates ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + + for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())) { + nTests++; + if ( nTests < maxTests ) // && nTests == 1238 ) + tests.add(new Object[]{nTests, activeRegions, readStates, bamBuilder}); + } + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private Collection enumerateActiveRegions(final int start, final int stop) { + // should basically cut up entire region into equal sized chunks, of + // size 10, 20, 50, 100, etc, alternating skipping pieces so they are inactive + // Need to make sure we include some edge cases: + final List activeRegions = new LinkedList(); + + for ( final int stepSize : Arrays.asList(11, 29, 53, 97) ) { + for ( final boolean startWithActive : Arrays.asList(true, false) ) { + activeRegions.add(makeActiveRegionMask(start, stop, stepSize, startWithActive)); + } + } + + // active region is the whole interval + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start, stop))); + + // active region extends up to the end of the data, but doesn't include start + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start+10, stop))); + + return activeRegions; + } + + private GenomeLocSortedSet makeActiveRegionMask(final int start, final int stop, final int stepSize, final boolean startWithActive) { + final GenomeLocSortedSet active = new GenomeLocSortedSet(genomeLocParser); + + boolean includeRegion = startWithActive; + for ( int left = start; left < stop; left += stepSize) { + final int right = left + stepSize; + final GenomeLoc region = genomeLocParser.createGenomeLoc("1", left, right); + if ( includeRegion ) + active.add(region); + includeRegion = ! includeRegion; + } + + return active; + } + + + @Test(enabled = true, dataProvider = "CombinatorialARTTilingProvider") + public void testARTReadsInActiveRegions(final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { + logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions); + walker.setStates(readStates); + + final TraverseActiveRegions traversal = new TraverseActiveRegions(); + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString()); + + final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary + for ( final ActiveRegion region : activeRegionsMap.values() ) { + int nReadsExpectedInRegion = 0; + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); + final Set readNamesInRegion = readNamesInRegion(region); + + boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) + ? region.getExtendedLoc().overlapsP(readLoc) + : region.getLocation().overlapsP(readLoc); + + if ( ! readStates.contains(ActiveRegionReadState.NONPRIMARY) ) { + if ( alreadySeenReads.contains(read.getReadName()) ) + shouldBeInRegion = false; + else if ( shouldBeInRegion ) + alreadySeenReads.add(read.getReadName()); + } + + Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, "Region " + region + + " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite"); + + nReadsExpectedInRegion += shouldBeInRegion ? 1 : 0; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } + } + + private Set readNamesInRegion(final ActiveRegion region) { + final Set readNames = new LinkedHashSet(region.getReads().size()); + for ( final SAMRecord read : region.getReads() ) + readNames.add(read.getReadName()); + return readNames; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index 2f984165e..e5e28e1f6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -54,6 +54,32 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { private static final boolean DEBUG = false; protected LocusIteratorByState li; + @Test(enabled = true) + public void testUnmappedAndAllIReadsPassThrough() { + final int readLength = 10; + GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength); + GATKSAMRecord mapped2 = ArtificialSAMUtils.createArtificialRead(header,"mapped2",0,1,readLength); + GATKSAMRecord unmapped = ArtificialSAMUtils.createArtificialRead(header,"unmapped",0,1,readLength); + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(header,"allI",0,1,readLength); + + unmapped.setReadUnmappedFlag(true); + unmapped.setCigarString("*"); + allI.setCigarString(readLength + "I"); + + List reads = Arrays.asList(mapped1, unmapped, allI, mapped2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,createTestReadProperties(DownsamplingMethod.NONE, true)); + + Assert.assertTrue(li.hasNext()); + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 2, "Should see only 2 reads in pileup, even with unmapped and all I reads"); + + final List rawReads = li.transferReadsFromAllPreviousPileups(); + Assert.assertEquals(rawReads, reads, "Input and transferred read lists should be the same, and include the unmapped and all I reads"); + } + @Test(enabled = true && ! DEBUG) public void testXandEQOperators() { final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; @@ -451,7 +477,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false) : new DownsamplingMethod(DownsampleType.NONE, null, null, false); - final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci); + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(header.getSequenceDictionary(), nReadsPerLocus, nLoci); bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1); final List reads = bamBuilder.makeReads(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java index cf3c97b34..2a638eb69 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java @@ -47,8 +47,8 @@ import java.util.List; * To change this template use File | Settings | File Templates. */ public class ArtificialBAMBuilderUnitTest extends BaseTest { - @DataProvider(name = "CombinatorialARTTilingProvider") - public Object[][] makeCombinatorialARTTilingProvider() { + @DataProvider(name = "ArtificialBAMBuilderUnitTestProvider") + public Object[][] makeArtificialBAMBuilderUnitTestProvider() { final List tests = new LinkedList(); final List starts = Arrays.asList( @@ -79,7 +79,7 @@ public class ArtificialBAMBuilderUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "CombinatorialARTTilingProvider") + @Test(dataProvider = "ArtificialBAMBuilderUnitTestProvider") public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) { Assert.assertEquals(bamBuilder.getReadLength(), readLength); Assert.assertEquals(bamBuilder.getSkipNLoci(), skips);