diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8f5e275e6..780934c03 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -67,18 +67,18 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17"); + HCTest(CEUTRIO_BAM, "", "1e2671557b01ad0497557097282965fc"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); + HCTest(NA12878_BAM, "", "2bd237a7e1e63eebe755dbe7963e430a"); } @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "c679ae7f04bdfda896b5c046d35e043c"); + "a938cdd7262968597fc8eb6c1c0a69f1"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "1a034b7eb572e1b6f659d6e5d57b3e76"); + "d590c8d6d5e58d685401b65a23846893"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -129,7 +129,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "50a26224b9e863ee47a0619eb54a0323"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4439496472eb1e2f5c91b30ba525be37")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index a5926aeae..f9d6955c0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -842,7 +842,7 @@ public class GenomeAnalysisEngine { if (argCollection.keepProgramRecords) removeProgramRecords = false; - final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker && argCollection.newART; + final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; return new SAMDataSource( samReaderIDs, diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b6f0d5f90..ab09064dd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -448,10 +448,5 @@ public class GATKArgumentCollection { @Hidden public boolean generateShadowBCF = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - @Hidden - @Argument(fullName="newART", shortName = "newART", doc = "use the new ART traversal", required=false) - public boolean newART = false; - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index c127899f6..371cce778 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -245,12 +245,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } else if (walker instanceof ReadPairWalker) { return new TraverseReadPairs(); } else if (walker instanceof ActiveRegionWalker) { - if ( engine.getArguments().newART ) { - // todo -- create optimized traversal - return new TraverseActiveRegionsOptimized(); - } else { - return new TraverseActiveRegionsOriginal(); - } + return new TraverseActiveRegions(); } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 45dbb6dc8..03aaf95f2 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; @@ -43,8 +44,7 @@ import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** * Created with IntelliJ IDEA. @@ -53,7 +53,7 @@ import java.util.List; * Time: 4:45 PM * To change this template use File | Settings | File Templates. */ -public abstract class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { +public class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { protected final static boolean DEBUG = false; // set by the tranversal @@ -66,14 +66,6 @@ public abstract class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); - abstract protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker); - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public abstract T endTraversal(final Walker walker, T sum); - protected int getActiveRegionExtension() { return activeRegionExtension; } @@ -160,4 +152,204 @@ public abstract class TraverseActiveRegions extends TraversalEngine myReads = new LinkedList(); + private Shard lastShard = null; + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); + + final HashSet maybeDuplicatedReads = new HashSet(); + // TODO -- there's got to be a better way to know this + if ( lastShard != dataProvider.getShard() ) { + maybeDuplicatedReads.addAll(myReads); + logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); + if ( DEBUG ) logger.warn("Clearing myReads"); + } + lastShard = dataProvider.getShard(); + + final LocusView locusView = new AllLocusView(dataProvider); + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + final List activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); + for( final GATKSAMRecord read : reads ) { + notifyOfCurrentPosition(read); + // most of the time maybeDuplicatedReads is empty + // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the + // TODO -- potential list of duplicates we can clear the hashset + if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { + if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); + } else { + if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); + myReads.add((GATKSAMRecord)read); + } + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions); + } + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); + } + + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, false); + + return sum; + } + + private GenomeLoc startOfLiveRegion = null; + + protected void notifyOfCurrentPosition(final GATKSAMRecord read) { + notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read)); + } + + protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { + if ( startOfLiveRegion == null ) + startOfLiveRegion = currentLocation; + else + startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); + } + + protected GenomeLoc getStartOfLiveRegion() { + return startOfLiveRegion; + } + + protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { + return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) + || ! region.onSameContig(getStartOfLiveRegion()); + } + + private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); + } + } + + private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) { + final ActiveRegion activeRegion = workQueue.remove(); + if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion()); + sum = processActiveRegion( activeRegion, sum, walker ); + } else { + break; + } + } + + return sum; + } + + @Override + public String toString() { + return "TraverseActiveRegions"; + } + + private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) { + return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true); + } + + protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { + final Iterator liveReads = myReads.iterator(); + while ( liveReads.hasNext() ) { + boolean killed = false; + final GATKSAMRecord read = liveReads.next(); + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + activeRegion.add(read); + + if ( ! walker.wantsNonPrimaryReads() ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); + liveReads.remove(); + killed = true; + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + + if ( ! killed && readIsDead(read, readLoc, activeRegion) ) { + if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); + liveReads.remove(); + } + } + + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + final M x = walker.map(activeRegion, null); + return walker.reduce( x, sum ); + } + + + /** + * Special function called in LinearMicroScheduler to empty out the work queue. + * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal(final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, true); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java deleted file mode 100644 index 809c7ea6a..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimized.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 12/9/11 - */ - -public class TraverseActiveRegionsOptimized extends TraverseActiveRegions { - private LinkedList myReads = new LinkedList(); - private Shard lastShard = null; - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final HashSet maybeDuplicatedReads = new HashSet(); - // TODO -- there's got to be a better way to know this - if ( lastShard != dataProvider.getShard() ) { - maybeDuplicatedReads.addAll(myReads); - logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads"); - if ( DEBUG ) logger.warn("Clearing myReads"); - } - lastShard = dataProvider.getShard(); - - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); - for( final GATKSAMRecord read : reads ) { - notifyOfCurrentPosition(read); - // most of the time maybeDuplicatedReads is empty - // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the - // TODO -- potential list of duplicates we can clear the hashset - if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) { - if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName()); - } else { - if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider); - myReads.add((GATKSAMRecord)read); - } - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - updateCumulativeMetrics(dataProvider.getShard()); - - if ( ! profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) ); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, false); - - return sum; - } - - private GenomeLoc startOfLiveRegion = null; - - protected void notifyOfCurrentPosition(final GATKSAMRecord read) { - notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read)); - } - - protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) { - if ( startOfLiveRegion == null ) - startOfLiveRegion = currentLocation; - else - startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation()); - } - - protected GenomeLoc getStartOfLiveRegion() { - return startOfLiveRegion; - } - - protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) { - return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0))) - || ! region.onSameContig(getStartOfLiveRegion()); - } - - private T processActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive); - } - } - - private T callWalkerMapOnActiveRegions(final ActiveRegionWalker walker, T sum, final boolean forceRegionsToBeActive) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) { - final ActiveRegion activeRegion = workQueue.remove(); - if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion()); - sum = processActiveRegion( activeRegion, sum, walker ); - } else { - break; - } - } - - return sum; - } - - @Override - public String toString() { - return "TraverseActiveRegionsOptimized"; - } - - private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) { - return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true); - } - - @Override - protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker) { - final Iterator liveReads = myReads.iterator(); - while ( liveReads.hasNext() ) { - boolean killed = false; - final GATKSAMRecord read = liveReads.next(); - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - activeRegion.add(read); - - if ( ! walker.wantsNonPrimaryReads() ) { - if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); - liveReads.remove(); - killed = true; - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - - if ( ! killed && readIsDead(read, readLoc, activeRegion) ) { - if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion()); - liveReads.remove(); - } - } - - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map(activeRegion, null); - return walker.reduce( x, sum ); - } - - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - @Override - public T endTraversal(final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, true); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java deleted file mode 100644 index 0786bc800..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginal.java +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.traversals; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 12/9/11 - */ - -public class TraverseActiveRegionsOriginal extends TraverseActiveRegions { - private final LinkedHashSet myReads = new LinkedHashSet(); - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); - final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - - int minStart = Integer.MAX_VALUE; - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - // TODO -- this whole HashSet logic should be changed to a linked list of reads with - // TODO -- subsequent pass over them to find the ones overlapping the active regions - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); - } - - // If this is the last pileup for this shard calculate the minimum alignment start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - updateCumulativeMetrics(dataProvider.getShard()); - - if ( ! profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); - - return sum; - } - - /** - * Take the individual isActive calls and integrate them into contiguous active regions and - * add these blocks of work to the work queue - * band-pass filter the list of isActive probabilities and turn into active regions - * - * @param profile - * @param activeRegions - * @param activeRegionExtension - * @param maxRegionSize - * @return - */ - private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, - final List activeRegions, - final int activeRegionExtension, - final int maxRegionSize) { - if ( profile.isEmpty() ) - throw new IllegalStateException("trying to incorporate an empty active profile " + profile); - - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); - return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); - } - - // -------------------------------------------------------------------------------- - // - // code to handle processing active regions - // - // -------------------------------------------------------------------------------- - - private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); - } - } - - private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, sum, walker ); - } else { - break; - } - } - - return sum; - } - - @Override - protected T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { - final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : myReads ) { - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - ActiveRegion bestRegion = activeRegion; - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); - bestRegion = otherRegionToTest; - } - } - bestRegion.add( read ); - - // The read is also added to all other regions in which it overlaps but marked as non-primary - if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(activeRegion) ) { - activeRegion.add( read ); - } - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( !bestRegion.equals(otherRegionToTest) ) { - // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } - } - } - } - placedReads.add( read ); - // check for non-primary vs. extended - } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - activeRegion.add( read ); - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - } - myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region - // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. - - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map( activeRegion, null ); - return walker.reduce( x, sum ); - } - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public T endTraversal( final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker) walker, sum, Integer.MAX_VALUE, null); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java deleted file mode 100644 index 35a0931df..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOriginalUnitTest.java +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.traversals; - -import com.google.java.contract.PreconditionError; -import net.sf.samtools.*; -import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.datasources.reads.*; -import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - - -/** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 1/10/13 - * Time: 8:03 PM - * To change this template use File | Settings | File Templates. - */ -public class TraverseActiveRegionsOriginalUnitTest extends BaseTest { - - private class DummyActiveRegionWalker extends ActiveRegionWalker { - private final double prob; - private EnumSet states = super.desiredReadStates(); - - protected List isActiveCalls = new ArrayList(); - protected Map mappedActiveRegions = new HashMap(); - - public DummyActiveRegionWalker() { - this.prob = 1.0; - } - - public DummyActiveRegionWalker(double constProb) { - this.prob = constProb; - } - - public DummyActiveRegionWalker(EnumSet wantStates) { - this.prob = 1.0; - this.states = wantStates; - } - - @Override - public EnumSet desiredReadStates() { - return states; - } - - @Override - public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - isActiveCalls.add(ref.getLocus()); - return new ActivityProfileResult(ref.getLocus(), prob); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - - private final TraverseActiveRegions t = new TraverseActiveRegionsOriginal(); - - private IndexedFastaSequenceFile reference; - private SAMSequenceDictionary dictionary; - private GenomeLocParser genomeLocParser; - - private List intervals; - - private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; - private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; - - @BeforeClass - private void init() throws FileNotFoundException { - reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); - dictionary = reference.getSequenceDictionary(); - genomeLocParser = new GenomeLocParser(dictionary); - - // TODO: reads with indels - // TODO: reads which span many regions - // TODO: reads which are partially between intervals (in/outside extension) - // TODO: duplicate reads - // TODO: read at the end of a contig - // TODO: reads which are completely outside intervals but within extension - // TODO: test the extension itself - // TODO: unmapped reads - - intervals = new ArrayList(); - intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); - intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); - intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); - - List reads = new ArrayList(); - reads.add(buildSAMRecord("simple", "1", 100, 200)); - reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); - reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); - reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); - reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); - reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000)); - reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); - reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); - reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); - reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); - reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); - reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); - reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); - - createBAM(reads); - } - - private void createBAM(List reads) { - File outFile = new File(testBAM); - outFile.deleteOnExit(); - File indexFile = new File(testBAI); - indexFile.deleteOnExit(); - - SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile); - for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { - out.addAlignment(read); - } - out.close(); - } - - @Test - public void testAllBasesSeen() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - List activeIntervals = getIsActiveIntervals(walker, intervals); - // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call - verifyEqualIntervals(intervals, activeIntervals); - } - - private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { - List activeIntervals = new ArrayList(); - for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) { - t.traverse(walker, dataProvider, 0); - activeIntervals.addAll(walker.isActiveCalls); - } - - return activeIntervals; - } - - @Test (expectedExceptions = PreconditionError.class) - public void testIsActiveRangeLow () { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); - getActiveRegions(walker, intervals).values(); - } - - @Test (expectedExceptions = PreconditionError.class) - public void testIsActiveRangeHigh () { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); - getActiveRegions(walker, intervals).values(); - } - - @Test - public void testActiveRegionCoverage() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - Collection activeRegions = getActiveRegions(walker, intervals).values(); - verifyActiveRegionCoverage(intervals, activeRegions); - } - - private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { - List intervalStarts = new ArrayList(); - List intervalStops = new ArrayList(); - - for (GenomeLoc interval : intervals) { - intervalStarts.add(interval.getStartLocation()); - intervalStops.add(interval.getStopLocation()); - } - - Map baseRegionMap = new HashMap(); - - for (ActiveRegion activeRegion : activeRegions) { - for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { - // Contract: Regions do not overlap - Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); - baseRegionMap.put(activeLoc, activeRegion); - } - - GenomeLoc start = activeRegion.getLocation().getStartLocation(); - if (intervalStarts.contains(start)) - intervalStarts.remove(start); - - GenomeLoc stop = activeRegion.getLocation().getStopLocation(); - if (intervalStops.contains(stop)) - intervalStops.remove(stop); - } - - for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { - // Contract: Each location in the interval(s) is in exactly one region - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); - baseRegionMap.remove(baseLoc); - } - - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); - - // Contract: All explicit interval boundaries must also be region boundaries - Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); - Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); - } - - @Test - public void testActiveRegionExtensionOnContig() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - Collection activeRegions = getActiveRegions(walker, intervals).values(); - for (ActiveRegion activeRegion : activeRegions) { - GenomeLoc loc = activeRegion.getExtendedLoc(); - - // Contract: active region extensions must stay on the contig - Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); - int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); - Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); - } - } - - @Test - public void testPrimaryReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_unequal", "extended_and_np", "boundary_1_pre"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testNonPrimaryReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY)); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testExtendedReadMapping() { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker( - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test - public void testUnmappedReads() { - // TODO - } - - private void verifyReadMapping(ActiveRegion region, String... reads) { - Collection wantReads = new ArrayList(Arrays.asList(reads)); - for (SAMRecord read : region.getReads()) { - String regionReadName = read.getReadName(); - Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " assigned to active region " + region); - wantReads.remove(regionReadName); - } - - Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region); - } - - private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) - t.traverse(walker, dataProvider, 0); - - t.endTraversal(walker, 0); - - return walker.mappedActiveRegions; - } - - private Collection toSingleBaseLocs(GenomeLoc interval) { - List bases = new ArrayList(); - if (interval.size() == 1) - bases.add(interval); - else { - for (int location = interval.getStart(); location <= interval.getStop(); location++) - bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); - } - - return bases; - } - - private Collection toSingleBaseLocs(List intervals) { - Set bases = new TreeSet(); // for sorting and uniqueness - for (GenomeLoc interval : intervals) - bases.addAll(toSingleBaseLocs(interval)); - - return bases; - } - - private void verifyEqualIntervals(List aIntervals, List bIntervals) { - Collection aBases = toSingleBaseLocs(aIntervals); - Collection bBases = toSingleBaseLocs(bIntervals); - - Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); - - Iterator aIter = aBases.iterator(); - Iterator bIter = bBases.iterator(); - while (aIter.hasNext() && bIter.hasNext()) { - GenomeLoc aLoc = aIter.next(); - GenomeLoc bLoc = bIter.next(); - Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc); - } - } - - // copied from LocusViewTemplate - protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { - SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); - header.setSequenceDictionary(dictionary); - header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - GATKSAMRecord record = new GATKSAMRecord(header); - - record.setReadName(readName); - record.setReferenceIndex(dictionary.getSequenceIndex(contig)); - record.setAlignmentStart(alignmentStart); - - Cigar cigar = new Cigar(); - int len = alignmentEnd - alignmentStart + 1; - cigar.add(new CigarElement(len, CigarOperator.M)); - record.setCigar(cigar); - record.setReadString(new String(new char[len]).replace("\0", "A")); - record.setBaseQualities(new byte[len]); - - return record; - } - - private List createDataProviders(final Walker walker, List intervals, String bamFile) { - GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setGenomeLocParser(genomeLocParser); - t.initialize(engine, walker); - - Collection samFiles = new ArrayList(); - SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); - samFiles.add(readerID); - - SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser); - - List providers = new ArrayList(); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { - providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); - } - } - - return providers; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 038cd2853..c4dadbcce 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsOptimizedUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -76,7 +76,7 @@ import java.util.*; * Test the Active Region Traversal Contract * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract */ -public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { +public class TraverseActiveRegionsUnitTest extends BaseTest { private final static boolean ENFORCE_CONTRACTS = false; private final static boolean DEBUG = false; @@ -131,7 +131,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { @DataProvider(name = "TraversalEngineProvider") public Object[][] makeTraversals() { final List traversals = new LinkedList(); - traversals.add(new Object[]{new TraverseActiveRegionsOptimized()}); + traversals.add(new Object[]{new TraverseActiveRegions()}); return traversals.toArray(new Object[][]{}); } @@ -537,7 +537,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, t instanceof TraverseActiveRegionsOptimized); + false, (byte)30, false, true); List providers = new ArrayList(); for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) {