Retire original TraverseActiveRegion, leaving only the new optimized version

-- Required some updates to MD5s, which was unexpected, and will be sorted out later with more detailed unit tests
This commit is contained in:
Mark DePristo 2013-01-15 10:24:45 -05:00
parent 39bc9e999d
commit 3c37ea014b
9 changed files with 214 additions and 1070 deletions

View File

@ -67,18 +67,18 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17");
HCTest(CEUTRIO_BAM, "", "1e2671557b01ad0497557097282965fc");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15");
HCTest(NA12878_BAM, "", "2bd237a7e1e63eebe755dbe7963e430a");
}
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"c679ae7f04bdfda896b5c046d35e043c");
"a938cdd7262968597fc8eb6c1c0a69f1");
}
private void HCTestComplexGGA(String bam, String args, String md5) {
@ -96,7 +96,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"1a034b7eb572e1b6f659d6e5d57b3e76");
"d590c8d6d5e58d685401b65a23846893");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -129,7 +129,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "50a26224b9e863ee47a0619eb54a0323");
}
// That problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -140,7 +140,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4439496472eb1e2f5c91b30ba525be37"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}

View File

@ -842,7 +842,7 @@ public class GenomeAnalysisEngine {
if (argCollection.keepProgramRecords)
removeProgramRecords = false;
final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker && argCollection.newART;
final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker;
return new SAMDataSource(
samReaderIDs,

View File

@ -448,10 +448,5 @@ public class GATKArgumentCollection {
@Hidden
public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
@Hidden
@Argument(fullName="newART", shortName = "newART", doc = "use the new ART traversal", required=false)
public boolean newART = false;
}

View File

@ -245,12 +245,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
} else if (walker instanceof ReadPairWalker) {
return new TraverseReadPairs();
} else if (walker instanceof ActiveRegionWalker) {
if ( engine.getArguments().newART ) {
// todo -- create optimized traversal
return new TraverseActiveRegionsOptimized();
} else {
return new TraverseActiveRegionsOriginal();
}
return new TraverseActiveRegions();
} else {
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
}

View File

@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
@ -43,8 +44,7 @@ import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.LinkedList;
import java.util.List;
import java.util.*;
/**
* Created with IntelliJ IDEA.
@ -53,7 +53,7 @@ import java.util.List;
* Time: 4:45 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
protected final static boolean DEBUG = false;
// set by the traversal
@ -66,14 +66,6 @@ public abstract class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Ac
protected final static Logger logger = Logger.getLogger(TraversalEngine.class);
protected final LinkedList<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
abstract protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker);
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
public abstract T endTraversal(final Walker<M, T> walker, T sum);
protected int getActiveRegionExtension() {
return activeRegionExtension;
}
@ -160,4 +152,204 @@ public abstract class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Ac
}
}
}
private LinkedList<GATKSAMRecord> myReads = new LinkedList<GATKSAMRecord>();
private Shard lastShard = null;
@Override
public T traverse( final ActiveRegionWalker<M,T> walker,
final LocusShardDataProvider dataProvider,
T sum) {
if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
final HashSet<GATKSAMRecord> maybeDuplicatedReads = new HashSet<GATKSAMRecord>();
// TODO -- there's got to be a better way to know this
if ( lastShard != dataProvider.getShard() ) {
maybeDuplicatedReads.addAll(myReads);
logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads");
if ( DEBUG ) logger.warn("Clearing myReads");
}
lastShard = dataProvider.getShard();
final LocusView locusView = new AllLocusView(dataProvider);
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
// We keep processing while the next reference location is within the interval
GenomeLoc prevLoc = null;
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
final GenomeLoc location = locus.getLocation();
// Grab all the previously unseen reads from this pileup and add them to the massive read list
// Note that this must occur before we skip locations outside the intervals, because
// reads may start outside our intervals but overlap them in the future
final Collection<GATKSAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
for( final GATKSAMRecord read : reads ) {
notifyOfCurrentPosition(read);
// most of the time maybeDuplicatedReads is empty
// TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the
// TODO -- potential list of duplicates we can clear the hashset
if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) {
if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName());
} else {
if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider);
myReads.add((GATKSAMRecord)read);
}
}
// skip this location -- it's not part of our engine intervals
if ( outsideEngineIntervals(location) )
continue;
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
// we've moved across some interval boundary, restart profile
profile = incorporateActiveRegions(profile, activeRegions);
}
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
prevLoc = location;
printProgress(locus.getLocation());
}
updateCumulativeMetrics(dataProvider.getShard());
if ( ! profile.isEmpty() )
incorporateActiveRegions(profile, activeRegions);
// add active regions to queue of regions to process
// first check if can merge active regions over shard boundaries
if( !activeRegions.isEmpty() ) {
if( !workQueue.isEmpty() ) {
final ActiveRegion last = workQueue.getLast();
final ActiveRegion first = activeRegions.get(0);
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) {
workQueue.removeLast();
activeRegions.remove(first);
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) );
}
}
workQueue.addAll( activeRegions );
}
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
// now go and process all of the active regions
sum = processActiveRegions(walker, sum, false);
return sum;
}
private GenomeLoc startOfLiveRegion = null;
protected void notifyOfCurrentPosition(final GATKSAMRecord read) {
notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read));
}
protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) {
if ( startOfLiveRegion == null )
startOfLiveRegion = currentLocation;
else
startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation());
}
protected GenomeLoc getStartOfLiveRegion() {
return startOfLiveRegion;
}
protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) {
return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0)))
|| ! region.onSameContig(getStartOfLiveRegion());
}
private T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
if( walker.activeRegionOutStream != null ) {
writeActiveRegionsToStream(walker);
return sum;
} else {
return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive);
}
}
private T callWalkerMapOnActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
// TODO can implement parallel traversal here
while( workQueue.peek() != null ) {
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) {
final ActiveRegion activeRegion = workQueue.remove();
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion());
sum = processActiveRegion( activeRegion, sum, walker );
} else {
break;
}
}
return sum;
}
@Override
public String toString() {
return "TraverseActiveRegions";
}
private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) {
return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true);
}
protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
final Iterator<GATKSAMRecord> liveReads = myReads.iterator();
while ( liveReads.hasNext() ) {
boolean killed = false;
final GATKSAMRecord read = liveReads.next();
final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
if( activeRegion.getLocation().overlapsP( readLoc ) ) {
activeRegion.add(read);
if ( ! walker.wantsNonPrimaryReads() ) {
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
liveReads.remove();
killed = true;
}
} else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
activeRegion.add( read );
}
if ( ! killed && readIsDead(read, readLoc, activeRegion) ) {
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
liveReads.remove();
}
}
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
final M x = walker.map(activeRegion, null);
return walker.reduce( x, sum );
}
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
public T endTraversal(final Walker<M, T> walker, T sum) {
return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true);
}
}

View File

@ -1,253 +0,0 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 12/9/11
*/
public class TraverseActiveRegionsOptimized<M,T> extends TraverseActiveRegions<M,T> {
private LinkedList<GATKSAMRecord> myReads = new LinkedList<GATKSAMRecord>();
private Shard lastShard = null;
@Override
public T traverse( final ActiveRegionWalker<M,T> walker,
final LocusShardDataProvider dataProvider,
T sum) {
if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
final HashSet<GATKSAMRecord> maybeDuplicatedReads = new HashSet<GATKSAMRecord>();
// TODO -- there's got to be a better way to know this
if ( lastShard != dataProvider.getShard() ) {
maybeDuplicatedReads.addAll(myReads);
logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads");
if ( DEBUG ) logger.warn("Clearing myReads");
}
lastShard = dataProvider.getShard();
final LocusView locusView = new AllLocusView(dataProvider);
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
// We keep processing while the next reference location is within the interval
GenomeLoc prevLoc = null;
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
final GenomeLoc location = locus.getLocation();
// Grab all the previously unseen reads from this pileup and add them to the massive read list
// Note that this must occur before we skip locations outside the intervals, because
// reads may start outside our intervals but overlap them in the future
final Collection<GATKSAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
for( final GATKSAMRecord read : reads ) {
notifyOfCurrentPosition(read);
// most of the time maybeDuplicatedReads is empty
// TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the
// TODO -- potential list of duplicates we can clear the hashset
if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) {
if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName());
} else {
if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider);
myReads.add((GATKSAMRecord)read);
}
}
// skip this location -- it's not part of our engine intervals
if ( outsideEngineIntervals(location) )
continue;
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
// we've moved across some interval boundary, restart profile
profile = incorporateActiveRegions(profile, activeRegions);
}
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
prevLoc = location;
printProgress(locus.getLocation());
}
updateCumulativeMetrics(dataProvider.getShard());
if ( ! profile.isEmpty() )
incorporateActiveRegions(profile, activeRegions);
// add active regions to queue of regions to process
// first check if can merge active regions over shard boundaries
if( !activeRegions.isEmpty() ) {
if( !workQueue.isEmpty() ) {
final ActiveRegion last = workQueue.getLast();
final ActiveRegion first = activeRegions.get(0);
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) {
workQueue.removeLast();
activeRegions.remove(first);
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) );
}
}
workQueue.addAll( activeRegions );
}
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
// now go and process all of the active regions
sum = processActiveRegions(walker, sum, false);
return sum;
}
private GenomeLoc startOfLiveRegion = null;
protected void notifyOfCurrentPosition(final GATKSAMRecord read) {
notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read));
}
protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) {
if ( startOfLiveRegion == null )
startOfLiveRegion = currentLocation;
else
startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation());
}
protected GenomeLoc getStartOfLiveRegion() {
return startOfLiveRegion;
}
protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) {
return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0)))
|| ! region.onSameContig(getStartOfLiveRegion());
}
private T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
if( walker.activeRegionOutStream != null ) {
writeActiveRegionsToStream(walker);
return sum;
} else {
return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive);
}
}
private T callWalkerMapOnActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
// TODO can implement parallel traversal here
while( workQueue.peek() != null ) {
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) {
final ActiveRegion activeRegion = workQueue.remove();
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion());
sum = processActiveRegion( activeRegion, sum, walker );
} else {
break;
}
}
return sum;
}
@Override
public String toString() {
return "TraverseActiveRegionsOptimized";
}
private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) {
return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true);
}
@Override
protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
final Iterator<GATKSAMRecord> liveReads = myReads.iterator();
while ( liveReads.hasNext() ) {
boolean killed = false;
final GATKSAMRecord read = liveReads.next();
final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
if( activeRegion.getLocation().overlapsP( readLoc ) ) {
activeRegion.add(read);
if ( ! walker.wantsNonPrimaryReads() ) {
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
liveReads.remove();
killed = true;
}
} else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
activeRegion.add( read );
}
if ( ! killed && readIsDead(read, readLoc, activeRegion) ) {
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
liveReads.remove();
}
}
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
final M x = walker.map(activeRegion, null);
return walker.reduce( x, sum );
}
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
@Override
public T endTraversal(final Walker<M, T> walker, T sum) {
return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true);
}
}

View File

@ -1,262 +0,0 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: 12/9/11
*/
public class TraverseActiveRegionsOriginal<M,T> extends TraverseActiveRegions<M,T> {
private final LinkedHashSet<GATKSAMRecord> myReads = new LinkedHashSet<GATKSAMRecord>();
@Override
public T traverse( final ActiveRegionWalker<M,T> walker,
final LocusShardDataProvider dataProvider,
T sum) {
logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
final LocusView locusView = new AllLocusView(dataProvider);
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
int minStart = Integer.MAX_VALUE;
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
// We keep processing while the next reference location is within the interval
GenomeLoc prevLoc = null;
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
final GenomeLoc location = locus.getLocation();
// Grab all the previously unseen reads from this pileup and add them to the massive read list
// Note that this must occur before we skip locations outside the intervals, because
// reads may start outside our intervals but overlap them in the future
// TODO -- this whole HashSet logic should be changed to a linked list of reads with
// TODO -- subsequent pass over them to find the ones overlapping the active regions
for( final PileupElement p : locus.getBasePileup() ) {
final GATKSAMRecord read = p.getRead();
if( !myReads.contains(read) ) {
myReads.add(read);
}
// If this is the last pileup for this shard calculate the minimum alignment start so that we know
// which active regions in the work queue are now safe to process
minStart = Math.min(minStart, read.getAlignmentStart());
}
// skip this location -- it's not part of our engine intervals
if ( outsideEngineIntervals(location) )
continue;
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
// we've moved across some interval boundary, restart profile
profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
}
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
// Call the walkers isActive function for this locus and add them to the list to be integrated later
profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
prevLoc = location;
printProgress(locus.getLocation());
}
updateCumulativeMetrics(dataProvider.getShard());
if ( ! profile.isEmpty() )
incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
// add active regions to queue of regions to process
// first check if can merge active regions over shard boundaries
if( !activeRegions.isEmpty() ) {
if( !workQueue.isEmpty() ) {
final ActiveRegion last = workQueue.getLast();
final ActiveRegion first = activeRegions.get(0);
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
workQueue.removeLast();
activeRegions.remove(first);
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
}
}
workQueue.addAll( activeRegions );
}
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
// now go and process all of the active regions
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
return sum;
}
/**
* Take the individual isActive calls and integrate them into contiguous active regions and
* add these blocks of work to the work queue
* band-pass filter the list of isActive probabilities and turn into active regions
*
* @param profile
* @param activeRegions
* @param activeRegionExtension
* @param maxRegionSize
* @return
*/
private ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
final List<ActiveRegion> activeRegions,
final int activeRegionExtension,
final int maxRegionSize) {
if ( profile.isEmpty() )
throw new IllegalStateException("trying to incorporate an empty active profile " + profile);
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ));
return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() );
}
// --------------------------------------------------------------------------------
//
// code to handle processing active regions
//
// --------------------------------------------------------------------------------

/**
 * Dispatch the regions that are ready to be processed: when the walker requested a
 * debug output stream the regions are written there instead of being mapped.
 *
 * @param walker        the active-region walker driving this traversal
 * @param sum           the reduce value accumulated so far
 * @param minStart      regions whose extended stop lies before this position are ready
 * @param currentContig contig currently being traversed; may be null at end of traversal
 * @return the (possibly updated) reduce value
 */
private T processActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
    if( walker.activeRegionOutStream == null ) {
        return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig);
    }
    // debug mode: regions are dumped to the requested stream, sum passes through unchanged
    writeActiveRegionsToStream(walker);
    return sum;
}
/**
 * Pop and process every region at the head of the work queue that the traversal has
 * already moved sufficiently past — either by position ({@code minStart}) or because
 * we have left its contig entirely — handing each one to the walker's map/reduce.
 *
 * @param walker        the active-region walker whose map/reduce is invoked
 * @param sum           the reduce value accumulated so far
 * @param minStart      a queued region is processed once its extended stop is before this
 * @param currentContig contig currently being traversed; null means "process regardless of contig"
 * @return the reduce value after processing every ready region
 */
private T callWalkerMapOnActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
    // Since we've traversed sufficiently past this point (or this contig!) in the workQueue
    // we can unload those regions and process them
    // TODO can implement parallel traversal here
    for ( ActiveRegion head = workQueue.peek(); head != null; head = workQueue.peek() ) {
        final GenomeLoc headLoc = head.getExtendedLoc();
        // stop as soon as the head region could still receive reads from upcoming loci
        if ( headLoc.getStop() >= minStart
                && (currentContig == null || headLoc.getContig().equals(currentContig)) ) {
            break;
        }
        sum = processActiveRegion( workQueue.remove(), sum, walker );
    }
    return sum;
}
/**
 * Place every buffered read into its region(s) according to the walker's desired read
 * states, then invoke the walker's map/reduce on the given region.
 *
 * Placement rules (per the Active Region Traversal contract comments below):
 * a read overlapping this region is assigned one "primary" region — the region in the
 * queue with the largest overlap, ties broken toward the right-most region — and is
 * optionally also added to every other overlapping region (non-primary) or to regions
 * it only reaches via their extension (extended), when the walker asks for those states.
 *
 * @param activeRegion the region to hand to the walker's map() after read placement
 * @param sum          the reduce value accumulated so far
 * @param walker       the walker whose map/reduce is invoked
 * @return the reduce value after folding in this region's map result
 */
@Override
protected T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M,T> walker ) {
    // reads successfully assigned a primary region; removed from myReads afterwards
    final ArrayList<GATKSAMRecord> placedReads = new ArrayList<GATKSAMRecord>();
    for( final GATKSAMRecord read : myReads ) {
        final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
        if( activeRegion.getLocation().overlapsP( readLoc ) ) {
            // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region)
            long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
            ActiveRegion bestRegion = activeRegion;
            for( final ActiveRegion otherRegionToTest : workQueue ) {
                // >= means a later (right-most) queued region wins ties
                if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
                    maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc );
                    bestRegion = otherRegionToTest;
                }
            }
            bestRegion.add( read );

            // The read is also added to all other regions in which it overlaps but marked as non-primary
            if( walker.wantsNonPrimaryReads() ) {
                if( !bestRegion.equals(activeRegion) ) {
                    activeRegion.add( read );
                }
                for( final ActiveRegion otherRegionToTest : workQueue ) {
                    if( !bestRegion.equals(otherRegionToTest) ) {
                        // check for non-primary vs. extended
                        if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) {
                            otherRegionToTest.add( read );
                        } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
                            otherRegionToTest.add( read );
                        }
                    }
                }
            }
            placedReads.add( read );
        // check for non-primary vs. extended
        }
        // NOTE(review): this condition is byte-identical to the `if` above, so this branch
        // is unreachable dead code. It was presumably meant to test getExtendedLoc() (the
        // non-primary counterpart to the extended branch below) — confirm intent and fix.
        else if( activeRegion.getLocation().overlapsP( readLoc ) ) {
            if ( walker.wantsNonPrimaryReads() ) {
                activeRegion.add( read );
            }
        } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
            // read only reaches this region through its extension
            activeRegion.add( read );
        }
    }
    myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
    // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way.

    logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
    final M x = walker.map( activeRegion, null );
    return walker.reduce( x, sum );
}
/**
 * Drain every region still sitting in the work queue at the end of the traversal.
 * Special function called in LinearMicroScheduler; ugly for now but will be cleaned
 * up when we push this functionality more into the engine.
 *
 * @param walker the walker driving this traversal (must be an ActiveRegionWalker)
 * @param sum    the reduce value accumulated so far
 * @return the reduce value after processing all remaining regions
 */
public T endTraversal( final Walker<M,T> walker, T sum) {
    final ActiveRegionWalker<M, T> regionWalker = (ActiveRegionWalker<M, T>) walker;
    // minStart of MAX_VALUE plus a null contig forces every queued region to be processed
    return processActiveRegions(regionWalker, sum, Integer.MAX_VALUE, null);
}
}

View File

@ -1,523 +0,0 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import com.google.java.contract.PreconditionError;
import net.sf.samtools.*;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.executive.WindowMaker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
 * Unit tests for the original (pre-optimization) active-region traversal engine.
 *
 * Exercises the Active Region Traversal contract against a synthetic BAM built in
 * {@link #init()}: every base in the analysis intervals is visited by isActive(),
 * the created active regions tile the intervals exactly without overlap, region
 * extensions stay on-contig, and reads are assigned to regions in the expected
 * primary / non-primary / extended states.
 */
public class TraverseActiveRegionsOriginalUnitTest extends BaseTest {

    /**
     * Minimal ActiveRegionWalker that records which loci isActive() was asked about
     * and which regions were handed to map(), so tests can inspect the traversal.
     */
    private class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
        // constant activity probability returned for every locus
        private final double prob;
        // read states this walker requests from the traversal (defaults to the base walker's set)
        private EnumSet<ActiveRegionReadState> states = super.desiredReadStates();
        // every locus passed to isActive(), in call order
        protected List<GenomeLoc> isActiveCalls = new ArrayList<GenomeLoc>();
        // region location -> region, for every region passed to map()
        protected Map<GenomeLoc, ActiveRegion> mappedActiveRegions = new HashMap<GenomeLoc, ActiveRegion>();

        public DummyActiveRegionWalker() {
            this.prob = 1.0;
        }

        public DummyActiveRegionWalker(double constProb) {
            this.prob = constProb;
        }

        public DummyActiveRegionWalker(EnumSet<ActiveRegionReadState> wantStates) {
            this.prob = 1.0;
            this.states = wantStates;
        }

        @Override
        public EnumSet<ActiveRegionReadState> desiredReadStates() {
            return states;
        }

        @Override
        public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
            // record the visited locus and report the fixed probability
            isActiveCalls.add(ref.getLocus());
            return new ActivityProfileResult(ref.getLocus(), prob);
        }

        @Override
        public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) {
            // remember each region handed to map(), keyed by its location
            mappedActiveRegions.put(activeRegion.getLocation(), activeRegion);
            return 0;
        }

        @Override
        public Integer reduceInit() {
            return 0;
        }

        @Override
        public Integer reduce(Integer value, Integer sum) {
            return 0;
        }
    }

    // traversal engine under test (the retired "original" implementation)
    private final TraverseActiveRegions<Integer, Integer> t = new TraverseActiveRegionsOriginal<Integer, Integer>();

    private IndexedFastaSequenceFile reference;
    private SAMSequenceDictionary dictionary;
    private GenomeLocParser genomeLocParser;
    private List<GenomeLoc> intervals;

    // synthetic BAM (and index) written once in init(); both are deleted on JVM exit
    private static final String testBAM = "TraverseActiveRegionsUnitTest.bam";
    private static final String testBAI = "TraverseActiveRegionsUnitTest.bai";

    /**
     * Build the shared fixture: reference, analysis intervals, and a coordinate-sorted
     * BAM whose read names describe the placement scenario each read exercises.
     */
    @BeforeClass
    private void init() throws FileNotFoundException {
        reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference));
        dictionary = reference.getSequenceDictionary();
        genomeLocParser = new GenomeLocParser(dictionary);

        // TODO: reads with indels
        // TODO: reads which span many regions
        // TODO: reads which are partially between intervals (in/outside extension)
        // TODO: duplicate reads
        // TODO: read at the end of a contig
        // TODO: reads which are completely outside intervals but within extension
        // TODO: test the extension itself
        // TODO: unmapped reads

        intervals = new ArrayList<GenomeLoc>();
        intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20));
        intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999));
        intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999));
        intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999));
        intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000));
        intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100));
        intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100));
        // overlapping entries (e.g. 1:10-20 inside 1:1-999) are merged away here
        intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList();

        // each read name encodes the region-assignment scenario it is meant to cover
        List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();

        reads.add(buildSAMRecord("simple", "1", 100, 200));
        reads.add(buildSAMRecord("overlap_equal", "1", 10, 20));
        reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21));
        reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009));
        reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008));
        reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000));
        reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050));
        reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990));
        reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000));
        reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385));
        reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400));
        reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414));
        reads.add(buildSAMRecord("simple20", "20", 10025, 10075));

        createBAM(reads);
    }

    /**
     * Write the given reads (coordinate-sorted) to the test BAM, creating an index;
     * both files are scheduled for deletion on exit.
     */
    private void createBAM(List<GATKSAMRecord> reads) {
        File outFile = new File(testBAM);
        outFile.deleteOnExit();
        File indexFile = new File(testBAI);
        indexFile.deleteOnExit();

        SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile);
        for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) {
            out.addAlignment(read);
        }
        out.close();
    }

    @Test
    public void testAllBasesSeen() {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker();

        List<GenomeLoc> activeIntervals = getIsActiveIntervals(walker, intervals);
        // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call
        verifyEqualIntervals(intervals, activeIntervals);
    }

    // run the traversal over the intervals and collect every locus isActive() saw
    private List<GenomeLoc> getIsActiveIntervals(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
        List<GenomeLoc> activeIntervals = new ArrayList<GenomeLoc>();
        for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) {
            t.traverse(walker, dataProvider, 0);
            activeIntervals.addAll(walker.isActiveCalls);
        }

        return activeIntervals;
    }

    // probabilities outside [0, 1] must violate the isActive() precondition contract
    @Test (expectedExceptions = PreconditionError.class)
    public void testIsActiveRangeLow () {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1);
        getActiveRegions(walker, intervals).values();
    }

    @Test (expectedExceptions = PreconditionError.class)
    public void testIsActiveRangeHigh () {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1);
        getActiveRegions(walker, intervals).values();
    }

    @Test
    public void testActiveRegionCoverage() {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker();

        Collection<ActiveRegion> activeRegions = getActiveRegions(walker, intervals).values();
        verifyActiveRegionCoverage(intervals, activeRegions);
    }

    /**
     * Assert that the regions tile the intervals exactly: no overlap, no gaps, no
     * bases outside the intervals, and region boundaries matching interval boundaries.
     */
    private void verifyActiveRegionCoverage(List<GenomeLoc> intervals, Collection<ActiveRegion> activeRegions) {
        List<GenomeLoc> intervalStarts = new ArrayList<GenomeLoc>();
        List<GenomeLoc> intervalStops = new ArrayList<GenomeLoc>();

        for (GenomeLoc interval : intervals) {
            intervalStarts.add(interval.getStartLocation());
            intervalStops.add(interval.getStopLocation());
        }

        Map<GenomeLoc, ActiveRegion> baseRegionMap = new HashMap<GenomeLoc, ActiveRegion>();

        for (ActiveRegion activeRegion : activeRegions) {
            for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) {
                // Contract: Regions do not overlap
                Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region");
                baseRegionMap.put(activeLoc, activeRegion);
            }

            GenomeLoc start = activeRegion.getLocation().getStartLocation();
            if (intervalStarts.contains(start))
                intervalStarts.remove(start);

            GenomeLoc stop = activeRegion.getLocation().getStopLocation();
            if (intervalStops.contains(stop))
                intervalStops.remove(stop);
        }

        for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) {
            // Contract: Each location in the interval(s) is in exactly one region
            // Contract: The total set of regions exactly matches the analysis interval(s)
            Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region");
            baseRegionMap.remove(baseLoc);
        }

        // Contract: The total set of regions exactly matches the analysis interval(s)
        Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals");

        // Contract: All explicit interval boundaries must also be region boundaries
        Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location");
        Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location");
    }

    @Test
    public void testActiveRegionExtensionOnContig() {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker();

        Collection<ActiveRegion> activeRegions = getActiveRegions(walker, intervals).values();
        for (ActiveRegion activeRegion : activeRegions) {
            GenomeLoc loc = activeRegion.getExtendedLoc();

            // Contract: active region extensions must stay on the contig
            Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig");
            int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength();
            Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig");
        }
    }

    @Test
    public void testPrimaryReadMapping() {
        // default walker requests only the Primary read state
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker();

        // Contract: Each read has the Primary state in a single region (or none)
        // This is the region of maximum overlap for the read (earlier if tied)

        // simple: Primary in 1:1-999
        // overlap_equal: Primary in 1:1-999
        // overlap_unequal: Primary in 1:1-999
        // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
        // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
        // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
        // outside_intervals: none
        // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
        // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
        // simple20: Primary in 20:10000-10100

        Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
        ActiveRegion region;

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
        verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
        verifyReadMapping(region, "boundary_unequal", "extended_and_np", "boundary_1_pre");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
        verifyReadMapping(region, "boundary_equal", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
        verifyReadMapping(region, "shard_boundary_1_pre");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
        verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
        verifyReadMapping(region, "simple20");
    }

    @Test
    public void testNonPrimaryReadMapping() {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(
                EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY));

        // Contract: Each read has the Primary state in a single region (or none)
        // This is the region of maximum overlap for the read (earlier if tied)
        // Contract: Each read has the Non-Primary state in all other regions it overlaps

        // simple: Primary in 1:1-999
        // overlap_equal: Primary in 1:1-999
        // overlap_unequal: Primary in 1:1-999
        // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
        // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
        // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
        // outside_intervals: none
        // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
        // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
        // simple20: Primary in 20:10000-10100

        Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
        ActiveRegion region;

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
        verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
        verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
        verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
        verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
        verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
        verifyReadMapping(region, "simple20");
    }

    @Test
    public void testExtendedReadMapping() {
        DummyActiveRegionWalker walker = new DummyActiveRegionWalker(
                EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED));

        // Contract: Each read has the Primary state in a single region (or none)
        // This is the region of maximum overlap for the read (earlier if tied)
        // Contract: Each read has the Non-Primary state in all other regions it overlaps
        // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended

        // simple: Primary in 1:1-999
        // overlap_equal: Primary in 1:1-999
        // overlap_unequal: Primary in 1:1-999
        // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
        // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
        // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
        // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
        // outside_intervals: none
        // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
        // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
        // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
        // simple20: Primary in 20:10000-10100

        Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
        ActiveRegion region;

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
        verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
        verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
        verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
        verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
        verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

        region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
        verifyReadMapping(region, "simple20");
    }

    @Test
    public void testUnmappedReads() {
        // TODO
    }

    // assert the region contains exactly the reads named — no more, no fewer
    private void verifyReadMapping(ActiveRegion region, String... reads) {
        Collection<String> wantReads = new ArrayList<String>(Arrays.asList(reads));
        for (SAMRecord read : region.getReads()) {
            String regionReadName = read.getReadName();
            Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " assigned to active region " + region);
            wantReads.remove(regionReadName);
        }

        Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region);
    }

    // run the full traversal (including endTraversal) and return the regions map() saw
    private Map<GenomeLoc, ActiveRegion> getActiveRegions(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
        for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM))
            t.traverse(walker, dataProvider, 0);

        t.endTraversal(walker, 0);

        return walker.mappedActiveRegions;
    }

    // expand an interval into one GenomeLoc per base
    private Collection<GenomeLoc> toSingleBaseLocs(GenomeLoc interval) {
        List<GenomeLoc> bases = new ArrayList<GenomeLoc>();
        if (interval.size() == 1)
            bases.add(interval);
        else {
            for (int location = interval.getStart(); location <= interval.getStop(); location++)
                bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location));
        }

        return bases;
    }

    // expand many intervals into a sorted, de-duplicated set of single-base locs
    private Collection<GenomeLoc> toSingleBaseLocs(List<GenomeLoc> intervals) {
        Set<GenomeLoc> bases = new TreeSet<GenomeLoc>();    // for sorting and uniqueness
        for (GenomeLoc interval : intervals)
            bases.addAll(toSingleBaseLocs(interval));

        return bases;
    }

    // assert both interval lists cover the exact same set of bases, in the same order
    private void verifyEqualIntervals(List<GenomeLoc> aIntervals, List<GenomeLoc> bIntervals) {
        Collection<GenomeLoc> aBases = toSingleBaseLocs(aIntervals);
        Collection<GenomeLoc> bBases = toSingleBaseLocs(bIntervals);

        Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size());

        Iterator<GenomeLoc> aIter = aBases.iterator();
        Iterator<GenomeLoc> bIter = bBases.iterator();
        while (aIter.hasNext() && bIter.hasNext()) {
            GenomeLoc aLoc = aIter.next();
            GenomeLoc bLoc = bIter.next();
            Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc);
        }
    }

    // copied from LocusViewTemplate
    /**
     * Build a coordinate-sorted all-'A' read with a single M cigar element spanning
     * [alignmentStart, alignmentEnd] on the given contig, with zeroed base qualities.
     */
    protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) {
        SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test");
        header.setSequenceDictionary(dictionary);
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        GATKSAMRecord record = new GATKSAMRecord(header);

        record.setReadName(readName);
        record.setReferenceIndex(dictionary.getSequenceIndex(contig));
        record.setAlignmentStart(alignmentStart);

        Cigar cigar = new Cigar();
        int len = alignmentEnd - alignmentStart + 1;
        cigar.add(new CigarElement(len, CigarOperator.M));
        record.setCigar(cigar);
        record.setReadString(new String(new char[len]).replace("\0", "A"));
        record.setBaseQualities(new byte[len]);

        return record;
    }

    /**
     * Wire up a fresh engine, shard the BAM over the intervals, and wrap every window
     * in a LocusShardDataProvider the traversal engine can consume.
     */
    private List<LocusShardDataProvider> createDataProviders(final Walker walker, List<GenomeLoc> intervals, String bamFile) {
        GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
        engine.setGenomeLocParser(genomeLocParser);

        t.initialize(engine, walker);

        Collection<SAMReaderID> samFiles = new ArrayList<SAMReaderID>();
        SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags());
        samFiles.add(readerID);

        SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser);

        List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
        for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) {
            for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) {
                providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>()));
            }
        }

        return providers;
    }
}

View File

@ -76,7 +76,7 @@ import java.util.*;
* Test the Active Region Traversal Contract
* http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract
*/
public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest {
public class TraverseActiveRegionsUnitTest extends BaseTest {
private final static boolean ENFORCE_CONTRACTS = false;
private final static boolean DEBUG = false;
@ -131,7 +131,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest {
@DataProvider(name = "TraversalEngineProvider")
public Object[][] makeTraversals() {
final List<Object[]> traversals = new LinkedList<Object[]>();
traversals.add(new Object[]{new TraverseActiveRegionsOptimized<Integer, Integer>()});
traversals.add(new Object[]{new TraverseActiveRegions<Integer, Integer>()});
return traversals.toArray(new Object[][]{});
}
@ -537,7 +537,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest {
new ValidationExclusion(),
new ArrayList<ReadFilter>(),
new ArrayList<ReadTransformer>(),
false, (byte)30, false, t instanceof TraverseActiveRegionsOptimized);
false, (byte)30, false, true);
List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) {