Merge branch 'master' of github.com:broadinstitute/gsa-unstable
This commit is contained in:
commit
4fb3e48099
|
|
@ -67,12 +67,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17");
|
||||
HCTest(CEUTRIO_BAM, "", "1e2671557b01ad0497557097282965fc");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15");
|
||||
HCTest(NA12878_BAM, "", "2bd237a7e1e63eebe755dbe7963e430a");
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
|
|
@ -83,7 +83,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
|
||||
"c679ae7f04bdfda896b5c046d35e043c");
|
||||
"a938cdd7262968597fc8eb6c1c0a69f1");
|
||||
}
|
||||
|
||||
private void HCTestComplexGGA(String bam, String args, String md5) {
|
||||
|
|
@ -101,7 +101,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"1a034b7eb572e1b6f659d6e5d57b3e76");
|
||||
"d590c8d6d5e58d685401b65a23846893");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
|
|
@ -134,7 +134,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "50a26224b9e863ee47a0619eb54a0323");
|
||||
}
|
||||
|
||||
// That problem bam came from a user on the forum and it spotted a problem where the ReadClipper
|
||||
|
|
@ -145,7 +145,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4439496472eb1e2f5c91b30ba525be37"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -842,7 +842,7 @@ public class GenomeAnalysisEngine {
|
|||
if (argCollection.keepProgramRecords)
|
||||
removeProgramRecords = false;
|
||||
|
||||
final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker && argCollection.newART;
|
||||
final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker;
|
||||
|
||||
return new SAMDataSource(
|
||||
samReaderIDs,
|
||||
|
|
|
|||
|
|
@ -448,10 +448,5 @@ public class GATKArgumentCollection {
|
|||
@Hidden
|
||||
public boolean generateShadowBCF = false;
|
||||
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="newART", shortName = "newART", doc = "use the new ART traversal", required=false)
|
||||
public boolean newART = false;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -245,12 +245,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
} else if (walker instanceof ReadPairWalker) {
|
||||
return new TraverseReadPairs();
|
||||
} else if (walker instanceof ActiveRegionWalker) {
|
||||
if ( engine.getArguments().newART ) {
|
||||
// todo -- create optimized traversal
|
||||
return new TraverseActiveRegionsOptimized();
|
||||
} else {
|
||||
return new TraverseActiveRegionsOriginal();
|
||||
}
|
||||
return new TraverseActiveRegions();
|
||||
} else {
|
||||
throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.WalkerManager;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
|
|
@ -43,8 +44,7 @@ import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
|||
import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
|
|
@ -53,7 +53,7 @@ import java.util.List;
|
|||
* Time: 4:45 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public abstract class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
|
||||
public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
|
||||
protected final static boolean DEBUG = false;
|
||||
|
||||
// set by the tranversal
|
||||
|
|
@ -66,14 +66,6 @@ public abstract class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Ac
|
|||
protected final static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
protected final LinkedList<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
|
||||
|
||||
abstract protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker);
|
||||
|
||||
/**
|
||||
* Special function called in LinearMicroScheduler to empty out the work queue.
|
||||
* Ugly for now but will be cleaned up when we push this functionality more into the engine
|
||||
*/
|
||||
public abstract T endTraversal(final Walker<M, T> walker, T sum);
|
||||
|
||||
/**
 * @return the active-region extension size (bases added on each side of a region),
 *         read from the {@code activeRegionExtension} field set by the traversal.
 */
protected int getActiveRegionExtension() {
    return activeRegionExtension;
}
|
||||
|
|
@ -160,4 +152,204 @@ public abstract class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Ac
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reads accumulated from pileups that may still overlap a future active region.
// NOTE(review): the "Clearing myReads" debug message below suggests this list should be
// emptied at a shard boundary, but no clear() call is visible in this method -- confirm.
private LinkedList<GATKSAMRecord> myReads = new LinkedList<GATKSAMRecord>();
// Shard seen on the previous traverse() call; used to detect shard-boundary crossings.
private Shard lastShard = null;

/**
 * Traverses one shard's loci: collects previously unseen reads, builds an activity
 * profile from the walker's isActive calls, converts the profile into active regions,
 * merges regions that abut across shard boundaries, and finally map/reduces the
 * regions that are safely past the live zone.
 *
 * @param walker       the active-region walker whose isActive/map/reduce are invoked
 * @param dataProvider locus-ordered data for the current shard
 * @param sum          the running reduce result
 * @return the updated reduce result after processing all processable regions
 */
@Override
public T traverse( final ActiveRegionWalker<M,T> walker,
                   final LocusShardDataProvider dataProvider,
                   T sum) {
    if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));

    // Reads carried over from the previous shard may reappear in this shard's pileups,
    // so snapshot them for duplicate detection when we cross a shard boundary.
    final HashSet<GATKSAMRecord> maybeDuplicatedReads = new HashSet<GATKSAMRecord>();
    // TODO -- there's got to be a better way to know this
    if ( lastShard != dataProvider.getShard() ) {
        maybeDuplicatedReads.addAll(myReads);
        logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads");
        if ( DEBUG ) logger.warn("Clearing myReads");
    }
    lastShard = dataProvider.getShard();

    final LocusView locusView = new AllLocusView(dataProvider);

    final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );

    final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
    ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );

    ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);

    // We keep processing while the next reference location is within the interval
    GenomeLoc prevLoc = null;
    while( locusView.hasNext() ) {
        final AlignmentContext locus = locusView.next();
        final GenomeLoc location = locus.getLocation();

        // Grab all the previously unseen reads from this pileup and add them to the massive read list
        // Note that this must occur before we leave because we are outside the intervals because
        // reads may occur outside our intervals but overlap them in the future
        final Collection<GATKSAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
        for( final GATKSAMRecord read : reads ) {
            notifyOfCurrentPosition(read);
            // most of the time maybeDuplicatedReads is empty
            // TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the
            // TODO -- potential list of duplicates we can clear the hashset
            if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) {
                if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName());
            } else {
                if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider);
                myReads.add((GATKSAMRecord)read);
            }
        }

        // skip this location -- it's not part of our engine intervals
        if ( outsideEngineIntervals(location) )
            continue;

        if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
            // we've move across some interval boundary, restart profile
            profile = incorporateActiveRegions(profile, activeRegions);
        }

        dataProvider.getShard().getReadMetrics().incrementNumIterations();

        // create reference context. Note that if we have a pileup of "extended events", the context will
        // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
        final ReferenceContext refContext = referenceView.getReferenceContext(location);

        // Iterate forward to get all reference ordered data covering this location
        final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);

        // Call the walkers isActive function for this locus and add them to the list to be integrated later
        profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));

        prevLoc = location;

        printProgress(locus.getLocation());
    }

    updateCumulativeMetrics(dataProvider.getShard());

    // Flush any activity calls that have not yet been converted into regions.
    if ( ! profile.isEmpty() )
        incorporateActiveRegions(profile, activeRegions);

    // add active regions to queue of regions to process
    // first check if can merge active regions over shard boundaries
    if( !activeRegions.isEmpty() ) {
        if( !workQueue.isEmpty() ) {
            final ActiveRegion last = workQueue.getLast();
            final ActiveRegion first = activeRegions.get(0);
            // Merge only when activity state matches, the spans are contiguous, and the
            // combined size still fits within the maximum region size.
            if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) {
                workQueue.removeLast();
                activeRegions.remove(first);
                workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) );
            }
        }
        workQueue.addAll( activeRegions );
    }

    logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );

    // now go and process all of the active regions
    sum = processActiveRegions(walker, sum, false);

    return sum;
}
|
||||
|
||||
// High-water mark of observed positions; regionCompletelyWithinDeadZone() treats
// everything left of this point as safe to process.
private GenomeLoc startOfLiveRegion = null;

/**
 * Advances the live-region marker based on a newly observed read's location.
 */
protected void notifyOfCurrentPosition(final GATKSAMRecord read) {
    notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read));
}

/**
 * Advances the live-region marker to the max of its current value and the start of
 * {@code currentLocation}; the marker only moves forward.
 * NOTE(review): the very first call stores the full location rather than its start
 * location -- confirm this asymmetry is intentional.
 */
protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) {
    if ( startOfLiveRegion == null )
        startOfLiveRegion = currentLocation;
    else
        startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation());
}
|
||||
|
||||
/**
 * @return the current live-region start marker, or null if no position has been
 *         observed yet via notifyOfCurrentPosition().
 */
protected GenomeLoc getStartOfLiveRegion() {
    return startOfLiveRegion;
}
|
||||
|
||||
/**
 * Tests whether a region lies entirely in the "dead zone" -- strictly before the
 * live-region start (optionally padded by the active-region extension), or on a
 * different contig than the live-region start.
 *
 * @param region           span to test
 * @param includeExtension if true, the region must also clear the extension padding
 * @return true when no still-unseen data can overlap the region
 */
protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) {
    return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0)))
            || ! region.onSameContig(getStartOfLiveRegion());
}
|
||||
|
||||
private T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
|
||||
if( walker.activeRegionOutStream != null ) {
|
||||
writeActiveRegionsToStream(walker);
|
||||
return sum;
|
||||
} else {
|
||||
return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Drains the head of the work queue, map/reducing each region that is safely in the
 * dead zone (or every region when {@code forceRegionsToBeActive} is set). Stops at
 * the first region that is not yet processable, since the queue is ordered.
 *
 * @param walker                 the active-region walker
 * @param sum                    the running reduce result
 * @param forceRegionsToBeActive when true, process regions without the dead-zone check
 * @return the updated reduce result
 */
private T callWalkerMapOnActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
    // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
    // TODO can implement parallel traversal here
    while( workQueue.peek() != null ) {
        final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
        if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) {
            final ActiveRegion activeRegion = workQueue.remove();
            if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion());
            sum = processActiveRegion( activeRegion, sum, walker );
        } else {
            // Head of queue still overlaps the live zone; later regions can only be further right.
            break;
        }
    }

    return sum;
}
|
||||
|
||||
/** @return the fixed name of this traversal type. */
@Override
public String toString() {
    return "TraverseActiveRegions";
}
|
||||
|
||||
/**
 * A read is "dead" once it ends before the active region begins and lies entirely in
 * the dead zone (including the extension padding) -- it can never be needed again.
 */
private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) {
    return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true);
}
|
||||
|
||||
/**
 * Populates one active region with the overlapping live reads, prunes reads that are
 * no longer needed, then runs the walker's map and reduce on the region.
 *
 * @param activeRegion the region to process
 * @param sum          the running reduce result
 * @param walker       the active-region walker
 * @return the reduce of the walker's map output with {@code sum}
 */
protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
    final Iterator<GATKSAMRecord> liveReads = myReads.iterator();
    while ( liveReads.hasNext() ) {
        // killed guards against calling liveReads.remove() twice for the same read below.
        boolean killed = false;
        final GATKSAMRecord read = liveReads.next();
        final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );

        if( activeRegion.getLocation().overlapsP( readLoc ) ) {
            activeRegion.add(read);

            // Unless the walker wants this read again for other regions, drop it now.
            if ( ! walker.wantsNonPrimaryReads() ) {
                if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
                liveReads.remove();
                killed = true;
            }
        } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
            // Read only overlaps the extended span; include it but keep it live.
            activeRegion.add( read );
        }

        // Independently of the overlap handling, evict reads that can never be needed again.
        if ( ! killed && readIsDead(read, readLoc, activeRegion) ) {
            if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
            liveReads.remove();
        }
    }

    logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
    final M x = walker.map(activeRegion, null);
    return walker.reduce( x, sum );
}
|
||||
|
||||
|
||||
/**
 * Special function called in LinearMicroScheduler to empty out the work queue.
 * Ugly for now but will be cleaned up when we push this functionality more into the engine
 *
 * @param walker the walker driving this traversal; must actually be an
 *               ActiveRegionWalker (the cast below is unchecked -- a different walker
 *               type would fail at runtime)
 * @param sum    the running reduce result
 * @return the reduce result after force-processing every remaining queued region
 */
public T endTraversal(final Walker<M, T> walker, T sum) {
    return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true);
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,253 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 12/9/11
|
||||
*/
|
||||
|
||||
public class TraverseActiveRegionsOptimized<M,T> extends TraverseActiveRegions<M,T> {
|
||||
private LinkedList<GATKSAMRecord> myReads = new LinkedList<GATKSAMRecord>();
|
||||
private Shard lastShard = null;
|
||||
|
||||
@Override
|
||||
public T traverse( final ActiveRegionWalker<M,T> walker,
|
||||
final LocusShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
|
||||
|
||||
final HashSet<GATKSAMRecord> maybeDuplicatedReads = new HashSet<GATKSAMRecord>();
|
||||
// TODO -- there's got to be a better way to know this
|
||||
if ( lastShard != dataProvider.getShard() ) {
|
||||
maybeDuplicatedReads.addAll(myReads);
|
||||
logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads");
|
||||
if ( DEBUG ) logger.warn("Clearing myReads");
|
||||
}
|
||||
lastShard = dataProvider.getShard();
|
||||
|
||||
final LocusView locusView = new AllLocusView(dataProvider);
|
||||
|
||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
|
||||
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
|
||||
ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );
|
||||
|
||||
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
|
||||
|
||||
// We keep processing while the next reference location is within the interval
|
||||
GenomeLoc prevLoc = null;
|
||||
while( locusView.hasNext() ) {
|
||||
final AlignmentContext locus = locusView.next();
|
||||
final GenomeLoc location = locus.getLocation();
|
||||
|
||||
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
||||
// Note that this must occur before we leave because we are outside the intervals because
|
||||
// reads may occur outside our intervals but overlap them in the future
|
||||
final Collection<GATKSAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
notifyOfCurrentPosition(read);
|
||||
// most of the time maybeDuplicatedReads is empty
|
||||
// TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the
|
||||
// TODO -- potential list of duplicates we can clear the hashset
|
||||
if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) {
|
||||
if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName());
|
||||
} else {
|
||||
if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider);
|
||||
myReads.add((GATKSAMRecord)read);
|
||||
}
|
||||
}
|
||||
|
||||
// skip this location -- it's not part of our engine intervals
|
||||
if ( outsideEngineIntervals(location) )
|
||||
continue;
|
||||
|
||||
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
|
||||
// we've move across some interval boundary, restart profile
|
||||
profile = incorporateActiveRegions(profile, activeRegions);
|
||||
}
|
||||
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
final ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
// Call the walkers isActive function for this locus and add them to the list to be integrated later
|
||||
profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));
|
||||
|
||||
prevLoc = location;
|
||||
|
||||
printProgress(locus.getLocation());
|
||||
}
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
|
||||
if ( ! profile.isEmpty() )
|
||||
incorporateActiveRegions(profile, activeRegions);
|
||||
|
||||
// add active regions to queue of regions to process
|
||||
// first check if can merge active regions over shard boundaries
|
||||
if( !activeRegions.isEmpty() ) {
|
||||
if( !workQueue.isEmpty() ) {
|
||||
final ActiveRegion last = workQueue.getLast();
|
||||
final ActiveRegion first = activeRegions.get(0);
|
||||
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) {
|
||||
workQueue.removeLast();
|
||||
activeRegions.remove(first);
|
||||
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) );
|
||||
}
|
||||
}
|
||||
workQueue.addAll( activeRegions );
|
||||
}
|
||||
|
||||
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
|
||||
// now go and process all of the active regions
|
||||
sum = processActiveRegions(walker, sum, false);
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
private GenomeLoc startOfLiveRegion = null;
|
||||
|
||||
protected void notifyOfCurrentPosition(final GATKSAMRecord read) {
|
||||
notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read));
|
||||
}
|
||||
|
||||
protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) {
|
||||
if ( startOfLiveRegion == null )
|
||||
startOfLiveRegion = currentLocation;
|
||||
else
|
||||
startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation());
|
||||
}
|
||||
|
||||
protected GenomeLoc getStartOfLiveRegion() {
|
||||
return startOfLiveRegion;
|
||||
}
|
||||
|
||||
protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) {
|
||||
return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0)))
|
||||
|| ! region.onSameContig(getStartOfLiveRegion());
|
||||
}
|
||||
|
||||
private T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
|
||||
if( walker.activeRegionOutStream != null ) {
|
||||
writeActiveRegionsToStream(walker);
|
||||
return sum;
|
||||
} else {
|
||||
return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive);
|
||||
}
|
||||
}
|
||||
|
||||
private T callWalkerMapOnActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
|
||||
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
|
||||
// TODO can implement parallel traversal here
|
||||
while( workQueue.peek() != null ) {
|
||||
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
|
||||
if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) {
|
||||
final ActiveRegion activeRegion = workQueue.remove();
|
||||
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion());
|
||||
sum = processActiveRegion( activeRegion, sum, walker );
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "TraverseActiveRegionsOptimized";
|
||||
}
|
||||
|
||||
private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) {
|
||||
return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
|
||||
final Iterator<GATKSAMRecord> liveReads = myReads.iterator();
|
||||
while ( liveReads.hasNext() ) {
|
||||
boolean killed = false;
|
||||
final GATKSAMRecord read = liveReads.next();
|
||||
final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
|
||||
|
||||
if( activeRegion.getLocation().overlapsP( readLoc ) ) {
|
||||
activeRegion.add(read);
|
||||
|
||||
if ( ! walker.wantsNonPrimaryReads() ) {
|
||||
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
|
||||
liveReads.remove();
|
||||
killed = true;
|
||||
}
|
||||
} else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
|
||||
activeRegion.add( read );
|
||||
}
|
||||
|
||||
if ( ! killed && readIsDead(read, readLoc, activeRegion) ) {
|
||||
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
|
||||
liveReads.remove();
|
||||
}
|
||||
}
|
||||
|
||||
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
|
||||
final M x = walker.map(activeRegion, null);
|
||||
return walker.reduce( x, sum );
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Special function called in LinearMicroScheduler to empty out the work queue.
|
||||
* Ugly for now but will be cleaned up when we push this functionality more into the engine
|
||||
*/
|
||||
@Override
|
||||
public T endTraversal(final Walker<M, T> walker, T sum) {
|
||||
return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,262 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 12/9/11
|
||||
*/
|
||||
|
||||
/**
 * Original implementation of the active-region traversal.
 *
 * Walks every locus of each shard, asks the walker's isActive() for a per-locus activity
 * probability, band-pass filters those probabilities into ActiveRegions, assigns the reads
 * seen along the way to their regions, and finally invokes the walker's map/reduce over
 * each region that is safe to process.
 *
 * @param <M> the walker's map result type
 * @param <T> the walker's reduce (running sum) type
 */
public class TraverseActiveRegionsOriginal<M,T> extends TraverseActiveRegions<M,T> {
    // All reads encountered so far that have not yet been placed into an active region.
    // LinkedHashSet keeps encounter order while de-duplicating reads seen in multiple pileups.
    private final LinkedHashSet<GATKSAMRecord> myReads = new LinkedHashSet<GATKSAMRecord>();

    /**
     * Traverses one shard: accumulates isActive() probabilities per locus, converts them into
     * active regions at interval boundaries, merges regions across shard boundaries where
     * possible, then processes every region whose reads can no longer change.
     *
     * @param walker       the active-region walker supplying isActive()/map()/reduce()
     * @param dataProvider locus data for the current shard
     * @param sum          the running reduce value
     * @return the updated reduce value after processing all ready regions
     */
    @Override
    public T traverse( final ActiveRegionWalker<M,T> walker,
                       final LocusShardDataProvider dataProvider,
                       T sum) {
        logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));

        final LocusView locusView = new AllLocusView(dataProvider);

        final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
        // Region sizing parameters come from the walker's @ActiveRegionExtension annotation
        final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
        final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();

        // Smallest alignment start seen in this shard; regions ending before it are safe to process
        int minStart = Integer.MAX_VALUE;
        final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
        ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() );

        ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);

        // We keep processing while the next reference location is within the interval
        GenomeLoc prevLoc = null;
        while( locusView.hasNext() ) {
            final AlignmentContext locus = locusView.next();
            final GenomeLoc location = locus.getLocation();

            // Grab all the previously unseen reads from this pileup and add them to the massive read list
            // Note that this must occur before we leave because we are outside the intervals because
            // reads may occur outside our intervals but overlap them in the future
            // TODO -- this whole HashSet logic should be changed to a linked list of reads with
            // TODO -- subsequent pass over them to find the ones overlapping the active regions
            for( final PileupElement p : locus.getBasePileup() ) {
                final GATKSAMRecord read = p.getRead();
                if( !myReads.contains(read) ) {
                    myReads.add(read);
                }

                // If this is the last pileup for this shard calculate the minimum alignment start so that we know
                // which active regions in the work queue are now safe to process
                minStart = Math.min(minStart, read.getAlignmentStart());
            }

            // skip this location -- it's not part of our engine intervals
            if ( outsideEngineIntervals(location) )
                continue;

            if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
                // we've move across some interval boundary, restart profile
                profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
            }

            dataProvider.getShard().getReadMetrics().incrementNumIterations();

            // create reference context. Note that if we have a pileup of "extended events", the context will
            // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
            final ReferenceContext refContext = referenceView.getReferenceContext(location);

            // Iterate forward to get all reference ordered data covering this location
            final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);

            // Call the walkers isActive function for this locus and add them to the list to be integrated later
            profile.add(walkerActiveProb(walker, tracker, refContext, locus, location));

            prevLoc = location;

            printProgress(locus.getLocation());
        }

        updateCumulativeMetrics(dataProvider.getShard());

        // Flush any probabilities remaining at the end of the shard into active regions
        if ( ! profile.isEmpty() )
            incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);

        // add active regions to queue of regions to process
        // first check if can merge active regions over shard boundaries
        if( !activeRegions.isEmpty() ) {
            if( !workQueue.isEmpty() ) {
                final ActiveRegion last = workQueue.getLast();
                final ActiveRegion first = activeRegions.get(0);
                // Merge only if activity state matches, locations are contiguous, and the union fits the size cap
                if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
                    workQueue.removeLast();
                    activeRegions.remove(first);
                    workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
                }
            }
            workQueue.addAll( activeRegions );
        }

        logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );

        // now go and process all of the active regions
        sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());

        return sum;
    }

    /**
     * Take the individual isActive calls and integrate them into contiguous active regions and
     * add these blocks of work to the work queue
     * band-pass filter the list of isActive probabilities and turn into active regions
     *
     * @param profile               the per-locus activity probabilities collected so far (must be non-empty)
     * @param activeRegions         output list the newly created regions are appended to
     * @param activeRegionExtension number of bases each region is extended by
     * @param maxRegionSize         maximum allowed size of a single region
     * @return a fresh, empty ActivityProfile to continue accumulating into
     */
    private ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
                                                     final List<ActiveRegion> activeRegions,
                                                     final int activeRegionExtension,
                                                     final int maxRegionSize) {
        if ( profile.isEmpty() )
            throw new IllegalStateException("trying to incorporate an empty active profile " + profile);

        final ActivityProfile bandPassFiltered = profile.bandPassFilter();
        activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ));
        return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() );
    }

    // --------------------------------------------------------------------------------
    //
    // code to handle processing active regions
    //
    // --------------------------------------------------------------------------------

    // Either dumps the queued regions to the walker's output stream (when requested) or
    // runs the walker's map/reduce over the regions that are ready.
    private T processActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
        if( walker.activeRegionOutStream != null ) {
            writeActiveRegionsToStream(walker);
            return sum;
        } else {
            return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig);
        }
    }

    // Processes queued regions in order until reaching one that may still receive reads
    // (i.e. its extended span reaches minStart on the current contig).
    private T callWalkerMapOnActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
        // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
        // TODO can implement parallel traversal here
        while( workQueue.peek() != null ) {
            final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
            if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
                final ActiveRegion activeRegion = workQueue.remove();
                sum = processActiveRegion( activeRegion, sum, walker );
            } else {
                break;
            }
        }

        return sum;
    }

    /**
     * Assigns each pending read to its best-overlapping region (plus non-primary/extended
     * copies if the walker wants them), then runs the walker's map/reduce on the region.
     */
    @Override
    protected T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M,T> walker ) {
        final ArrayList<GATKSAMRecord> placedReads = new ArrayList<GATKSAMRecord>();
        for( final GATKSAMRecord read : myReads ) {
            final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
            if( activeRegion.getLocation().overlapsP( readLoc ) ) {
                // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region)
                long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
                ActiveRegion bestRegion = activeRegion;
                for( final ActiveRegion otherRegionToTest : workQueue ) {
                    if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
                        maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc );
                        bestRegion = otherRegionToTest;
                    }
                }
                bestRegion.add( read );

                // The read is also added to all other regions in which it overlaps but marked as non-primary
                if( walker.wantsNonPrimaryReads() ) {
                    if( !bestRegion.equals(activeRegion) ) {
                        activeRegion.add( read );
                    }
                    for( final ActiveRegion otherRegionToTest : workQueue ) {
                        if( !bestRegion.equals(otherRegionToTest) ) {
                            // check for non-primary vs. extended
                            if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) {
                                otherRegionToTest.add( read );
                            } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
                                otherRegionToTest.add( read );
                            }
                        }
                    }
                }
                placedReads.add( read );
                // check for non-primary vs. extended
            } else if( activeRegion.getLocation().overlapsP( readLoc ) ) {
                // NOTE(review): this condition is identical to the outer `if` above, so this branch
                // appears unreachable — confirm whether a different check (e.g. against another
                // region's location) was intended here.
                if ( walker.wantsNonPrimaryReads() ) {
                    activeRegion.add( read );
                }
            } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
                activeRegion.add( read );
            }
        }
        myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
        // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way.

        logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
        final M x = walker.map( activeRegion, null );
        return walker.reduce( x, sum );
    }

    /**
     * Special function called in LinearMicroScheduler to empty out the work queue.
     * Ugly for now but will be cleaned up when we push this functionality more into the engine
     */
    public T endTraversal( final Walker<M,T> walker, T sum) {
        // minStart=MAX_VALUE and contig=null force every remaining queued region to be processed
        return processActiveRegions((ActiveRegionWalker<M, T>) walker, sum, Integer.MAX_VALUE, null);
    }
}
|
||||
|
|
@ -1,523 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import com.google.java.contract.PreconditionError;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.*;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.executive.WindowMaker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 1/10/13
|
||||
* Time: 8:03 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class TraverseActiveRegionsOriginalUnitTest extends BaseTest {
|
||||
|
||||
/**
 * Minimal walker used to drive the traversal under test.
 *
 * Records every locus passed to isActive() and every region passed to map(), so tests
 * can inspect which positions were visited and which regions were produced. isActive()
 * always reports the fixed probability {@code prob}.
 */
private class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
    // Constant activity probability returned for every locus
    private final double prob;
    // Read states this walker requests; defaults to the superclass's choice
    private EnumSet<ActiveRegionReadState> states = super.desiredReadStates();

    // Every locus the traversal asked isActive() about, in call order
    protected List<GenomeLoc> isActiveCalls = new ArrayList<GenomeLoc>();
    // Every region handed to map(), keyed by its location
    protected Map<GenomeLoc, ActiveRegion> mappedActiveRegions = new HashMap<GenomeLoc, ActiveRegion>();

    public DummyActiveRegionWalker() {
        this.prob = 1.0;
    }

    public DummyActiveRegionWalker(double constProb) {
        this.prob = constProb;
    }

    public DummyActiveRegionWalker(EnumSet<ActiveRegionReadState> wantStates) {
        this.prob = 1.0;
        this.states = wantStates;
    }

    @Override
    public EnumSet<ActiveRegionReadState> desiredReadStates() {
        return states;
    }

    @Override
    public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        isActiveCalls.add(ref.getLocus());
        return new ActivityProfileResult(ref.getLocus(), prob);
    }

    @Override
    public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) {
        mappedActiveRegions.put(activeRegion.getLocation(), activeRegion);
        return 0;
    }

    @Override
    public Integer reduceInit() {
        return 0;
    }

    @Override
    public Integer reduce(Integer value, Integer sum) {
        return 0;
    }
}
|
||||
|
||||
// Traversal implementation under test
private final TraverseActiveRegions<Integer, Integer> t = new TraverseActiveRegionsOriginal<Integer, Integer>();

// Reference sequence and derived helpers, initialized in init()
private IndexedFastaSequenceFile reference;
private SAMSequenceDictionary dictionary;
private GenomeLocParser genomeLocParser;

// Analysis intervals the traversal runs over, built in init()
private List<GenomeLoc> intervals;

// On-disk test BAM (and its index) created from synthetic reads in init()
private static final String testBAM = "TraverseActiveRegionsUnitTest.bam";
private static final String testBAI = "TraverseActiveRegionsUnitTest.bai";
|
||||
|
||||
/**
 * One-time test fixture: loads the hg19 reference, builds the analysis intervals
 * (sorted and merged), constructs a set of synthetic reads covering various
 * boundary cases, and writes them to the test BAM.
 *
 * @throws FileNotFoundException if the reference FASTA cannot be opened
 */
@BeforeClass
private void init() throws FileNotFoundException {
    reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference));
    dictionary = reference.getSequenceDictionary();
    genomeLocParser = new GenomeLocParser(dictionary);

    // TODO: reads with indels
    // TODO: reads which span many regions
    // TODO: reads which are partially between intervals (in/outside extension)
    // TODO: duplicate reads
    // TODO: read at the end of a contig
    // TODO: reads which are completely outside intervals but within extension
    // TODO: test the extension itself
    // TODO: unmapped reads

    intervals = new ArrayList<GenomeLoc>();
    intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20));
    intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999));
    intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999));
    intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999));
    intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000));
    intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100));
    intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100));
    // Overlapping intervals (e.g. 1:10-20 inside 1:1-999) are merged here
    intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList();

    // Read names encode the case each one exercises (region boundaries, shard
    // boundaries, extension-only overlap, outside all intervals, second contig)
    List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
    reads.add(buildSAMRecord("simple", "1", 100, 200));
    reads.add(buildSAMRecord("overlap_equal", "1", 10, 20));
    reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21));
    reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009));
    reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008));
    reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000));
    reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050));
    reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990));
    reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000));
    reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385));
    reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400));
    reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414));
    reads.add(buildSAMRecord("simple20", "20", 10025, 10075));

    createBAM(reads);
}
|
||||
|
||||
private void createBAM(List<GATKSAMRecord> reads) {
|
||||
File outFile = new File(testBAM);
|
||||
outFile.deleteOnExit();
|
||||
File indexFile = new File(testBAI);
|
||||
indexFile.deleteOnExit();
|
||||
|
||||
SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile);
|
||||
for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) {
|
||||
out.addAlignment(read);
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllBasesSeen() {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
|
||||
|
||||
List<GenomeLoc> activeIntervals = getIsActiveIntervals(walker, intervals);
|
||||
// Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call
|
||||
verifyEqualIntervals(intervals, activeIntervals);
|
||||
}
|
||||
|
||||
private List<GenomeLoc> getIsActiveIntervals(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
|
||||
List<GenomeLoc> activeIntervals = new ArrayList<GenomeLoc>();
|
||||
for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) {
|
||||
t.traverse(walker, dataProvider, 0);
|
||||
activeIntervals.addAll(walker.isActiveCalls);
|
||||
}
|
||||
|
||||
return activeIntervals;
|
||||
}
|
||||
|
||||
@Test (expectedExceptions = PreconditionError.class)
|
||||
public void testIsActiveRangeLow () {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1);
|
||||
getActiveRegions(walker, intervals).values();
|
||||
}
|
||||
|
||||
@Test (expectedExceptions = PreconditionError.class)
|
||||
public void testIsActiveRangeHigh () {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1);
|
||||
getActiveRegions(walker, intervals).values();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testActiveRegionCoverage() {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
|
||||
|
||||
Collection<ActiveRegion> activeRegions = getActiveRegions(walker, intervals).values();
|
||||
verifyActiveRegionCoverage(intervals, activeRegions);
|
||||
}
|
||||
|
||||
/**
 * Asserts that the given regions exactly tile the given intervals:
 * no base is in two regions, every interval base is in some region, no region
 * base falls outside the intervals, and every explicit interval start/stop
 * coincides with a region start/stop.
 */
private void verifyActiveRegionCoverage(List<GenomeLoc> intervals, Collection<ActiveRegion> activeRegions) {
    List<GenomeLoc> intervalStarts = new ArrayList<GenomeLoc>();
    List<GenomeLoc> intervalStops = new ArrayList<GenomeLoc>();

    for (GenomeLoc interval : intervals) {
        intervalStarts.add(interval.getStartLocation());
        intervalStops.add(interval.getStopLocation());
    }

    // Maps each single-base location to the (unique) region covering it
    Map<GenomeLoc, ActiveRegion> baseRegionMap = new HashMap<GenomeLoc, ActiveRegion>();

    for (ActiveRegion activeRegion : activeRegions) {
        for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) {
            // Contract: Regions do not overlap
            Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region");
            baseRegionMap.put(activeLoc, activeRegion);
        }

        // Tick off interval boundaries that match this region's boundaries
        GenomeLoc start = activeRegion.getLocation().getStartLocation();
        if (intervalStarts.contains(start))
            intervalStarts.remove(start);

        GenomeLoc stop = activeRegion.getLocation().getStopLocation();
        if (intervalStops.contains(stop))
            intervalStops.remove(stop);
    }

    for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) {
        // Contract: Each location in the interval(s) is in exactly one region
        // Contract: The total set of regions exactly matches the analysis interval(s)
        Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region");
        baseRegionMap.remove(baseLoc);
    }

    // Contract: The total set of regions exactly matches the analysis interval(s)
    Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals");

    // Contract: All explicit interval boundaries must also be region boundaries
    Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location");
    Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location");
}
|
||||
|
||||
@Test
|
||||
public void testActiveRegionExtensionOnContig() {
|
||||
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
|
||||
|
||||
Collection<ActiveRegion> activeRegions = getActiveRegions(walker, intervals).values();
|
||||
for (ActiveRegion activeRegion : activeRegions) {
|
||||
GenomeLoc loc = activeRegion.getExtendedLoc();
|
||||
|
||||
// Contract: active region extensions must stay on the contig
|
||||
Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig");
|
||||
int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength();
|
||||
Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig");
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Checks which reads end up in which region when only Primary read state is requested
 * (the walker's default states). The expectation table below lists, per synthetic read,
 * the region that should own it as Primary.
 */
@Test
public void testPrimaryReadMapping() {
    DummyActiveRegionWalker walker = new DummyActiveRegionWalker();

    // Contract: Each read has the Primary state in a single region (or none)
    // This is the region of maximum overlap for the read (earlier if tied)

    // simple: Primary in 1:1-999
    // overlap_equal: Primary in 1:1-999
    // overlap_unequal: Primary in 1:1-999
    // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
    // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
    // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
    // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
    // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
    // outside_intervals: none
    // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
    // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
    // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
    // simple20: Primary in 20:10000-10100

    Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
    ActiveRegion region;

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
    verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
    verifyReadMapping(region, "boundary_unequal", "extended_and_np", "boundary_1_pre");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
    verifyReadMapping(region, "boundary_equal", "boundary_1_post");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
    verifyReadMapping(region, "shard_boundary_1_pre");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
    verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
    verifyReadMapping(region, "simple20");
}
|
||||
|
||||
/**
 * Same as testPrimaryReadMapping but with the NONPRIMARY state also requested, so each
 * region additionally receives every read that overlaps it without being primary there.
 */
@Test
public void testNonPrimaryReadMapping() {
    DummyActiveRegionWalker walker = new DummyActiveRegionWalker(
            EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY));

    // Contract: Each read has the Primary state in a single region (or none)
    // This is the region of maximum overlap for the read (earlier if tied)

    // Contract: Each read has the Non-Primary state in all other regions it overlaps

    // simple: Primary in 1:1-999
    // overlap_equal: Primary in 1:1-999
    // overlap_unequal: Primary in 1:1-999
    // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
    // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
    // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
    // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
    // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
    // outside_intervals: none
    // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
    // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
    // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
    // simple20: Primary in 20:10000-10100

    Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
    ActiveRegion region;

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
    verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
    verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
    verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
    verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
    verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
    verifyReadMapping(region, "simple20");
}
|
||||
|
||||
/**
 * Same as testNonPrimaryReadMapping but with the EXTENDED state also requested, so a
 * region additionally receives reads that only overlap its extended span.
 */
@Test
public void testExtendedReadMapping() {
    DummyActiveRegionWalker walker = new DummyActiveRegionWalker(
            EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED));

    // Contract: Each read has the Primary state in a single region (or none)
    // This is the region of maximum overlap for the read (earlier if tied)

    // Contract: Each read has the Non-Primary state in all other regions it overlaps
    // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended

    // simple: Primary in 1:1-999
    // overlap_equal: Primary in 1:1-999
    // overlap_unequal: Primary in 1:1-999
    // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
    // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
    // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
    // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
    // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
    // outside_intervals: none
    // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
    // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
    // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
    // simple20: Primary in 20:10000-10100

    Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
    ActiveRegion region;

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
    verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
    verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
    verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
    verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
    verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");

    region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
    verifyReadMapping(region, "simple20");
}
|
||||
|
||||
/** Placeholder: unmapped-read handling is not yet covered (see the TODO list in init()). */
@Test
public void testUnmappedReads() {
    // TODO
}
|
||||
|
||||
private void verifyReadMapping(ActiveRegion region, String... reads) {
|
||||
Collection<String> wantReads = new ArrayList<String>(Arrays.asList(reads));
|
||||
for (SAMRecord read : region.getReads()) {
|
||||
String regionReadName = read.getReadName();
|
||||
Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " assigned to active region " + region);
|
||||
wantReads.remove(regionReadName);
|
||||
}
|
||||
|
||||
Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region);
|
||||
}
|
||||
|
||||
private Map<GenomeLoc, ActiveRegion> getActiveRegions(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
|
||||
for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM))
|
||||
t.traverse(walker, dataProvider, 0);
|
||||
|
||||
t.endTraversal(walker, 0);
|
||||
|
||||
return walker.mappedActiveRegions;
|
||||
}
|
||||
|
||||
private Collection<GenomeLoc> toSingleBaseLocs(GenomeLoc interval) {
|
||||
List<GenomeLoc> bases = new ArrayList<GenomeLoc>();
|
||||
if (interval.size() == 1)
|
||||
bases.add(interval);
|
||||
else {
|
||||
for (int location = interval.getStart(); location <= interval.getStop(); location++)
|
||||
bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location));
|
||||
}
|
||||
|
||||
return bases;
|
||||
}
|
||||
|
||||
private Collection<GenomeLoc> toSingleBaseLocs(List<GenomeLoc> intervals) {
|
||||
Set<GenomeLoc> bases = new TreeSet<GenomeLoc>(); // for sorting and uniqueness
|
||||
for (GenomeLoc interval : intervals)
|
||||
bases.addAll(toSingleBaseLocs(interval));
|
||||
|
||||
return bases;
|
||||
}
|
||||
|
||||
private void verifyEqualIntervals(List<GenomeLoc> aIntervals, List<GenomeLoc> bIntervals) {
|
||||
Collection<GenomeLoc> aBases = toSingleBaseLocs(aIntervals);
|
||||
Collection<GenomeLoc> bBases = toSingleBaseLocs(bIntervals);
|
||||
|
||||
Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size());
|
||||
|
||||
Iterator<GenomeLoc> aIter = aBases.iterator();
|
||||
Iterator<GenomeLoc> bIter = bBases.iterator();
|
||||
while (aIter.hasNext() && bIter.hasNext()) {
|
||||
GenomeLoc aLoc = aIter.next();
|
||||
GenomeLoc bLoc = bIter.next();
|
||||
Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc);
|
||||
}
|
||||
}
|
||||
|
||||
// copied from LocusViewTemplate
|
||||
protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test");
|
||||
header.setSequenceDictionary(dictionary);
|
||||
header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
||||
GATKSAMRecord record = new GATKSAMRecord(header);
|
||||
|
||||
record.setReadName(readName);
|
||||
record.setReferenceIndex(dictionary.getSequenceIndex(contig));
|
||||
record.setAlignmentStart(alignmentStart);
|
||||
|
||||
Cigar cigar = new Cigar();
|
||||
int len = alignmentEnd - alignmentStart + 1;
|
||||
cigar.add(new CigarElement(len, CigarOperator.M));
|
||||
record.setCigar(cigar);
|
||||
record.setReadString(new String(new char[len]).replace("\0", "A"));
|
||||
record.setBaseQualities(new byte[len]);
|
||||
|
||||
return record;
|
||||
}
|
||||
|
||||
private List<LocusShardDataProvider> createDataProviders(final Walker walker, List<GenomeLoc> intervals, String bamFile) {
|
||||
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||
engine.setGenomeLocParser(genomeLocParser);
|
||||
t.initialize(engine, walker);
|
||||
|
||||
Collection<SAMReaderID> samFiles = new ArrayList<SAMReaderID>();
|
||||
SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags());
|
||||
samFiles.add(readerID);
|
||||
|
||||
SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser);
|
||||
|
||||
List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
|
||||
for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) {
|
||||
for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) {
|
||||
providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>()));
|
||||
}
|
||||
}
|
||||
|
||||
return providers;
|
||||
}
|
||||
}
|
||||
|
|
@ -76,7 +76,7 @@ import java.util.*;
|
|||
* Test the Active Region Traversal Contract
|
||||
* http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract
|
||||
*/
|
||||
public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest {
|
||||
public class TraverseActiveRegionsUnitTest extends BaseTest {
|
||||
private final static boolean ENFORCE_CONTRACTS = false;
|
||||
private final static boolean DEBUG = false;
|
||||
|
||||
|
|
@ -131,7 +131,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest {
|
|||
@DataProvider(name = "TraversalEngineProvider")
|
||||
public Object[][] makeTraversals() {
|
||||
final List<Object[]> traversals = new LinkedList<Object[]>();
|
||||
traversals.add(new Object[]{new TraverseActiveRegionsOptimized<Integer, Integer>()});
|
||||
traversals.add(new Object[]{new TraverseActiveRegions<Integer, Integer>()});
|
||||
return traversals.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
|
|
@ -537,7 +537,7 @@ public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest {
|
|||
new ValidationExclusion(),
|
||||
new ArrayList<ReadFilter>(),
|
||||
new ArrayList<ReadTransformer>(),
|
||||
false, (byte)30, false, t instanceof TraverseActiveRegionsOptimized);
|
||||
false, (byte)30, false, true);
|
||||
|
||||
List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
|
||||
for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) {
|
||||
Loading…
Reference in New Issue