Massive expansion of ActiveRegionTraversal unit tests, resulting in several bugfixes to ART

-- UnitTests now include combinational tiling of reads within and spanning shard boundaries
-- ART now properly handles shard transitions, and does so efficiently without requiring hash sets or other collections of reads
-- Updating HC and CountReadsInActiveRegions integration tests
This commit is contained in:
Mark DePristo 2013-01-16 15:29:26 -05:00
parent ddcb33fcf8
commit 2a42b47e4a
7 changed files with 482 additions and 140 deletions

View File

@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "1e2671557b01ad0497557097282965fc");
HCTest(CEUTRIO_BAM, "", "b8f7b741445ce6b6ea491c794ce75c17");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "2bd237a7e1e63eebe755dbe7963e430a");
HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15");
}
@Test(enabled = false)
@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"a938cdd7262968597fc8eb6c1c0a69f1");
"c679ae7f04bdfda896b5c046d35e043c");
}
private void HCTestComplexGGA(String bam, String args, String md5) {
@ -102,7 +102,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
"d590c8d6d5e58d685401b65a23846893");
"1a034b7eb572e1b6f659d6e5d57b3e76");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "50a26224b9e863ee47a0619eb54a0323");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735");
}
// That problem bam came from a user on the forum and it spotted a problem where the ReadClipper
@ -146,7 +146,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4439496472eb1e2f5c91b30ba525be37"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8b1b8d1bd7feac1503fc4ffa6236cff7"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.traversals;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.WalkerManager;
@ -47,24 +48,36 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
/**
* Created with IntelliJ IDEA.
* Implement active region traversal
*
* User: depristo
* Date: 1/9/13
* Time: 4:45 PM
* To change this template use File | Settings | File Templates.
*
* Live region:
*
* The ART tracks a thing called the live region. The live region is a position on a specific contig
* of the alignment start of the last read we processed during this traversal. Because the
 * read stream is sorted, future reads must occur in the live region. Therefore the dead region
* (everything to the left of the live boundary) cannot have any more read data. The live / dead
* regions are used to decide when we can safely call map on active regions, as only active regions
* contained completely within the dead region (including extensions) have a complete set of read data
* in the collected read list. All of the data related to the live region is captured by the local
* variable spanOfLastReadSeen
*
*/
public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegionWalker<M,T>,LocusShardDataProvider> {
protected final static Logger logger = Logger.getLogger(TraversalEngine.class);
protected final static boolean DEBUG = false;
// set by the traversal
private int activeRegionExtension = -1;
private int maxRegionSize = -1;
/**
* our log, which we want to capture anything from this class
*/
protected final static Logger logger = Logger.getLogger(TraversalEngine.class);
protected final LinkedList<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
private final LinkedList<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
private LinkedList<GATKSAMRecord> myReads = new LinkedList<GATKSAMRecord>();
private GenomeLoc spanOfLastReadSeen = null;
protected int getActiveRegionExtension() {
return activeRegionExtension;
@ -79,6 +92,11 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
return "active regions";
}
@Override
public String toString() {
return "TraverseActiveRegions";
}
@Override
public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) {
super.initialize(engine, walker, progressMeter);
@ -153,23 +171,58 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
}
}
private LinkedList<GATKSAMRecord> myReads = new LinkedList<GATKSAMRecord>();
private Shard lastShard = null;
/**
 * Did read appear in the last shard?
 *
 * When we transition across shard boundaries we see duplicate reads because
 * each shard contains the reads that *overlap* the shard. So if we just finished
 * shard 1-1000 and are now in 1001-2000 we'll see duplicate reads from 1001
 * that overlapped 1-1000. This function tests read to determine if we would have
 * seen it before by asking if read.getAlignmentStart() is at or before the
 * start of the last read seen at the start of the traversal. The reason
 * we need to use the location of the last read at the start of the traversal
 * is that we update the last read during the traversal, and we only want to filter
 * out reads whose start is before the last read of the previous shard, not the
 * current shard.
 *
 * NOTE(review): an earlier description said this compared against the *stop* position
 * of the last seen read, but the code compares against getStart() -- confirm which
 * boundary is intended.
 *
 * @param locOfLastReadAtTraversalStart the location of the last read seen at the start of the traversal, or null if no read has been seen yet
 * @param read the read we want to test if it's already been seen in the last shard
 * @return true if read would have appeared in the last shard, false otherwise
 */
protected boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) {
    if ( locOfLastReadAtTraversalStart == null )
        // we're in the first shard, so obviously the answer is no
        return false;
    else {
        // otherwise check to see if the alignment occurred in the previous shard:
        // the read starts at or before the last previously-seen read's start position
        return read.getAlignmentStart() <= locOfLastReadAtTraversalStart.getStart()
                // we're on the same contig
                && read.getReferenceIndex() == locOfLastReadAtTraversalStart.getContigIndex();
    }
}
// -------------------------------------------------------------------------------------
//
// Actual traverse function
//
// -------------------------------------------------------------------------------------
/**
 * Is the current shard on a new contig w.r.t. the previous shard?
 *
 * Compares the contig of the last read seen during the traversal against the
 * contig of the shard we are about to process.
 *
 * @param currentShard the current shard we are processing
 * @return true if the last shard was on a different contig than the current shard
 */
private boolean onNewContig(final Shard currentShard) {
    final GenomeLoc lastReadSpan = spanOfLastSeenRead();
    if ( lastReadSpan == null )
        // no reads have been seen yet, so we cannot be on a new contig
        return false;
    return currentShard.getLocation().getContigIndex() != lastReadSpan.getContigIndex();
}
@Override
public T traverse( final ActiveRegionWalker<M,T> walker,
final LocusShardDataProvider dataProvider,
T sum) {
if ( DEBUG ) logger.warn(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
final HashSet<GATKSAMRecord> maybeDuplicatedReads = new HashSet<GATKSAMRecord>();
// TODO -- there's got to be a better way to know this
if ( lastShard != dataProvider.getShard() ) {
maybeDuplicatedReads.addAll(myReads);
logger.info("Crossing shard boundary requires us to check for duplicates against " + maybeDuplicatedReads.size() + " reads");
if ( DEBUG ) logger.warn("Clearing myReads");
}
lastShard = dataProvider.getShard();
logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider));
final LocusView locusView = new AllLocusView(dataProvider);
@ -181,6 +234,12 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView);
// We keep processing while the next reference location is within the interval
final GenomeLoc locOfLastReadAtTraversalStart = spanOfLastSeenRead();
// if we've moved onto a new contig, process all of the active regions
if ( onNewContig(dataProvider.getShard()) )
sum = processActiveRegions(walker, sum, true);
GenomeLoc prevLoc = null;
while( locusView.hasNext() ) {
final AlignmentContext locus = locusView.next();
@ -191,15 +250,12 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
// reads may occur outside our intervals but overlap them in the future
final Collection<GATKSAMRecord> reads = locusView.getLIBS().transferReadsFromAllPreviousPileups();
for( final GATKSAMRecord read : reads ) {
notifyOfCurrentPosition(read);
// most of the time maybeDuplicatedReads is empty
// TODO -- I believe that because of the ordering of reads that as soon as we don't find a read in the
// TODO -- potential list of duplicates we can clear the hashset
if ( ! maybeDuplicatedReads.isEmpty() && maybeDuplicatedReads.contains(read) ) {
if ( appearedInLastShard(locOfLastReadAtTraversalStart, read) ) {
if ( DEBUG ) logger.warn("Skipping duplicated " + read.getReadName());
} else {
if ( DEBUG ) logger.warn("Adding read " + read.getReadName() + " at " + engine.getGenomeLocParser().createGenomeLoc(read) + " from provider " + dataProvider);
myReads.add((GATKSAMRecord)read);
rememberLastReadLocation(read);
myReads.add(read);
}
}
@ -257,28 +313,87 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
return sum;
}
private GenomeLoc startOfLiveRegion = null;
protected void notifyOfCurrentPosition(final GATKSAMRecord read) {
notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read));
/**
 * Special function called in LinearMicroScheduler to empty out the work queue.
 * Ugly for now but will be cleaned up when we push this functionality more into the engine
 *
 * Forces every active region still in the work queue to be processed (map/reduce'd),
 * regardless of whether it lies entirely within the dead zone.
 *
 * @param walker the walker to invoke; must actually be an ActiveRegionWalker (cast below)
 * @param sum the current reduce result
 * @return the reduce result after processing all remaining queued active regions
 */
public T endTraversal(final Walker<M, T> walker, T sum) {
    return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true);
}
protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) {
if ( startOfLiveRegion == null )
startOfLiveRegion = currentLocation;
else
startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation());
// -------------------------------------------------------------------------------------
//
// Functions to manage and interact with the live / dead zone
//
// -------------------------------------------------------------------------------------
/**
 * Update the live region to reflect that the last read we've seen in the traversal is read
 *
 * Requires that sequential calls always be provided reads in coordinate sorted order
 *
 * @param read the last read we've seen during the traversal
 * @throws IllegalStateException if read starts before the previously recorded read
 */
protected void rememberLastReadLocation(final GATKSAMRecord read) {
    final GenomeLoc readSpan = engine.getGenomeLocParser().createGenomeLoc(read);
    // enforce the coordinate-sorted contract before advancing the live region
    if ( spanOfLastReadSeen != null && readSpan.isBefore(spanOfLastReadSeen) )
        throw new IllegalStateException("Updating last read seen in the traversal with read " + read + " with span " + readSpan + " but this occurs before the previously seen read " + spanOfLastReadSeen);
    spanOfLastReadSeen = readSpan;
}
protected GenomeLoc getStartOfLiveRegion() {
return startOfLiveRegion;
/**
 * Get a GenomeLoc indicating the start (heading to the right) of the live ART region.
 *
 * May be null if no read has been recorded yet via rememberLastReadLocation().
 *
 * @return the left-most position of the live region on the genome, or null before any read is seen
 */
protected GenomeLoc spanOfLastSeenRead() {
    return spanOfLastReadSeen;
}
protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) {
return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0)))
|| ! region.onSameContig(getStartOfLiveRegion());
/**
 * Is the active region completely within the traversal's dead zone?
 *
 * A region is dead when its extended span ends strictly before the start of the
 * live region, or when it sits on a different contig than the last read seen.
 *
 * @param region the region we want to test
 * @return true if the extended location of region is completely within the current dead zone, false otherwise
 */
protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) {
    final GenomeLoc extendedLoc = region.getExtendedLoc();
    final GenomeLoc liveRegionStart = spanOfLastSeenRead();
    if ( ! extendedLoc.onSameContig(liveRegionStart) )
        // a different contig is by definition entirely behind us
        return true;
    return extendedLoc.getStop() < liveRegionStart.getStart();
}
/**
 * Is the read dead? That is, can it no longer be in any future active region, and therefore can be discarded?
 *
 * read:   start |--------> stop ------ stop + extension
 * region: start |-----------------| end
 *
 * Since the regions are coming in order, read could potentially be contained in a future interval if
 * stop + activeRegionExtension >= end. If, on the other hand, stop + extension is < the end
 * of this region, then we can discard it, since any future region could only include reads
 * up to end + 1 - extension.
 *
 * Note that this function doesn't care about the dead zone. We're assuming that by
 * actually calling this function with an active region that region is already in the dead zone,
 * so checking that the read is in the dead zone doesn't make sense.
 *
 * @param read the read we're testing
 * @param activeRegion the current active region
 * @return true if the read is dead, false otherwise
 */
@Requires({"read != null", "activeRegion != null"})
private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) {
    return read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop();
}
// -------------------------------------------------------------------------------------
//
// Functions to process active regions that are ready for map / reduce calls
//
// -------------------------------------------------------------------------------------
private T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
if( walker.activeRegionOutStream != null ) {
writeActiveRegionsToStream(walker);
@ -292,11 +407,10 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
// TODO can implement parallel traversal here
while( workQueue.peek() != null ) {
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) {
final ActiveRegion activeRegion = workQueue.remove();
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion());
sum = processActiveRegion( activeRegion, sum, walker );
final ActiveRegion activeRegion = workQueue.peek();
if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) {
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + spanOfLastSeenRead());
sum = processActiveRegion( workQueue.remove(), sum, walker );
} else {
break;
}
@ -305,15 +419,6 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
return sum;
}
@Override
public String toString() {
return "TraverseActiveRegions";
}
private boolean readIsDead(final GATKSAMRecord read, final GenomeLoc readLoc, final ActiveRegion activeRegion) {
return readLoc.getStop() < activeRegion.getLocation().getStart() && regionCompletelyWithinDeadZone(readLoc, true);
}
protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
final Iterator<GATKSAMRecord> liveReads = myReads.iterator();
while ( liveReads.hasNext() ) {
@ -325,7 +430,7 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
activeRegion.add(read);
if ( ! walker.wantsNonPrimaryReads() ) {
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + spanOfLastSeenRead());
liveReads.remove();
killed = true;
}
@ -333,8 +438,8 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
activeRegion.add( read );
}
if ( ! killed && readIsDead(read, readLoc, activeRegion) ) {
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + getStartOfLiveRegion());
if ( ! killed && readCannotOccurInAnyMoreActiveRegions(read, activeRegion) ) {
if ( DEBUG ) logger.warn("Removing read " + read.getReadName() + " at " + readLoc + " with dead zone start " + spanOfLastSeenRead());
liveReads.remove();
}
}
@ -343,13 +448,4 @@ public class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,ActiveRegio
final M x = walker.map(activeRegion, null);
return walker.reduce( x, sum );
}
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
public T endTraversal(final Walker<M, T> walker, T sum) {
return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true);
}
}

View File

@ -25,7 +25,9 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.NGSPlatform;
import java.io.File;
@ -51,6 +53,9 @@ import java.util.List;
public class ArtificialBAMBuilder {
public final static int BAM_SHARD_SIZE = 16384;
private final IndexedFastaSequenceFile reference;
private final GenomeLocParser parser;
final int nReadsPerLocus;
final int nLoci;
@ -66,14 +71,39 @@ public class ArtificialBAMBuilder {
SAMFileHeader header;
public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) {
public ArtificialBAMBuilder(final IndexedFastaSequenceFile reference, int nReadsPerLocus, int nLoci) {
this.nReadsPerLocus = nReadsPerLocus;
this.nLoci = nLoci;
this.reference = reference;
this.parser = new GenomeLocParser(reference);
createAndSetHeader(1);
}
public ArtificialBAMBuilder(int nReadsPerLocus, int nLoci) {
this(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000).getSequenceDictionary(), nReadsPerLocus, nLoci);
}
public ArtificialBAMBuilder(final SAMSequenceDictionary dict, int nReadsPerLocus, int nLoci) {
this.nReadsPerLocus = nReadsPerLocus;
this.nLoci = nLoci;
this.reference = null;
this.parser = new GenomeLocParser(dict);
createAndSetHeader(1);
}
public IndexedFastaSequenceFile getReference() {
return reference;
}
public GenomeLocParser getGenomeLocParser() {
return parser;
}
public ArtificialBAMBuilder createAndSetHeader(final int nSamples) {
this.header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
this.header = new SAMFileHeader();
header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
header.setSequenceDictionary(parser.getContigs());
samples.clear();
for ( int i = 0; i < nSamples; i++ ) {
@ -156,6 +186,11 @@ public class ArtificialBAMBuilder {
public SAMFileHeader getHeader() { return header; }
public ArtificialBAMBuilder setHeader(SAMFileHeader header) { this.header = header; return this; }
/**
 * Get the right-most genomic position touched by the reads this builder will generate.
 *
 * Computed as alignmentStart + nLoci * (skipNLoci + 1) + readLength.
 * NOTE(review): this may overshoot the true last aligned base by roughly one
 * (skipNLoci + 1) stride for the final locus -- confirm against the generated BAM;
 * as an upper bound for enumerating test active regions, an overshoot is harmless.
 */
public int getAlignmentEnd() {
    return alignmentStart + nLoci * (skipNLoci + 1) + readLength;
}
public int getNSamples() { return samples.size(); }
public int expectedNumberOfReads() {

View File

@ -0,0 +1,104 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import java.util.*;
/**
 * ActiveRegionWalker for unit testing
 *
 * Records every isActive() query and every region passed to map() so that unit
 * tests can verify exactly which loci the traversal visited and which active
 * regions it emitted. Activity can be forced everywhere (constant probability)
 * or restricted to an explicit GenomeLocSortedSet mask.
 *
 * User: depristo
 * Date: 1/15/13
 * Time: 1:28 PM
 */
class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
    // constant activity probability reported by isActive() for active loci
    private final double prob;
    // read states this walker requests from the traversal; defaults to the superclass's set
    private EnumSet<ActiveRegionReadState> states = super.desiredReadStates();
    // if non-null, only loci overlapping this set report probability prob; all others report 0.0
    private GenomeLocSortedSet activeRegions = null;

    // every locus isActive() was called with, in traversal order
    protected List<GenomeLoc> isActiveCalls = new ArrayList<GenomeLoc>();
    // regions received by map(), keyed by location; LinkedHashMap preserves map() call order
    protected Map<GenomeLoc, ActiveRegion> mappedActiveRegions = new LinkedHashMap<GenomeLoc, ActiveRegion>();

    public DummyActiveRegionWalker() {
        this(1.0);
    }

    public DummyActiveRegionWalker(double constProb) {
        this.prob = constProb;
    }

    public DummyActiveRegionWalker(EnumSet<ActiveRegionReadState> wantStates) {
        this(1.0);
        this.states = wantStates;
    }

    public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions) {
        this(1.0);
        this.activeRegions = activeRegions;
    }

    public void setStates(EnumSet<ActiveRegionReadState> states) {
        this.states = states;
    }

    @Override
    public EnumSet<ActiveRegionReadState> desiredReadStates() {
        return states;
    }

    @Override
    public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        isActiveCalls.add(ref.getLocus());
        // active only where the (optional) mask overlaps; probability 0.0 elsewhere
        final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? prob : 0.0;
        return new ActivityProfileResult(ref.getLocus(), p);
    }

    @Override
    public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) {
        mappedActiveRegions.put(activeRegion.getLocation(), activeRegion);
        return 0;
    }

    @Override
    public Integer reduceInit() {
        return 0;
    }

    @Override
    public Integer reduce(Integer value, Integer sum) {
        return 0;
    }
}

View File

@ -30,33 +30,26 @@ import net.sf.samtools.*;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.*;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.executive.WindowMaker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
@ -80,54 +73,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
private final static boolean ENFORCE_CONTRACTS = false;
private final static boolean DEBUG = false;
private class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
private final double prob;
private EnumSet<ActiveRegionReadState> states = super.desiredReadStates();
protected List<GenomeLoc> isActiveCalls = new ArrayList<GenomeLoc>();
protected Map<GenomeLoc, ActiveRegion> mappedActiveRegions = new HashMap<GenomeLoc, ActiveRegion>();
public DummyActiveRegionWalker() {
this.prob = 1.0;
}
public DummyActiveRegionWalker(double constProb) {
this.prob = constProb;
}
public DummyActiveRegionWalker(EnumSet<ActiveRegionReadState> wantStates) {
this.prob = 1.0;
this.states = wantStates;
}
@Override
public EnumSet<ActiveRegionReadState> desiredReadStates() {
return states;
}
@Override
public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
isActiveCalls.add(ref.getLocus());
return new ActivityProfileResult(ref.getLocus(), prob);
}
@Override
public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) {
mappedActiveRegions.put(activeRegion.getLocation(), activeRegion);
return 0;
}
@Override
public Integer reduceInit() {
return 0;
}
@Override
public Integer reduce(Integer value, Integer sum) {
return 0;
}
}
@DataProvider(name = "TraversalEngineProvider")
public Object[][] makeTraversals() {
final List<Object[]> traversals = new LinkedList<Object[]>();
@ -297,7 +242,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
}
}
@Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
@Test(enabled = true, dataProvider = "TraversalEngineProvider")
public void testPrimaryReadMapping(TraverseActiveRegions t) {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
@ -340,7 +285,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
verifyReadMapping(region, "simple20");
}
@Test(enabled = true, dataProvider = "TraversalEngineProvider")
@Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider")
public void testNonPrimaryReadMapping(TraverseActiveRegions t) {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(
EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY));
@ -456,7 +401,11 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
}
private Map<GenomeLoc, ActiveRegion> getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM))
return getActiveRegions(t, walker, intervals, testBAM);
}
private Map<GenomeLoc, ActiveRegion> getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List<GenomeLoc> intervals, final String bam) {
for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam))
t.traverse(walker, dataProvider, 0);
t.endTraversal(walker, 0);
@ -516,14 +465,15 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
record.setCigar(cigar);
record.setReadString(new String(new char[len]).replace("\0", "A"));
record.setBaseQualities(new byte[len]);
record.setReadGroup(new GATKSAMReadGroupRecord(header.getReadGroup("test")));
return record;
}
private List<LocusShardDataProvider> createDataProviders(TraverseActiveRegions t, final Walker walker, List<GenomeLoc> intervals, String bamFile) {
private List<LocusShardDataProvider> createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List<GenomeLoc> intervals, String bamFile) {
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
engine.setGenomeLocParser(genomeLocParser);
t.initialize(engine, walker);
traverseActiveRegions.initialize(engine, walker);
Collection<SAMReaderID> samFiles = new ArrayList<SAMReaderID>();
SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags());
@ -539,13 +489,144 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
new ArrayList<ReadTransformer>(),
false, (byte)30, false, true);
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());
List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) {
for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) {
for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) {
providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>()));
}
}
return providers;
}
/**
 * Combinatorial test-case provider tiling artificial reads within and across BAM shard boundaries.
 *
 * Enumerates the cross product of read length, inter-locus skip count, alignment start
 * (chromosome start, just before the shard boundary, just after it), reads per locus,
 * number of loci, desired read states, and a set of active-region masks spanning the
 * generated reads.
 *
 * @return TestNG data rows of {testId, activeRegions, readStates, bamBuilder}
 */
@DataProvider(name = "CombinatorialARTTilingProvider")
public Object[][] makeCombinatorialARTTilingProvider() {
    final List<Object[]> tests = new LinkedList<Object[]>();

    // alignment starts chosen to exercise shard-boundary edge cases
    final List<Integer> starts = Arrays.asList(
            1, // very start of the chromosome
            ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary
            ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary
    );

    final List<EnumSet<ActiveRegionReadState>> allReadStates = Arrays.asList(
            EnumSet.of(ActiveRegionReadState.PRIMARY),
            EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY),
            EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED)
    );

    // maxTests lets a developer cap the generated cases while debugging;
    // MAX_VALUE means all combinations are emitted
    final int maxTests = Integer.MAX_VALUE;
    int nTests = 0;
    for ( final int readLength : Arrays.asList(10, 100) ) {
        for ( final int skips : Arrays.asList(0, 1, 10) ) {
            for ( final int start : starts ) {
                for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) {
                    for ( final int nLoci : Arrays.asList(1, 1000) ) {
                        for ( EnumSet<ActiveRegionReadState> readStates : allReadStates ) {
                            final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci);
                            bamBuilder.setReadLength(readLength);
                            bamBuilder.setSkipNLoci(skips);
                            bamBuilder.setAlignmentStart(start);

                            for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())) {
                                nTests++;
                                if ( nTests < maxTests ) // && nTests == 1238 )
                                    tests.add(new Object[]{nTests, activeRegions, readStates, bamBuilder});
                            }
                        }
                    }
                }
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
private Collection<GenomeLocSortedSet> enumerateActiveRegions(final int start, final int stop) {
    // Produces a variety of active-region masks over [start, stop]: alternating
    // active/inactive tilings at several step sizes (starting either active or
    // inactive), plus two whole-interval edge cases.
    final List<GenomeLocSortedSet> masks = new LinkedList<GenomeLocSortedSet>();

    for ( final int stepSize : Arrays.asList(11, 29, 53, 97) ) {
        masks.add(makeActiveRegionMask(start, stop, stepSize, true));
        masks.add(makeActiveRegionMask(start, stop, stepSize, false));
    }

    // the active region is the whole interval
    masks.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start, stop)));

    // the active region extends to the end of the data but excludes the first 10 bases
    masks.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start + 10, stop)));

    return masks;
}
private GenomeLocSortedSet makeActiveRegionMask(final int start, final int stop, final int stepSize, final boolean startWithActive) {
    // Tiles [start, stop) with consecutive windows of width stepSize, adding every
    // other window to the active set so active and inactive stretches alternate.
    // NOTE(review): each window spans [left, left + stepSize] inclusive, so the
    // final window may extend past stop — presumably harmless for a mask; confirm.
    final GenomeLocSortedSet mask = new GenomeLocSortedSet(genomeLocParser);
    boolean active = startWithActive;
    for ( int windowStart = start; windowStart < stop; windowStart += stepSize ) {
        final GenomeLoc window = genomeLocParser.createGenomeLoc("1", windowStart, windowStart + stepSize);
        if ( active ) {
            mask.add(window);
        }
        active = !active;
    }
    return mask;
}
@Test(enabled = true, dataProvider = "CombinatorialARTTilingProvider")
public void testARTReadsInActiveRegions(final int id, final GenomeLocSortedSet activeRegions, final EnumSet<ActiveRegionReadState> readStates, final ArtificialBAMBuilder bamBuilder) {
    // For every active region produced by the traversal, verify that it contains
    // exactly the reads it should: reads overlapping the region (or its extended
    // span when EXTENDED is enabled), subject to the primary/non-primary rules.
    logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder);
    final List<GenomeLoc> intervals = Arrays.asList(
            genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd())
    );

    final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions);
    walker.setStates(readStates);
    final TraverseActiveRegions traversal = new TraverseActiveRegions<Integer, Integer>();
    final Map<GenomeLoc, ActiveRegion> activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile().toString());

    // the full set of reads in the BAM is the same for every region; build it once
    final List<GATKSAMRecord> allReads = bamBuilder.makeReads();

    final Set<String> alreadySeenReads = new HashSet<String>(); // for use with the primary / non-primary
    for ( final ActiveRegion region : activeRegionsMap.values() ) {
        // invariant across the reads we check below; hoisted out of the read loop
        final Set<String> readNamesInRegion = readNamesInRegion(region);
        int nReadsExpectedInRegion = 0;

        for ( final GATKSAMRecord read : allReads ) {
            final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read);

            // with EXTENDED enabled a read belongs to the region if it overlaps the
            // extended span; otherwise it must overlap the region proper
            boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED)
                    ? region.getExtendedLoc().overlapsP(readLoc)
                    : region.getLocation().overlapsP(readLoc);

            // without NONPRIMARY, each read may appear in at most one region: the
            // first region that claims it wins, all later regions must exclude it
            if ( ! readStates.contains(ActiveRegionReadState.NONPRIMARY) ) {
                if ( alreadySeenReads.contains(read.getReadName()) )
                    shouldBeInRegion = false;
                else if ( shouldBeInRegion )
                    alreadySeenReads.add(read.getReadName());
            }

            Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, "Region " + region +
                    " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite");

            nReadsExpectedInRegion += shouldBeInRegion ? 1 : 0;
        }

        Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + " than expected");
    }
}
private Set<String> readNamesInRegion(final ActiveRegion region) {
    // Collects the names of all reads currently assigned to region,
    // preserving the order in which the region holds them.
    final Set<String> names = new LinkedHashSet<String>(region.getReads().size());
    for ( final SAMRecord regionRead : region.getReads() ) {
        names.add(regionRead.getReadName());
    }
    return names;
}
}

View File

@ -54,6 +54,32 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
private static final boolean DEBUG = false;
protected LocusIteratorByState li;
@Test(enabled = true)
public void testUnmappedAndAllIReadsPassThrough() {
    // Pileups must contain only the aligned reads, but the unmapped and
    // all-insertion reads must still be returned when raw reads are transferred.
    final int readLength = 10;

    final GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength);
    final GATKSAMRecord mapped2 = ArtificialSAMUtils.createArtificialRead(header,"mapped2",0,1,readLength);

    final GATKSAMRecord unmapped = ArtificialSAMUtils.createArtificialRead(header,"unmapped",0,1,readLength);
    unmapped.setReadUnmappedFlag(true);
    unmapped.setCigarString("*");

    final GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(header,"allI",0,1,readLength);
    allI.setCigarString(readLength + "I");

    final List<GATKSAMRecord> reads = Arrays.asList(mapped1, unmapped, allI, mapped2);

    // create the iterator by state with the fake reads and fake records
    li = makeLTBS(reads, createTestReadProperties(DownsamplingMethod.NONE, true));

    Assert.assertTrue(li.hasNext());
    final AlignmentContext context = li.next();
    final ReadBackedPileup pileup = context.getBasePileup();
    Assert.assertEquals(pileup.depthOfCoverage(), 2, "Should see only 2 reads in pileup, even with unmapped and all I reads");

    final List<GATKSAMRecord> rawReads = li.transferReadsFromAllPreviousPileups();
    Assert.assertEquals(rawReads, reads, "Input and transferred read lists should be the same, and include the unmapped and all I reads");
}
@Test(enabled = true && ! DEBUG)
public void testXandEQOperators() {
final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'};
@ -451,7 +477,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null, false)
: new DownsamplingMethod(DownsampleType.NONE, null, null, false);
final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(nReadsPerLocus, nLoci);
final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(header.getSequenceDictionary(), nReadsPerLocus, nLoci);
bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1);
final List<GATKSAMRecord> reads = bamBuilder.makeReads();

View File

@ -47,8 +47,8 @@ import java.util.List;
* To change this template use File | Settings | File Templates.
*/
public class ArtificialBAMBuilderUnitTest extends BaseTest {
@DataProvider(name = "CombinatorialARTTilingProvider")
public Object[][] makeCombinatorialARTTilingProvider() {
@DataProvider(name = "ArtificialBAMBuilderUnitTestProvider")
public Object[][] makeArtificialBAMBuilderUnitTestProvider() {
final List<Object[]> tests = new LinkedList<Object[]>();
final List<Integer> starts = Arrays.asList(
@ -79,7 +79,7 @@ public class ArtificialBAMBuilderUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "CombinatorialARTTilingProvider")
@Test(dataProvider = "ArtificialBAMBuilderUnitTestProvider")
public void testBamProvider(final ArtificialBAMBuilder bamBuilder, int readLength, int skips, int start, int nSamples, int nReadsPerLocus, int nLoci) {
Assert.assertEquals(bamBuilder.getReadLength(), readLength);
Assert.assertEquals(bamBuilder.getSkipNLoci(), skips);