Split original and optimized ART into largely independent pieces

-- Allows us to cleanly run old and new art, which now have different traversal behavior (on purpose).  Split unit tests as well.
This commit is contained in:
Mark DePristo 2013-01-10 20:31:26 -05:00
parent 02130dfde7
commit b9a33d3c66
5 changed files with 682 additions and 105 deletions

View File

@ -68,6 +68,12 @@ public abstract class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Ac
abstract protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker);
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
public abstract T endTraversal(final Walker<M, T> walker, T sum);
protected int getActiveRegionExtension() {
return activeRegionExtension;
}
@ -141,21 +147,12 @@ public abstract class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Ac
return (RodLocusView)locusView;
}
protected T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
if( walker.activeRegionOutStream != null ) {
writeActiveRegionsToStream(walker);
return sum;
} else {
return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive);
}
}
/**
* Write out each active region to the walker activeRegionOutStream
*
* @param walker
*/
private void writeActiveRegionsToStream( final ActiveRegionWalker<M, T> walker ) {
protected void writeActiveRegionsToStream( final ActiveRegionWalker<M, T> walker ) {
// Just want to output the active regions to a file, not actually process them
for( final ActiveRegion activeRegion : workQueue ) {
if( activeRegion.isActive ) {
@ -163,66 +160,4 @@ public abstract class TraverseActiveRegions<M, T> extends TraversalEngine<M,T,Ac
}
}
}
private GenomeLoc startOfLiveRegion = null;
protected void notifyOfCurrentPosition(final GATKSAMRecord read) {
notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read));
}
protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) {
if ( startOfLiveRegion == null )
startOfLiveRegion = currentLocation;
else
startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation());
}
protected GenomeLoc getStartOfLiveRegion() {
return startOfLiveRegion;
}
protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) {
return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0)))
|| ! region.onSameContig(getStartOfLiveRegion());
}
private T callWalkerMapOnActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
// TODO can implement parallel traversal here
while( workQueue.peek() != null ) {
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) {
final ActiveRegion activeRegion = workQueue.remove();
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion());
sum = processActiveRegion( activeRegion, sum, walker );
} else {
break;
}
}
return sum;
}
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
public T endTraversal(final Walker<M, T> walker, T sum) {
return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true);
}
// todo -- remove me
protected ActiveRegion getBestRegion(final ActiveRegion activeRegion, final GenomeLoc readLoc) {
long minStart = activeRegion.getLocation().getStart();
ActiveRegion bestRegion = activeRegion;
for( final ActiveRegion otherRegionToTest : workQueue ) {
if( otherRegionToTest.getLocation().getStart() < minStart ) {
minStart = otherRegionToTest.getLocation().getStart();
bestRegion = otherRegionToTest;
}
}
return bestRegion;
}
}

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
@ -151,6 +152,54 @@ public class TraverseActiveRegionsOptimized<M,T> extends TraverseActiveRegions<M
return sum;
}
private GenomeLoc startOfLiveRegion = null;
protected void notifyOfCurrentPosition(final GATKSAMRecord read) {
notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(read));
}
protected void notifyOfCurrentPosition(final GenomeLoc currentLocation) {
if ( startOfLiveRegion == null )
startOfLiveRegion = currentLocation;
else
startOfLiveRegion = startOfLiveRegion.max(currentLocation.getStartLocation());
}
protected GenomeLoc getStartOfLiveRegion() {
return startOfLiveRegion;
}
protected boolean regionCompletelyWithinDeadZone(final GenomeLoc region, final boolean includeExtension) {
return (region.getStop() < (getStartOfLiveRegion().getStart() - (includeExtension ? getActiveRegionExtension() : 0)))
|| ! region.onSameContig(getStartOfLiveRegion());
}
private T processActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
if( walker.activeRegionOutStream != null ) {
writeActiveRegionsToStream(walker);
return sum;
} else {
return callWalkerMapOnActiveRegions(walker, sum, forceRegionsToBeActive);
}
}
private T callWalkerMapOnActiveRegions(final ActiveRegionWalker<M, T> walker, T sum, final boolean forceRegionsToBeActive) {
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
// TODO can implement parallel traversal here
while( workQueue.peek() != null ) {
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
if ( forceRegionsToBeActive || regionCompletelyWithinDeadZone(extendedLoc, false) ) {
final ActiveRegion activeRegion = workQueue.remove();
if ( DEBUG ) logger.warn("Processing active region " + activeRegion + " dead zone " + getStartOfLiveRegion());
sum = processActiveRegion( activeRegion, sum, walker );
} else {
break;
}
}
return sum;
}
@Override
public String toString() {
return "TraverseActiveRegionsOptimized";
@ -190,4 +239,15 @@ public class TraverseActiveRegionsOptimized<M,T> extends TraverseActiveRegions<M
final M x = walker.map(activeRegion, null);
return walker.reduce( x, sum );
}
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
@Override
public T endTraversal(final Walker<M, T> walker, T sum) {
return processActiveRegions((ActiveRegionWalker<M, T>)walker, sum, true);
}
}

View File

@ -1,14 +1,19 @@
package org.broadinstitute.sting.gatk.traversals;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfile;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -23,14 +28,6 @@ import java.util.*;
public class TraverseActiveRegionsOriginal<M,T> extends TraverseActiveRegions<M,T> {
private final LinkedHashSet<GATKSAMRecord> myReads = new LinkedHashSet<GATKSAMRecord>();
protected Collection<GATKSAMRecord> getReadsInCurrentRegion() {
return myReads;
}
protected void removeReadsFromCurrentRegion(final List<GATKSAMRecord> placedReads) {
myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
}
@Override
public T traverse( final ActiveRegionWalker<M,T> walker,
final LocusShardDataProvider dataProvider,
@ -40,6 +37,8 @@ public class TraverseActiveRegionsOriginal<M,T> extends TraverseActiveRegions<M,
final LocusView locusView = new AllLocusView(dataProvider);
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension();
final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion();
int minStart = Integer.MAX_VALUE;
final List<ActiveRegion> activeRegions = new LinkedList<ActiveRegion>();
@ -75,7 +74,7 @@ public class TraverseActiveRegionsOriginal<M,T> extends TraverseActiveRegions<M,
if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) {
// we've move across some interval boundary, restart profile
profile = incorporateActiveRegions(profile, activeRegions);
profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
}
dataProvider.getShard().getReadMetrics().incrementNumIterations();
@ -98,7 +97,7 @@ public class TraverseActiveRegionsOriginal<M,T> extends TraverseActiveRegions<M,
updateCumulativeMetrics(dataProvider.getShard());
if ( ! profile.isEmpty() )
incorporateActiveRegions(profile, activeRegions);
incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize);
// add active regions to queue of regions to process
// first check if can merge active regions over shard boundaries
@ -106,10 +105,10 @@ public class TraverseActiveRegionsOriginal<M,T> extends TraverseActiveRegions<M,
if( !workQueue.isEmpty() ) {
final ActiveRegion last = workQueue.getLast();
final ActiveRegion first = activeRegions.get(0);
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= getMaxRegionSize() ) {
if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) {
workQueue.removeLast();
activeRegions.remove(first);
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), getActiveRegionExtension()) );
workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) );
}
}
workQueue.addAll( activeRegions );
@ -117,32 +116,84 @@ public class TraverseActiveRegionsOriginal<M,T> extends TraverseActiveRegions<M,
logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." );
// set the dead zone to the min. This is incorrect but necessary because of the way we handle things in processActiveRegion
notifyOfCurrentPosition(engine.getGenomeLocParser().createGenomeLoc(dataProvider.getLocus().getContig(), minStart));
// now go and process all of the active regions
sum = processActiveRegions(walker, sum, false);
sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig());
return sum;
}
/**
* Take the individual isActive calls and integrate them into contiguous active regions and
* add these blocks of work to the work queue
* band-pass filter the list of isActive probabilities and turn into active regions
*
* @param profile
* @param activeRegions
* @param activeRegionExtension
* @param maxRegionSize
* @return
*/
private ActivityProfile incorporateActiveRegions(final ActivityProfile profile,
final List<ActiveRegion> activeRegions,
final int activeRegionExtension,
final int maxRegionSize) {
if ( profile.isEmpty() )
throw new IllegalStateException("trying to incorporate an empty active profile " + profile);
final ActivityProfile bandPassFiltered = profile.bandPassFilter();
activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ));
return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() );
}
// --------------------------------------------------------------------------------
//
// code to handle processing active regions
//
// --------------------------------------------------------------------------------
private T processActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
if( walker.activeRegionOutStream != null ) {
writeActiveRegionsToStream(walker);
return sum;
} else {
return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig);
}
}
private T callWalkerMapOnActiveRegions( final ActiveRegionWalker<M,T> walker, T sum, final int minStart, final String currentContig ) {
// Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them
// TODO can implement parallel traversal here
while( workQueue.peek() != null ) {
final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc();
if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) {
final ActiveRegion activeRegion = workQueue.remove();
sum = processActiveRegion( activeRegion, sum, walker );
} else {
break;
}
}
return sum;
}
@Override
public String toString() {
return "TraverseActiveRegionsOriginal";
}
@Override
protected T processActiveRegion(final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M, T> walker) {
protected T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker<M,T> walker ) {
final ArrayList<GATKSAMRecord> placedReads = new ArrayList<GATKSAMRecord>();
for( final GATKSAMRecord read : getReadsInCurrentRegion() ) {
for( final GATKSAMRecord read : myReads ) {
final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
if( activeRegion.getLocation().overlapsP( readLoc ) ) {
// The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region)
final ActiveRegion bestRegion = getBestRegion(activeRegion, readLoc);
long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc );
ActiveRegion bestRegion = activeRegion;
for( final ActiveRegion otherRegionToTest : workQueue ) {
if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) {
maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc );
bestRegion = otherRegionToTest;
}
}
bestRegion.add( read );
// The read is also added to all other regions in which it overlaps but marked as non-primary
if( walker.wantsNonPrimaryReads() ) {
if( !bestRegion.equals(activeRegion) ) {
activeRegion.add( read );
@ -160,16 +211,27 @@ public class TraverseActiveRegionsOriginal<M,T> extends TraverseActiveRegions<M,
}
placedReads.add( read );
// check for non-primary vs. extended
} else if( activeRegion.getLocation().overlapsP( readLoc ) ) {
if ( walker.wantsNonPrimaryReads() ) {
activeRegion.add( read );
}
} else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) {
activeRegion.add( read );
}
}
removeReadsFromCurrentRegion(placedReads);
myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
// WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way.
logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc());
final M x = walker.map(activeRegion, null);
final M x = walker.map( activeRegion, null );
return walker.reduce( x, sum );
}
/**
* Special function called in LinearMicroScheduler to empty out the work queue.
* Ugly for now but will be cleaned up when we push this functionality more into the engine
*/
public T endTraversal( final Walker<M,T> walker, T sum) {
return processActiveRegions((ActiveRegionWalker<M,T>)walker, sum, Integer.MAX_VALUE, null);
}
}

View File

@ -76,9 +76,7 @@ import java.util.*;
* Test the Active Region Traversal Contract
* http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract
*/
public class TraverseActiveRegionsUnitTest extends BaseTest {
private final static boolean INCLUDE_OLD = false;
private final static boolean INCLUDE_NEW = true;
public class TraverseActiveRegionsOptimizedUnitTest extends BaseTest {
private final static boolean ENFORCE_CONTRACTS = false;
private final static boolean DEBUG = false;
@ -133,8 +131,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
@DataProvider(name = "TraversalEngineProvider")
public Object[][] makeTraversals() {
final List<Object[]> traversals = new LinkedList<Object[]>();
if ( INCLUDE_OLD ) traversals.add(new Object[]{new TraverseActiveRegionsOriginal<Integer, Integer>()});
if ( INCLUDE_NEW ) traversals.add(new Object[]{new TraverseActiveRegionsOptimized<Integer, Integer>()});
traversals.add(new Object[]{new TraverseActiveRegionsOptimized<Integer, Integer>()});
return traversals.toArray(new Object[][]{});
}

View File

@ -0,0 +1,523 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import com.google.java.contract.PreconditionError;
import net.sf.samtools.*;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.executive.WindowMaker;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 1/10/13
* Time: 8:03 PM
* To change this template use File | Settings | File Templates.
*/
public class TraverseActiveRegionsOriginalUnitTest extends BaseTest {
private class DummyActiveRegionWalker extends ActiveRegionWalker<Integer, Integer> {
private final double prob;
private EnumSet<ActiveRegionReadState> states = super.desiredReadStates();
protected List<GenomeLoc> isActiveCalls = new ArrayList<GenomeLoc>();
protected Map<GenomeLoc, ActiveRegion> mappedActiveRegions = new HashMap<GenomeLoc, ActiveRegion>();
public DummyActiveRegionWalker() {
this.prob = 1.0;
}
public DummyActiveRegionWalker(double constProb) {
this.prob = constProb;
}
public DummyActiveRegionWalker(EnumSet<ActiveRegionReadState> wantStates) {
this.prob = 1.0;
this.states = wantStates;
}
@Override
public EnumSet<ActiveRegionReadState> desiredReadStates() {
return states;
}
@Override
public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
isActiveCalls.add(ref.getLocus());
return new ActivityProfileResult(ref.getLocus(), prob);
}
@Override
public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) {
mappedActiveRegions.put(activeRegion.getLocation(), activeRegion);
return 0;
}
@Override
public Integer reduceInit() {
return 0;
}
@Override
public Integer reduce(Integer value, Integer sum) {
return 0;
}
}
private final TraverseActiveRegions<Integer, Integer> t = new TraverseActiveRegionsOriginal<Integer, Integer>();
private IndexedFastaSequenceFile reference;
private SAMSequenceDictionary dictionary;
private GenomeLocParser genomeLocParser;
private List<GenomeLoc> intervals;
private static final String testBAM = "TraverseActiveRegionsUnitTest.bam";
private static final String testBAI = "TraverseActiveRegionsUnitTest.bai";
@BeforeClass
private void init() throws FileNotFoundException {
reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference));
dictionary = reference.getSequenceDictionary();
genomeLocParser = new GenomeLocParser(dictionary);
// TODO: reads with indels
// TODO: reads which span many regions
// TODO: reads which are partially between intervals (in/outside extension)
// TODO: duplicate reads
// TODO: read at the end of a contig
// TODO: reads which are completely outside intervals but within extension
// TODO: test the extension itself
// TODO: unmapped reads
intervals = new ArrayList<GenomeLoc>();
intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20));
intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999));
intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999));
intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999));
intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000));
intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100));
intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100));
intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList();
List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
reads.add(buildSAMRecord("simple", "1", 100, 200));
reads.add(buildSAMRecord("overlap_equal", "1", 10, 20));
reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21));
reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009));
reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008));
reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000));
reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050));
reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990));
reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000));
reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385));
reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400));
reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414));
reads.add(buildSAMRecord("simple20", "20", 10025, 10075));
createBAM(reads);
}
private void createBAM(List<GATKSAMRecord> reads) {
File outFile = new File(testBAM);
outFile.deleteOnExit();
File indexFile = new File(testBAI);
indexFile.deleteOnExit();
SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile);
for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) {
out.addAlignment(read);
}
out.close();
}
@Test
public void testAllBasesSeen() {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
List<GenomeLoc> activeIntervals = getIsActiveIntervals(walker, intervals);
// Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call
verifyEqualIntervals(intervals, activeIntervals);
}
private List<GenomeLoc> getIsActiveIntervals(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
List<GenomeLoc> activeIntervals = new ArrayList<GenomeLoc>();
for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM)) {
t.traverse(walker, dataProvider, 0);
activeIntervals.addAll(walker.isActiveCalls);
}
return activeIntervals;
}
@Test (expectedExceptions = PreconditionError.class)
public void testIsActiveRangeLow () {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1);
getActiveRegions(walker, intervals).values();
}
@Test (expectedExceptions = PreconditionError.class)
public void testIsActiveRangeHigh () {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1);
getActiveRegions(walker, intervals).values();
}
@Test
public void testActiveRegionCoverage() {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
Collection<ActiveRegion> activeRegions = getActiveRegions(walker, intervals).values();
verifyActiveRegionCoverage(intervals, activeRegions);
}
private void verifyActiveRegionCoverage(List<GenomeLoc> intervals, Collection<ActiveRegion> activeRegions) {
List<GenomeLoc> intervalStarts = new ArrayList<GenomeLoc>();
List<GenomeLoc> intervalStops = new ArrayList<GenomeLoc>();
for (GenomeLoc interval : intervals) {
intervalStarts.add(interval.getStartLocation());
intervalStops.add(interval.getStopLocation());
}
Map<GenomeLoc, ActiveRegion> baseRegionMap = new HashMap<GenomeLoc, ActiveRegion>();
for (ActiveRegion activeRegion : activeRegions) {
for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) {
// Contract: Regions do not overlap
Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region");
baseRegionMap.put(activeLoc, activeRegion);
}
GenomeLoc start = activeRegion.getLocation().getStartLocation();
if (intervalStarts.contains(start))
intervalStarts.remove(start);
GenomeLoc stop = activeRegion.getLocation().getStopLocation();
if (intervalStops.contains(stop))
intervalStops.remove(stop);
}
for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) {
// Contract: Each location in the interval(s) is in exactly one region
// Contract: The total set of regions exactly matches the analysis interval(s)
Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region");
baseRegionMap.remove(baseLoc);
}
// Contract: The total set of regions exactly matches the analysis interval(s)
Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals");
// Contract: All explicit interval boundaries must also be region boundaries
Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location");
Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location");
}
@Test
public void testActiveRegionExtensionOnContig() {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
Collection<ActiveRegion> activeRegions = getActiveRegions(walker, intervals).values();
for (ActiveRegion activeRegion : activeRegions) {
GenomeLoc loc = activeRegion.getExtendedLoc();
// Contract: active region extensions must stay on the contig
Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig");
int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength();
Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig");
}
}
@Test
public void testPrimaryReadMapping() {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker();
// Contract: Each read has the Primary state in a single region (or none)
// This is the region of maximum overlap for the read (earlier if tied)
// simple: Primary in 1:1-999
// overlap_equal: Primary in 1:1-999
// overlap_unequal: Primary in 1:1-999
// boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
// boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
// boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
// boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
// extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
// outside_intervals: none
// shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
// shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
// shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
// simple20: Primary in 20:10000-10100
Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
ActiveRegion region;
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
verifyReadMapping(region, "boundary_unequal", "extended_and_np", "boundary_1_pre");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
verifyReadMapping(region, "boundary_equal", "boundary_1_post");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
verifyReadMapping(region, "shard_boundary_1_pre");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal");
region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
verifyReadMapping(region, "simple20");
}
@Test
public void testNonPrimaryReadMapping() {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(
EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY));
// Contract: Each read has the Primary state in a single region (or none)
// This is the region of maximum overlap for the read (earlier if tied)
// Contract: Each read has the Non-Primary state in all other regions it overlaps
// simple: Primary in 1:1-999
// overlap_equal: Primary in 1:1-999
// overlap_unequal: Primary in 1:1-999
// boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
// boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
// boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
// boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
// extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
// outside_intervals: none
// shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
// shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
// shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
// simple20: Primary in 20:10000-10100
Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
ActiveRegion region;
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");
region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
verifyReadMapping(region, "simple20");
}
@Test
public void testExtendedReadMapping() {
DummyActiveRegionWalker walker = new DummyActiveRegionWalker(
EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED));
// Contract: Each read has the Primary state in a single region (or none)
// This is the region of maximum overlap for the read (earlier if tied)
// Contract: Each read has the Non-Primary state in all other regions it overlaps
// Contract: Each read has the Extended state in regions where it only overlaps if the region is extended
// simple: Primary in 1:1-999
// overlap_equal: Primary in 1:1-999
// overlap_unequal: Primary in 1:1-999
// boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
// boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
// boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999
// boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999
// extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999
// outside_intervals: none
// shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927
// shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
// shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927
// simple20: Primary in 20:10000-10100
Map<GenomeLoc, ActiveRegion> activeRegions = getActiveRegions(walker, intervals);
ActiveRegion region;
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999));
verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999));
verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999));
verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384));
verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");
region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927));
verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal");
region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100));
verifyReadMapping(region, "simple20");
}
@Test
public void testUnmappedReads() {
// TODO
}
private void verifyReadMapping(ActiveRegion region, String... reads) {
Collection<String> wantReads = new ArrayList<String>(Arrays.asList(reads));
for (SAMRecord read : region.getReads()) {
String regionReadName = read.getReadName();
Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " assigned to active region " + region);
wantReads.remove(regionReadName);
}
Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region);
}
private Map<GenomeLoc, ActiveRegion> getActiveRegions(DummyActiveRegionWalker walker, List<GenomeLoc> intervals) {
for (LocusShardDataProvider dataProvider : createDataProviders(walker, intervals, testBAM))
t.traverse(walker, dataProvider, 0);
t.endTraversal(walker, 0);
return walker.mappedActiveRegions;
}
private Collection<GenomeLoc> toSingleBaseLocs(GenomeLoc interval) {
List<GenomeLoc> bases = new ArrayList<GenomeLoc>();
if (interval.size() == 1)
bases.add(interval);
else {
for (int location = interval.getStart(); location <= interval.getStop(); location++)
bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location));
}
return bases;
}
private Collection<GenomeLoc> toSingleBaseLocs(List<GenomeLoc> intervals) {
Set<GenomeLoc> bases = new TreeSet<GenomeLoc>(); // for sorting and uniqueness
for (GenomeLoc interval : intervals)
bases.addAll(toSingleBaseLocs(interval));
return bases;
}
private void verifyEqualIntervals(List<GenomeLoc> aIntervals, List<GenomeLoc> bIntervals) {
Collection<GenomeLoc> aBases = toSingleBaseLocs(aIntervals);
Collection<GenomeLoc> bBases = toSingleBaseLocs(bIntervals);
Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size());
Iterator<GenomeLoc> aIter = aBases.iterator();
Iterator<GenomeLoc> bIter = bBases.iterator();
while (aIter.hasNext() && bIter.hasNext()) {
GenomeLoc aLoc = aIter.next();
GenomeLoc bLoc = bIter.next();
Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc);
}
}
// copied from LocusViewTemplate
protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) {
SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test");
header.setSequenceDictionary(dictionary);
header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
GATKSAMRecord record = new GATKSAMRecord(header);
record.setReadName(readName);
record.setReferenceIndex(dictionary.getSequenceIndex(contig));
record.setAlignmentStart(alignmentStart);
Cigar cigar = new Cigar();
int len = alignmentEnd - alignmentStart + 1;
cigar.add(new CigarElement(len, CigarOperator.M));
record.setCigar(cigar);
record.setReadString(new String(new char[len]).replace("\0", "A"));
record.setBaseQualities(new byte[len]);
return record;
}
private List<LocusShardDataProvider> createDataProviders(final Walker walker, List<GenomeLoc> intervals, String bamFile) {
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
engine.setGenomeLocParser(genomeLocParser);
t.initialize(engine, walker);
Collection<SAMReaderID> samFiles = new ArrayList<SAMReaderID>();
SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags());
samFiles.add(readerID);
SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser);
List<LocusShardDataProvider> providers = new ArrayList<LocusShardDataProvider>();
for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) {
for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) {
providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList<ReferenceOrderedDataSource>()));
}
}
return providers;
}
}