Made IntervalSharder respect the IntervalMergingRule specified on the command line

* This addresses PT Bug 69741902
* Added a required IntervalMergingRule (IMR) argument to FilePointer, BAMScheduler, IntervalSharder, and SAMDataSource
* This rule is used by FilePointer.combine and FilePointer.union
* Added unit and integration tests
This commit is contained in:
Phillip Dexheimer 2014-04-24 22:49:52 -04:00
parent 4ce09d8693
commit 7a2b70a10f
12 changed files with 125 additions and 44 deletions

View File

@ -888,7 +888,8 @@ public class GenomeAnalysisEngine {
argCollection.defaultBaseQualities, argCollection.defaultBaseQualities,
removeProgramRecords, removeProgramRecords,
keepReadsInLIBS, keepReadsInLIBS,
sampleRenameMap); sampleRenameMap,
argCollection.intervalArguments.intervalMerging);
} }
/** /**

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.*; import java.util.*;
@ -51,6 +52,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
private GenomeLocSortedSet loci; private GenomeLocSortedSet loci;
private PeekableIterator<GenomeLoc> locusIterator; private PeekableIterator<GenomeLoc> locusIterator;
private GenomeLoc currentLocus; private GenomeLoc currentLocus;
private IntervalMergingRule intervalMergingRule;
/* /*
* Creates BAMScheduler using contigs from the given BAM data source. * Creates BAMScheduler using contigs from the given BAM data source.
@ -59,27 +61,28 @@ public class BAMScheduler implements Iterator<FilePointer> {
* @return non-null BAM scheduler * @return non-null BAM scheduler
*/ */
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) { public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) {
final BAMScheduler scheduler = new BAMScheduler(dataSource); final BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL);
final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary()); final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary());
scheduler.populateFilteredIntervalList(intervals); scheduler.populateFilteredIntervalList(intervals);
return scheduler; return scheduler;
} }
public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
BAMScheduler scheduler = new BAMScheduler(dataSource); BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL);
scheduler.populateUnfilteredIntervalList(parser); scheduler.populateUnfilteredIntervalList(parser);
return scheduler; return scheduler;
} }
public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final IntervalMergingRule mergeRule, final GenomeLocSortedSet loci) {
BAMScheduler scheduler = new BAMScheduler(dataSource); BAMScheduler scheduler = new BAMScheduler(dataSource, mergeRule);
scheduler.populateFilteredIntervalList(loci); scheduler.populateFilteredIntervalList(loci);
return scheduler; return scheduler;
} }
private BAMScheduler(final SAMDataSource dataSource) { private BAMScheduler(final SAMDataSource dataSource, final IntervalMergingRule mergeRule) {
this.dataSource = dataSource; this.dataSource = dataSource;
this.intervalMergingRule = mergeRule;
for(SAMReaderID reader: dataSource.getReaderIDs()) { for(SAMReaderID reader: dataSource.getReaderIDs()) {
GATKBAMIndex index = dataSource.getIndex(reader); GATKBAMIndex index = dataSource.getIndex(reader);
if(index != null) if(index != null)
@ -124,7 +127,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
* @return A file pointer over the specified region. * @return A file pointer over the specified region.
*/ */
private FilePointer generatePointerOverEntireFileset() { private FilePointer generatePointerOverEntireFileset() {
FilePointer filePointer = new FilePointer(); FilePointer filePointer = new FilePointer(intervalMergingRule);
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is // This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
// the only FilePointer we will create. This allows us to have this FilePointer represent regions from // the only FilePointer we will create. This allows us to have this FilePointer represent regions from
@ -165,14 +168,14 @@ public class BAMScheduler implements Iterator<FilePointer> {
while(nextFilePointer == null && currentLocus != null) { while(nextFilePointer == null && currentLocus != null) {
// special case handling of the unmapped shard. // special case handling of the unmapped shard.
if(currentLocus == GenomeLoc.UNMAPPED) { if(currentLocus == GenomeLoc.UNMAPPED) {
nextFilePointer = new FilePointer(GenomeLoc.UNMAPPED); nextFilePointer = new FilePointer(intervalMergingRule, GenomeLoc.UNMAPPED);
for(SAMReaderID id: dataSource.getReaderIDs()) for(SAMReaderID id: dataSource.getReaderIDs())
nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin())); nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin()));
currentLocus = null; currentLocus = null;
continue; continue;
} }
nextFilePointer = new FilePointer(); nextFilePointer = new FilePointer(intervalMergingRule);
int coveredRegionStart = 1; int coveredRegionStart = 1;
int coveredRegionStop = Integer.MAX_VALUE; int coveredRegionStop = Integer.MAX_VALUE;

View File

@ -45,6 +45,7 @@ import java.util.*;
public class FilePointer { public class FilePointer {
protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>(); protected final SortedMap<SAMReaderID,SAMFileSpan> fileSpans = new TreeMap<SAMReaderID,SAMFileSpan>();
protected final List<GenomeLoc> locations = new ArrayList<GenomeLoc>(); protected final List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
protected final IntervalMergingRule intervalMergingRule;
/** /**
* Does this file pointer point into an unmapped region? * Does this file pointer point into an unmapped region?
@ -65,7 +66,8 @@ public class FilePointer {
private Integer contigIndex = null; private Integer contigIndex = null;
public FilePointer( List<GenomeLoc> locations ) { public FilePointer( final IntervalMergingRule mergeRule, final List<GenomeLoc> locations ) {
this.intervalMergingRule = mergeRule;
this.locations.addAll(locations); this.locations.addAll(locations);
this.isRegionUnmapped = checkUnmappedStatus(); this.isRegionUnmapped = checkUnmappedStatus();
@ -75,12 +77,12 @@ public class FilePointer {
} }
} }
public FilePointer( final GenomeLoc... locations ) { public FilePointer( final IntervalMergingRule mergeRule, final GenomeLoc... locations ) {
this(Arrays.asList(locations)); this(mergeRule, Arrays.asList(locations));
} }
public FilePointer( Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> locations ) { public FilePointer( final Map<SAMReaderID,SAMFileSpan> fileSpans, final IntervalMergingRule mergeRule, final List<GenomeLoc> locations ) {
this(locations); this(mergeRule, locations);
this.fileSpans.putAll(fileSpans); this.fileSpans.putAll(fileSpans);
} }
@ -152,6 +154,15 @@ public class FilePointer {
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
} }
/**
* Returns the IntervalMergingRule this FilePointer was constructed with.
* union() reads this value from each input FilePointer and rejects inputs whose rules differ.
*
* @return the IntervalMergingRule supplied at construction.
*         NOTE(review): the visible constructors store the rule without a null check, so
*         "never null" holds only if every caller passes a non-null rule -- confirm.
*/
public IntervalMergingRule getIntervalMergingRule() {
return intervalMergingRule;
}
/** /**
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this * ever visit during this GATK run? If this is set to true, the engine will expect to see only this
@ -277,12 +288,12 @@ public class FilePointer {
* @return A completely new file pointer that is the combination of the two. * @return A completely new file pointer that is the combination of the two.
*/ */
public FilePointer combine(final GenomeLocParser parser, final FilePointer other) { public FilePointer combine(final GenomeLocParser parser, final FilePointer other) {
FilePointer combined = new FilePointer(); FilePointer combined = new FilePointer(intervalMergingRule);
List<GenomeLoc> intervals = new ArrayList<GenomeLoc>(); List<GenomeLoc> intervals = new ArrayList<GenomeLoc>();
intervals.addAll(locations); intervals.addAll(locations);
intervals.addAll(other.locations); intervals.addAll(other.locations);
for(GenomeLoc interval: IntervalUtils.sortAndMergeIntervals(parser,intervals,IntervalMergingRule.ALL)) for(GenomeLoc interval: IntervalUtils.sortAndMergeIntervals(parser,intervals,intervalMergingRule))
combined.addLocation(interval); combined.addLocation(interval);
PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> thisIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(this.fileSpans.entrySet().iterator()); PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>> thisIterator = new PeekableIterator<Map.Entry<SAMReaderID,SAMFileSpan>>(this.fileSpans.entrySet().iterator());
@ -340,15 +351,18 @@ public class FilePointer {
*/ */
public static FilePointer union( List<FilePointer> filePointers, GenomeLocParser parser ) { public static FilePointer union( List<FilePointer> filePointers, GenomeLocParser parser ) {
if ( filePointers == null || filePointers.isEmpty() ) { if ( filePointers == null || filePointers.isEmpty() ) {
return new FilePointer(); return new FilePointer(IntervalMergingRule.ALL);
} }
Map<SAMReaderID, List<GATKChunk>> fileChunks = new HashMap<SAMReaderID, List<GATKChunk>>(); Map<SAMReaderID, List<GATKChunk>> fileChunks = new HashMap<SAMReaderID, List<GATKChunk>>();
List<GenomeLoc> locations = new ArrayList<GenomeLoc>(); List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
IntervalMergingRule mergeRule = filePointers.get(0).getIntervalMergingRule();
// First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections // First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections
for ( FilePointer filePointer : filePointers ) { for ( FilePointer filePointer : filePointers ) {
locations.addAll(filePointer.getLocations()); locations.addAll(filePointer.getLocations());
if (mergeRule != filePointer.getIntervalMergingRule())
throw new ReviewedStingException("All FilePointers in FilePointer.union() must have use the same IntervalMergeRule");
for ( Map.Entry<SAMReaderID, SAMFileSpan> fileSpanEntry : filePointer.getFileSpans().entrySet() ) { for ( Map.Entry<SAMReaderID, SAMFileSpan> fileSpanEntry : filePointer.getFileSpans().entrySet() ) {
GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue(); GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue();
@ -364,7 +378,7 @@ public class FilePointer {
// Now sort and merge the intervals // Now sort and merge the intervals
List<GenomeLoc> sortedMergedLocations = new ArrayList<GenomeLoc>(); List<GenomeLoc> sortedMergedLocations = new ArrayList<GenomeLoc>();
sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, IntervalMergingRule.ALL)); sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, mergeRule));
// For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing // For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing
// the sorted, merged union of the chunks for that file // the sorted, merged union of the chunks for that file
@ -375,7 +389,7 @@ public class FilePointer {
(new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan())); (new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan()));
} }
return new FilePointer(mergedFileSpans, sortedMergedLocations); return new FilePointer(mergedFileSpans, mergeRule, sortedMergedLocations);
} }
/** /**

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator; import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import java.util.Iterator; import java.util.Iterator;
@ -54,8 +55,8 @@ public class IntervalSharder implements Iterator<FilePointer> {
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource),parser); return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource),parser);
} }
public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci, final IntervalMergingRule intervalMergeRule) {
return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,loci),loci.getGenomeLocParser()); return new IntervalSharder(BAMScheduler.createOverIntervals(dataSource,intervalMergeRule,loci),loci.getGenomeLocParser());
} }
private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) { private IntervalSharder(final BAMScheduler scheduler, final GenomeLocParser parser) {

View File

@ -45,6 +45,7 @@ import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.baq.ReadTransformingIterator; import org.broadinstitute.sting.utils.baq.ReadTransformingIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
@ -154,6 +155,11 @@ public class SAMDataSource {
*/ */
private final ThreadAllocation threadAllocation; private final ThreadAllocation threadAllocation;
/**
* How are adjacent intervals merged by the sharder?
*/
private final IntervalMergingRule intervalMergingRule;
/** /**
* Static set of unsupported programs that create bam files. * Static set of unsupported programs that create bam files.
* The key is the PG record ID and the value is the name of the tool that created it * The key is the PG record ID and the value is the name of the tool that created it
@ -217,7 +223,8 @@ public class SAMDataSource {
(byte) -1, (byte) -1,
false, false,
false, false,
null); null,
IntervalMergingRule.ALL);
} }
/** /**
@ -236,6 +243,7 @@ public class SAMDataSource {
* @param keepReadsInLIBS should we keep a unique list of reads in LIBS? * @param keepReadsInLIBS should we keep a unique list of reads in LIBS?
* @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming. * @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming.
* Will be null if we're not doing sample renaming. * Will be null if we're not doing sample renaming.
* @param intervalMergingRule how are adjacent intervals merged by the sharder
*/ */
public SAMDataSource( public SAMDataSource(
Collection<SAMReaderID> samFiles, Collection<SAMReaderID> samFiles,
@ -253,10 +261,12 @@ public class SAMDataSource {
byte defaultBaseQualities, byte defaultBaseQualities,
boolean removeProgramRecords, boolean removeProgramRecords,
final boolean keepReadsInLIBS, final boolean keepReadsInLIBS,
final Map<String, String> sampleRenameMap) { final Map<String, String> sampleRenameMap,
final IntervalMergingRule intervalMergingRule) {
this.readMetrics = new ReadMetrics(); this.readMetrics = new ReadMetrics();
this.genomeLocParser = genomeLocParser; this.genomeLocParser = genomeLocParser;
this.intervalMergingRule = intervalMergingRule;
readerIDs = samFiles; readerIDs = samFiles;
@ -1182,7 +1192,7 @@ public class SAMDataSource {
public Iterable<Shard> createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { public Iterable<Shard> createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) {
if(intervals == null) if(intervals == null)
throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided."); throw new ReviewedStingException("Unable to create schedule from intervals; no intervals were provided.");
shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals),genomeLocParser); shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals,intervalMergingRule),genomeLocParser);
return shardBalancer; return shardBalancer;
} }
} }

View File

@ -104,7 +104,7 @@ public class FindLargeShards extends CommandLineProgram {
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL);
while(sharder.hasNext()) { while(sharder.hasNext()) {
FilePointer filePointer = sharder.next(); FilePointer filePointer = sharder.next();
@ -133,7 +133,7 @@ public class FindLargeShards extends CommandLineProgram {
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet); sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL);
while(sharder.hasNext()) { while(sharder.hasNext()) {
FilePointer filePointer = sharder.next(); FilePointer filePointer = sharder.next();

View File

@ -50,6 +50,7 @@ import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileState;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.sam.*; import org.broadinstitute.sting.utils.sam.*;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
@ -158,7 +159,7 @@ public class ReadMetricsUnitTest extends BaseTest {
new ValidationExclusion(), new ValidationExclusion(),
new ArrayList<ReadFilter>(), new ArrayList<ReadFilter>(),
new ArrayList<ReadTransformer>(), new ArrayList<ReadTransformer>(),
false, (byte)30, false, true, null); false, (byte)30, false, true, null, IntervalMergingRule.ALL);
engine.setReadsDataSource(dataSource); engine.setReadsDataSource(dataSource);
@ -193,7 +194,7 @@ public class ReadMetricsUnitTest extends BaseTest {
new ValidationExclusion(), new ValidationExclusion(),
new ArrayList<ReadFilter>(), new ArrayList<ReadFilter>(),
new ArrayList<ReadTransformer>(), new ArrayList<ReadTransformer>(),
false, (byte)30, false, true, null); false, (byte)30, false, true, null, IntervalMergingRule.ALL);
engine.setReadsDataSource(dataSource); engine.setReadsDataSource(dataSource);
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());
@ -234,7 +235,7 @@ public class ReadMetricsUnitTest extends BaseTest {
new ValidationExclusion(), new ValidationExclusion(),
new ArrayList<ReadFilter>(), new ArrayList<ReadFilter>(),
new ArrayList<ReadTransformer>(), new ArrayList<ReadTransformer>(),
false, (byte)30, false, true, null); false, (byte)30, false, true, null, IntervalMergingRule.ALL);
engine.setReadsDataSource(dataSource); engine.setReadsDataSource(dataSource);
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());
@ -281,7 +282,7 @@ public class ReadMetricsUnitTest extends BaseTest {
new ValidationExclusion(), new ValidationExclusion(),
filters, filters,
new ArrayList<ReadTransformer>(), new ArrayList<ReadTransformer>(),
false, (byte)30, false, true, null); false, (byte)30, false, true, null, IntervalMergingRule.ALL);
engine.setReadsDataSource(dataSource); engine.setReadsDataSource(dataSource);

View File

@ -31,6 +31,7 @@ import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
@ -82,7 +83,7 @@ public class ActiveRegionShardBalancerUnitTest extends BaseTest {
end = myEnd; end = myEnd;
final GenomeLoc loc = genomeLocParser.createGenomeLoc(record.getSequenceName(), i, myEnd); final GenomeLoc loc = genomeLocParser.createGenomeLoc(record.getSequenceName(), i, myEnd);
final Map<SAMReaderID, SAMFileSpan> fileSpans = Collections.emptyMap(); final Map<SAMReaderID, SAMFileSpan> fileSpans = Collections.emptyMap();
final FilePointer fp = new FilePointer(fileSpans, Collections.singletonList(loc)); final FilePointer fp = new FilePointer(fileSpans, IntervalMergingRule.ALL, Collections.singletonList(loc));
pointers.add(fp); pointers.add(fp);
} }
expectedLocs.add(Collections.singleton(genomeLocParser.createGenomeLoc(record.getSequenceName(), 0, end))); expectedLocs.add(Collections.singleton(genomeLocParser.createGenomeLoc(record.getSequenceName(), 0, end)));

View File

@ -32,6 +32,7 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test; import org.testng.annotations.Test;
@ -61,12 +62,26 @@ public class FilePointerUnitTest extends BaseTest {
@Test @Test
public void testFilePointerCombineDisjoint() { public void testFilePointerCombineDisjoint() {
FilePointer one = new FilePointer(genomeLocParser.createGenomeLoc("chr1",1,5)); FilePointer one = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5));
one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1)));
FilePointer two = new FilePointer(genomeLocParser.createGenomeLoc("chr1",6,10)); FilePointer two = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",6,10));
two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,2))); two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,2)));
FilePointer result = new FilePointer(genomeLocParser.createGenomeLoc("chr1",1,10)); FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,10));
result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2)));
Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect");
Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect");
//Now test that adjacent (but disjoint) intervals are properly handled with OVERLAPPING_ONLY
one = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,5));
one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1)));
two = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",6,10));
two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,2)));
result = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY,
genomeLocParser.createGenomeLoc("chr1",1,5),
genomeLocParser.createGenomeLoc("chr1",6,10));
result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2)));
Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect");
@ -75,12 +90,24 @@ public class FilePointerUnitTest extends BaseTest {
@Test @Test
public void testFilePointerCombineJoint() { public void testFilePointerCombineJoint() {
FilePointer one = new FilePointer(genomeLocParser.createGenomeLoc("chr1",1,5)); FilePointer one = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5));
one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2)));
FilePointer two = new FilePointer(genomeLocParser.createGenomeLoc("chr1",2,6)); FilePointer two = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",2,6));
two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,3))); two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,3)));
FilePointer result = new FilePointer(genomeLocParser.createGenomeLoc("chr1",1,6)); FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,6));
result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,3)));
Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect");
Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect");
//Repeat the tests for OVERLAPPING_ONLY
one = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,5));
one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2)));
two = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",2,6));
two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,3)));
result = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,6));
result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,3))); result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,3)));
Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect");
@ -89,12 +116,12 @@ public class FilePointerUnitTest extends BaseTest {
@Test @Test
public void testFilePointerCombineOneSided() { public void testFilePointerCombineOneSided() {
FilePointer filePointer = new FilePointer(genomeLocParser.createGenomeLoc("chr1",1,5)); FilePointer filePointer = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5));
filePointer.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); filePointer.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1)));
FilePointer empty = new FilePointer(genomeLocParser.createGenomeLoc("chr1",6,10)); FilePointer empty = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",6,10));
// Do not add file spans to empty result // Do not add file spans to empty result
FilePointer result = new FilePointer(genomeLocParser.createGenomeLoc("chr1",1,10)); FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,10));
result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1)));
Assert.assertEquals(filePointer.combine(genomeLocParser,empty),result,"Combination of two file pointers is incorrect"); Assert.assertEquals(filePointer.combine(genomeLocParser,empty),result,"Combination of two file pointers is incorrect");
Assert.assertEquals(empty.combine(genomeLocParser,filePointer),result,"Combination of two file pointers is incorrect"); Assert.assertEquals(empty.combine(genomeLocParser,filePointer),result,"Combination of two file pointers is incorrect");

View File

@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.testng.annotations.AfterMethod; import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test; import org.testng.annotations.Test;
@ -184,7 +185,7 @@ public class SAMDataSourceUnitTest extends BaseTest {
(byte) -1, (byte) -1,
removeProgramRecords, removeProgramRecords,
false, false,
null); null, IntervalMergingRule.ALL);
List<SAMProgramRecord> dontRemoveProgramRecords = data.getHeader().getProgramRecords(); List<SAMProgramRecord> dontRemoveProgramRecords = data.getHeader().getProgramRecords();
assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false"); assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false");
@ -205,7 +206,7 @@ public class SAMDataSourceUnitTest extends BaseTest {
(byte) -1, (byte) -1,
removeProgramRecords, removeProgramRecords,
false, false,
null); null, IntervalMergingRule.ALL);
List<SAMProgramRecord> doRemoveProgramRecords = data.getHeader().getProgramRecords(); List<SAMProgramRecord> doRemoveProgramRecords = data.getHeader().getProgramRecords();
assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true");
@ -247,6 +248,6 @@ public class SAMDataSourceUnitTest extends BaseTest {
(byte) -1, (byte) -1,
true, true,
false, false,
null); null, IntervalMergingRule.ALL);
} }
} }

View File

@ -481,7 +481,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest {
new ValidationExclusion(), new ValidationExclusion(),
new ArrayList<ReadFilter>(), new ArrayList<ReadFilter>(),
new ArrayList<ReadTransformer>(), new ArrayList<ReadTransformer>(),
false, (byte)30, false, true, null); false, (byte)30, false, true, null, IntervalMergingRule.ALL);
engine.setReadsDataSource(dataSource); engine.setReadsDataSource(dataSource);
final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); final Set<String> samples = SampleUtils.getSAMFileSamples(dataSource.getHeader());

View File

@ -120,6 +120,28 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest {
execute("testNoCoverageDueToFiltering",spec); execute("testNoCoverageDueToFiltering",spec);
} }
/**
 * Integration test that runs the walker over four back-to-back chr1 intervals
 * (each one starts on the base right after the previous one ends) with
 * {@code -im OVERLAPPING_ONLY}, then checks the primary output file and its six
 * per-sample companion files against their expected MD5 sums.
 */
@Test
public void testAdjacentIntervals() {
    final ArrayList<String> bamFiles = new ArrayList<String>(Arrays.asList(publicTestDir+"exampleBAM.bam"));
    final ArrayList<String> adjacentIntervals = new ArrayList<String>(
            Arrays.asList("chr1:1-999", "chr1:1000-65536", "chr1:65537-80000", "chr1:80001-81000"));
    final String commandLine = buildRootCmd(exampleFASTA, bamFiles, adjacentIntervals) + " -im OVERLAPPING_ONLY";

    final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 0, new ArrayList<String>());
    final File primaryOutput = WalkerTest.createTempFile("depthofcoverageadjinterval", ".tmp");
    spec.setOutputFileLocation(primaryOutput);
    spec.addAuxFile("84b95d62f53e28919d1b5286558a1cae", primaryOutput);

    // {expected MD5, suffix appended to the primary output path}; registered in this order.
    final String[][] companionOutputs = {
            {"e445d4529dd3e3caa486ab8f5ec63e49", ".sample_cumulative_coverage_counts"},
            {"b69c89ba8b0c393b735616c2bc3aea76", ".sample_cumulative_coverage_proportions"},
            {"788988dac6119a02de2c8d4dfb06b727", ".sample_interval_statistics"},
            {"3769ed40ab3ccd2ed94a9dc05cc2bc2f", ".sample_interval_summary"},
            {"1281605e022d7462fbbcd14de53d1ca3", ".sample_statistics"},
            {"4b41d6ff88aa2662697cb7e4b5346cb8", ".sample_summary"},
    };
    final String primaryOutputPath = primaryOutput.getAbsolutePath();
    for (final String[] md5AndSuffix : companionOutputs) {
        spec.addAuxFile(md5AndSuffix[0], createTempFileFromBase(primaryOutputPath + md5AndSuffix[1]));
    }

    execute("testAdjacentIntervals", spec);
}
public void testRefNHandling(boolean includeNs, final String md5) { public void testRefNHandling(boolean includeNs, final String md5) {
String command = "-R " + b37KGReference + " -L 20:26,319,565-26,319,575 -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -T DepthOfCoverage -baseCounts --omitIntervalStatistics --omitLocusTable --omitPerSampleStats -o %s"; String command = "-R " + b37KGReference + " -L 20:26,319,565-26,319,575 -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -T DepthOfCoverage -baseCounts --omitIntervalStatistics --omitLocusTable --omitPerSampleStats -o %s";
if ( includeNs ) command += " --includeRefNSites"; if ( includeNs ) command += " --includeRefNSites";