Fixed the add functionality of GenomeLocSortedSet.

* Fixed GenomeLocSortedSet.add() to ensure that overlapping intervals are detected and an exception is thrown.
 * Fixed GenomeLocSortedSet.addRegion() by merging it with the add() method; it now produces sorted inputs in all cases.
 * Cleaned up duplicated code throughout the engine to create a list of intervals over all contigs.
 * Added more unit tests for add functionality of GLSS.
 * Resolves GSA-775.
This commit is contained in:
Eric Banks 2013-02-28 13:46:49 -05:00
parent 6a77eee5f4
commit ebd5404124
8 changed files with 130 additions and 98 deletions

View File

@ -558,7 +558,7 @@ public class GenomeAnalysisEngine {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
}
@ -566,7 +566,7 @@ public class GenomeAnalysisEngine {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
else
return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer());
}

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -53,14 +52,15 @@ public class BAMScheduler implements Iterator<FilePointer> {
private PeekableIterator<GenomeLoc> locusIterator;
private GenomeLoc currentLocus;
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) {
BAMScheduler scheduler = new BAMScheduler(dataSource);
GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser);
for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) {
// Match only on sequence name; trust startup validation to make sure all the sequences match.
if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null)
intervals.add(parser.createOverEntireContig(sequence.getSequenceName()));
}
/*
* Creates BAMScheduler using contigs from the given BAM data source.
*
* @param dataSource BAM source
* @return non-null BAM scheduler
*/
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) {
final BAMScheduler scheduler = new BAMScheduler(dataSource);
final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary());
scheduler.populateFilteredIntervalList(intervals);
return scheduler;
}

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@ -51,8 +50,8 @@ public class IntervalSharder implements Iterator<FilePointer> {
return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
}
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) {
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser);
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource),parser);
}
public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {

View File

@ -1060,10 +1060,12 @@ public class SAMDataSource {
/**
* Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any
* read that has been assigned
* @return
*
* @param shardBalancer shard balancer object
* @return non-null initialized version of the shard balancer
*/
public Iterable<Shard> createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) {
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser);
public Iterable<Shard> createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) {
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser);
return shardBalancer;
}

View File

@ -26,12 +26,10 @@
package org.broadinstitute.sting.gatk.datasources.reads.utilities;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
@ -98,14 +96,11 @@ public class FindLargeShards extends CommandLineProgram {
SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser);
// intervals
GenomeLocSortedSet intervalSortedSet = null;
if(intervals != null)
final GenomeLocSortedSet intervalSortedSet;
if ( intervals != null )
intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL);
else {
intervalSortedSet = new GenomeLocSortedSet(genomeLocParser);
for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences())
intervalSortedSet.add(genomeLocParser.createGenomeLoc(entry.getSequenceName(),1,entry.getSequenceLength()));
}
else
intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary());
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));

View File

@ -266,80 +266,96 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
}
/**
* add a genomeLoc to the collection, simply inserting in order into the set
* Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set.
* Throws an exception if the loc overlaps another loc already in the set.
*
* TODO -- this may break the contract of the GenomeLocSortedSet if e overlaps or
* TODO -- other locations already in the set. This code should check to see if
* TODO -- e is overlapping with its nearby elements and merge them or alternatively
* TODO -- throw an exception
* @param loc the GenomeLoc to add
*
* @param e the GenomeLoc to add
*
* @return true
* @return true if the loc was added or false otherwise (if the loc was null)
*/
public boolean add(GenomeLoc e) {
// assuming that the intervals coming arrive in order saves us a fair amount of time (and it's most likely true)
if (mArray.size() > 0 && e.isPast(mArray.get(mArray.size() - 1))) {
mArray.add(e);
return true;
} else {
final int loc = Collections.binarySearch(mArray,e);
if (loc >= 0) {
throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString());
} else {
mArray.add((loc+1) * -1,e);
return true;
}
}
public boolean add(final GenomeLoc loc) {
return add(loc, false);
}
/**
* Adds a GenomeLoc to the collection, merging it if it overlaps another region.
* If it's not overlapping then we add it in sorted order.
* If it's not overlapping then we insert it at the correct sorted position into the set.
*
* TODO TODO TODO -- this function is buggy and will not properly create a sorted
* TODO TODO TODO -- genome loc is addRegion is called sequentially where the second
* TODO TODO TODO -- loc added is actually before the first. So when creating
* TODO TODO TODO -- sets make sure to sort the input locations first!
* @param loc the GenomeLoc to add
*
* @param e the GenomeLoc to add to the collection
*
* @return true, if the GenomeLoc could be added to the collection
* @return true if the loc was added or false otherwise (if the loc was null)
*/
public boolean addRegion(GenomeLoc e) {
if (e == null) {
return false;
}
// have we added it to the collection?
boolean haveAdded = false;
public boolean addRegion(final GenomeLoc loc) {
return add(loc, true);
}
/**
* check if the specified element overlaps any current locations, if so
* we should merge the two.
*/
for (GenomeLoc g : mArray) {
if (g.contiguousP(e)) {
GenomeLoc c = g.merge(e);
mArray.set(mArray.indexOf(g), c);
haveAdded = true;
} else if ((g.getContigIndex() == e.getContigIndex()) &&
(e.getStart() < g.getStart()) && !haveAdded) {
mArray.add(mArray.indexOf(g), e);
return true;
} else if (haveAdded && ((e.getContigIndex() > e.getContigIndex()) ||
(g.getContigIndex() == e.getContigIndex() && e.getStart() > g.getStart()))) {
return true;
}
/**
* Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set.
*
* @param loc the GenomeLoc to add
* @param mergeIfIntervalOverlaps if true we merge the interval if it overlaps another one already in the set, otherwise we throw an exception
*
* @return true if the loc was added or false otherwise (if the loc was null or an exact duplicate)
*/
public boolean add(final GenomeLoc loc, final boolean mergeIfIntervalOverlaps) {
if ( loc == null )
return false;
// if we have no other intervals yet or if the new loc is past the last one in the list (which is usually the
// case because locs are generally added in order) then be extra efficient and just add the loc to the end
if ( mArray.size() == 0 || loc.isPast(mArray.get(mArray.size() - 1)) ) {
return mArray.add(loc);
}
/** we're at the end and we haven't found locations that should fall after it,
* so we'll put it at the end
*/
if (!haveAdded) {
mArray.add(e);
// find where in the list the new loc belongs
final int binarySearchIndex = Collections.binarySearch(mArray,loc);
// if it already exists in the list, return or throw an exception as needed
if ( binarySearchIndex >= 0 ) {
if ( mergeIfIntervalOverlaps )
return false;
throw new IllegalArgumentException("GenomeLocSortedSet already contains the GenomeLoc " + loc);
}
// if it overlaps a loc already in the list merge or throw an exception as needed
final int insertionIndex = -1 * (binarySearchIndex + 1);
if ( ! mergeOverlappingIntervalsFromAdd(loc, insertionIndex, !mergeIfIntervalOverlaps) ) {
// it does not overlap any current intervals, so add it to the set
mArray.add(insertionIndex, loc);
}
return true;
}
/*
* If the provided GenomeLoc overlaps another already in the set, merge them (or throw an exception if requested)
*
* @param loc the GenomeLoc to add
* @param insertionIndex the index in the sorted set to add the new loc
* @param throwExceptionIfOverlapping if true we throw an exception if there's overlap, otherwise we merge them
*
* @return true if the loc was added or false otherwise
*/
private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final boolean throwExceptionIfOverlapping) {
// try merging with the previous index
if ( insertionIndex != 0 && loc.overlapsP(mArray.get(insertionIndex - 1)) ) {
if ( throwExceptionIfOverlapping )
throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex - 1).toString(), loc.toString()));
mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc));
return true;
}
// try merging with the following index
if ( insertionIndex < mArray.size() && loc.overlapsP(mArray.get(insertionIndex)) ) {
if ( throwExceptionIfOverlapping )
throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex).toString(), loc.toString()));
mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc));
return true;
}
return false;
}
public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) {
LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
@ -401,11 +417,11 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
*
* @return the GenomeLocSet of all references sequences as GenomeLoc's
*/
public static GenomeLocSortedSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) {
GenomeLocParser parser = new GenomeLocParser(dict);
GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser);
for (SAMSequenceRecord record : dict.getSequences()) {
returnSortedSet.add(parser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength()));
public static GenomeLocSortedSet createSetFromSequenceDictionary(final SAMSequenceDictionary dict) {
final GenomeLocParser parser = new GenomeLocParser(dict);
final GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser);
for ( final SAMSequenceRecord sequence : dict.getSequences() ) {
returnSortedSet.add(parser.createOverEntireContig(sequence.getSequenceName()));
}
return returnSortedSet;
}

View File

@ -111,7 +111,7 @@ public class SAMDataSourceUnitTest extends BaseTest {
new ArrayList<ReadFilter>(),
false);
Iterable<Shard> strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer());
Iterable<Shard> strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer());
int count = 0;
try {

View File

@ -117,11 +117,31 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80);
mSortedSet.addRegion(f);
assertTrue(mSortedSet.size() == 1);
}
@Test
public void addRegionsOutOfOrder() {
final String contigTwoName = header.getSequenceDictionary().getSequence(2).getSequenceName();
assertTrue(mSortedSet.size() == 0);
GenomeLoc g = genomeLocParser.createGenomeLoc(contigTwoName, 1, 50);
mSortedSet.add(g);
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80);
mSortedSet.addRegion(f);
assertTrue(mSortedSet.size() == 2);
assertTrue(mSortedSet.toList().get(0).getContig().equals(contigOneName));
assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName));
}
@Test(expectedExceptions=ReviewedStingException.class)
@Test(expectedExceptions = IllegalArgumentException.class)
public void addThrowsException() {
assertTrue(mSortedSet.size() == 0);
GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 1, 50);
mSortedSet.add(g);
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80);
mSortedSet.add(f);
}
@Test(expectedExceptions=IllegalArgumentException.class)
public void testAddDuplicate() {
assertTrue(mSortedSet.size() == 0);
GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 0, 0);
@ -141,9 +161,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
assertTrue(mSortedSet.size() == 1);
Iterator<GenomeLoc> iter = mSortedSet.iterator();
GenomeLoc loc = iter.next();
assertTrue(loc.getStart() == 0);
assertTrue(loc.getStop() == 100);
assertTrue(loc.getContigIndex() == 1);
assertEquals(loc.getStart(), 0);
assertEquals(loc.getStop(), 100);
assertEquals(loc.getContigIndex(), 1);
}
@Test
@ -192,9 +212,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
assertTrue(mSortedSet.size() == 1);
Iterator<GenomeLoc> iter = mSortedSet.iterator();
GenomeLoc loc = iter.next();
assertTrue(loc.getStart() == 0);
assertTrue(loc.getStop() == 100);
assertTrue(loc.getContigIndex() == 1);
assertEquals(loc.getStart(), 0);
assertEquals(loc.getStop(), 100);
assertEquals(loc.getContigIndex(), 1);
}
@Test