Fixed the add functionality of GenomeLocSortedSet.

* Fixed GenomeLocSortedSet.add() to ensure that overlapping intervals are detected and an exception is thrown.
 * Fixed GenomeLocSortedSet.addRegion() by merging it with the add() method; it now produces sorted inputs in all cases.
 * Cleaned up duplicated code throughout the engine to create a list of intervals over all contigs.
 * Added more unit tests for add functionality of GLSS.
 * Resolves GSA-775.
This commit is contained in:
Eric Banks 2013-02-28 13:46:49 -05:00
parent 6a77eee5f4
commit ebd5404124
8 changed files with 130 additions and 98 deletions

View File

@ -558,7 +558,7 @@ public class GenomeAnalysisEngine {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
if(intervals == null) if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
else else
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
} }
@ -566,7 +566,7 @@ public class GenomeAnalysisEngine {
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
if(intervals == null) if(intervals == null)
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
else else
return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer());
} }

View File

@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator; import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk; import net.sf.samtools.GATKChunk;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
@ -53,14 +52,15 @@ public class BAMScheduler implements Iterator<FilePointer> {
private PeekableIterator<GenomeLoc> locusIterator; private PeekableIterator<GenomeLoc> locusIterator;
private GenomeLoc currentLocus; private GenomeLoc currentLocus;
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) { /*
BAMScheduler scheduler = new BAMScheduler(dataSource); * Creates BAMScheduler using contigs from the given BAM data source.
GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser); *
for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) { * @param dataSource BAM source
// Match only on sequence name; trust startup validation to make sure all the sequences match. * @return non-null BAM scheduler
if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null) */
intervals.add(parser.createOverEntireContig(sequence.getSequenceName())); public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) {
} final BAMScheduler scheduler = new BAMScheduler(dataSource);
final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary());
scheduler.populateFilteredIntervalList(intervals); scheduler.populateFilteredIntervalList(intervals);
return scheduler; return scheduler;
} }

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.datasources.reads; package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator; import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
@ -51,8 +50,8 @@ public class IntervalSharder implements Iterator<FilePointer> {
return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser); return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
} }
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) { public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser); return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource),parser);
} }
public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {

View File

@ -1060,10 +1060,12 @@ public class SAMDataSource {
/** /**
* Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any
* read that has been assigned * read that has been assigned
* @return *
* @param shardBalancer shard balancer object
* @return non-null initialized version of the shard balancer
*/ */
public Iterable<Shard> createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) { public Iterable<Shard> createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) {
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser); shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser);
return shardBalancer; return shardBalancer;
} }

View File

@ -26,12 +26,10 @@
package org.broadinstitute.sting.gatk.datasources.reads.utilities; package org.broadinstitute.sting.gatk.datasources.reads.utilities;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler;
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer; import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder; import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
@ -98,14 +96,11 @@ public class FindLargeShards extends CommandLineProgram {
SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser);
// intervals // intervals
GenomeLocSortedSet intervalSortedSet = null; final GenomeLocSortedSet intervalSortedSet;
if(intervals != null) if ( intervals != null )
intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL); intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL);
else { else
intervalSortedSet = new GenomeLocSortedSet(genomeLocParser); intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary());
for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences())
intervalSortedSet.add(genomeLocParser.createGenomeLoc(entry.getSequenceName(),1,entry.getSequenceLength()));
}
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));

View File

@ -266,80 +266,96 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
} }
/** /**
* add a genomeLoc to the collection, simply inserting in order into the set * Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set.
* Throws an exception if the loc overlaps another loc already in the set.
* *
* TODO -- this may break the contract of the GenomeLocSortedSet if e overlaps or * @param loc the GenomeLoc to add
* TODO -- other locations already in the set. This code should check to see if
* TODO -- e is overlapping with its nearby elements and merge them or alternatively
* TODO -- throw an exception
* *
* @param e the GenomeLoc to add * @return true if the loc was added or false otherwise (if the loc was null)
*
* @return true
*/ */
public boolean add(GenomeLoc e) { public boolean add(final GenomeLoc loc) {
// assuming that the intervals coming arrive in order saves us a fair amount of time (and it's most likely true) return add(loc, false);
if (mArray.size() > 0 && e.isPast(mArray.get(mArray.size() - 1))) {
mArray.add(e);
return true;
} else {
final int loc = Collections.binarySearch(mArray,e);
if (loc >= 0) {
throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString());
} else {
mArray.add((loc+1) * -1,e);
return true;
}
}
} }
/** /**
* Adds a GenomeLoc to the collection, merging it if it overlaps another region. * Adds a GenomeLoc to the collection, merging it if it overlaps another region.
* If it's not overlapping then we add it in sorted order. * If it's not overlapping then we insert it at the correct sorted position into the set.
* *
* TODO TODO TODO -- this function is buggy and will not properly create a sorted * @param loc the GenomeLoc to add
* TODO TODO TODO -- genome loc is addRegion is called sequentially where the second
* TODO TODO TODO -- loc added is actually before the first. So when creating
* TODO TODO TODO -- sets make sure to sort the input locations first!
* *
* @param e the GenomeLoc to add to the collection * @return true if the loc was added or false otherwise (if the loc was null)
*
* @return true, if the GenomeLoc could be added to the collection
*/ */
public boolean addRegion(GenomeLoc e) { public boolean addRegion(final GenomeLoc loc) {
if (e == null) { return add(loc, true);
return false; }
}
// have we added it to the collection?
boolean haveAdded = false;
/** /**
* check if the specified element overlaps any current locations, if so * Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set.
* we should merge the two. *
*/ * @param loc the GenomeLoc to add
for (GenomeLoc g : mArray) { * @param mergeIfIntervalOverlaps if true we merge the interval if it overlaps another one already in the set, otherwise we throw an exception
if (g.contiguousP(e)) { *
GenomeLoc c = g.merge(e); * @return true if the loc was added or false otherwise (if the loc was null or an exact duplicate)
mArray.set(mArray.indexOf(g), c); */
haveAdded = true; public boolean add(final GenomeLoc loc, final boolean mergeIfIntervalOverlaps) {
} else if ((g.getContigIndex() == e.getContigIndex()) && if ( loc == null )
(e.getStart() < g.getStart()) && !haveAdded) { return false;
mArray.add(mArray.indexOf(g), e);
return true; // if we have no other intervals yet or if the new loc is past the last one in the list (which is usually the
} else if (haveAdded && ((e.getContigIndex() > e.getContigIndex()) || // case because locs are generally added in order) then be extra efficient and just add the loc to the end
(g.getContigIndex() == e.getContigIndex() && e.getStart() > g.getStart()))) { if ( mArray.size() == 0 || loc.isPast(mArray.get(mArray.size() - 1)) ) {
return true; return mArray.add(loc);
}
} }
/** we're at the end and we haven't found locations that should fall after it,
* so we'll put it at the end // find where in the list the new loc belongs
*/ final int binarySearchIndex = Collections.binarySearch(mArray,loc);
if (!haveAdded) {
mArray.add(e); // if it already exists in the list, return or throw an exception as needed
if ( binarySearchIndex >= 0 ) {
if ( mergeIfIntervalOverlaps )
return false;
throw new IllegalArgumentException("GenomeLocSortedSet already contains the GenomeLoc " + loc);
} }
// if it overlaps a loc already in the list merge or throw an exception as needed
final int insertionIndex = -1 * (binarySearchIndex + 1);
if ( ! mergeOverlappingIntervalsFromAdd(loc, insertionIndex, !mergeIfIntervalOverlaps) ) {
// it does not overlap any current intervals, so add it to the set
mArray.add(insertionIndex, loc);
}
return true; return true;
} }
/*
* If the provided GenomeLoc overlaps another already in the set, merge them (or throw an exception if requested)
*
* @param loc the GenomeLoc to add
* @param insertionIndex the index in the sorted set to add the new loc
* @param throwExceptionIfOverlapping if true we throw an exception if there's overlap, otherwise we merge them
*
* @return true if the loc was added or false otherwise
*/
private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final boolean throwExceptionIfOverlapping) {
// try merging with the previous index
if ( insertionIndex != 0 && loc.overlapsP(mArray.get(insertionIndex - 1)) ) {
if ( throwExceptionIfOverlapping )
throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex - 1).toString(), loc.toString()));
mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc));
return true;
}
// try merging with the following index
if ( insertionIndex < mArray.size() && loc.overlapsP(mArray.get(insertionIndex)) ) {
if ( throwExceptionIfOverlapping )
throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex).toString(), loc.toString()));
mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc));
return true;
}
return false;
}
public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) { public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) {
LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>(); LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>(); Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
@ -401,11 +417,11 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
* *
* @return the GenomeLocSet of all references sequences as GenomeLoc's * @return the GenomeLocSet of all references sequences as GenomeLoc's
*/ */
public static GenomeLocSortedSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) { public static GenomeLocSortedSet createSetFromSequenceDictionary(final SAMSequenceDictionary dict) {
GenomeLocParser parser = new GenomeLocParser(dict); final GenomeLocParser parser = new GenomeLocParser(dict);
GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser); final GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser);
for (SAMSequenceRecord record : dict.getSequences()) { for ( final SAMSequenceRecord sequence : dict.getSequences() ) {
returnSortedSet.add(parser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength())); returnSortedSet.add(parser.createOverEntireContig(sequence.getSequenceName()));
} }
return returnSortedSet; return returnSortedSet;
} }

View File

@ -111,7 +111,7 @@ public class SAMDataSourceUnitTest extends BaseTest {
new ArrayList<ReadFilter>(), new ArrayList<ReadFilter>(),
false); false);
Iterable<Shard> strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer()); Iterable<Shard> strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer());
int count = 0; int count = 0;
try { try {

View File

@ -117,11 +117,31 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80); GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80);
mSortedSet.addRegion(f); mSortedSet.addRegion(f);
assertTrue(mSortedSet.size() == 1); assertTrue(mSortedSet.size() == 1);
} }
@Test
public void addRegionsOutOfOrder() {
final String contigTwoName = header.getSequenceDictionary().getSequence(2).getSequenceName();
assertTrue(mSortedSet.size() == 0);
GenomeLoc g = genomeLocParser.createGenomeLoc(contigTwoName, 1, 50);
mSortedSet.add(g);
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80);
mSortedSet.addRegion(f);
assertTrue(mSortedSet.size() == 2);
assertTrue(mSortedSet.toList().get(0).getContig().equals(contigOneName));
assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName));
}
@Test(expectedExceptions=ReviewedStingException.class) @Test(expectedExceptions = IllegalArgumentException.class)
public void addThrowsException() {
assertTrue(mSortedSet.size() == 0);
GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 1, 50);
mSortedSet.add(g);
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80);
mSortedSet.add(f);
}
@Test(expectedExceptions=IllegalArgumentException.class)
public void testAddDuplicate() { public void testAddDuplicate() {
assertTrue(mSortedSet.size() == 0); assertTrue(mSortedSet.size() == 0);
GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 0, 0); GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 0, 0);
@ -141,9 +161,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
assertTrue(mSortedSet.size() == 1); assertTrue(mSortedSet.size() == 1);
Iterator<GenomeLoc> iter = mSortedSet.iterator(); Iterator<GenomeLoc> iter = mSortedSet.iterator();
GenomeLoc loc = iter.next(); GenomeLoc loc = iter.next();
assertTrue(loc.getStart() == 0); assertEquals(loc.getStart(), 0);
assertTrue(loc.getStop() == 100); assertEquals(loc.getStop(), 100);
assertTrue(loc.getContigIndex() == 1); assertEquals(loc.getContigIndex(), 1);
} }
@Test @Test
@ -192,9 +212,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
assertTrue(mSortedSet.size() == 1); assertTrue(mSortedSet.size() == 1);
Iterator<GenomeLoc> iter = mSortedSet.iterator(); Iterator<GenomeLoc> iter = mSortedSet.iterator();
GenomeLoc loc = iter.next(); GenomeLoc loc = iter.next();
assertTrue(loc.getStart() == 0); assertEquals(loc.getStart(), 0);
assertTrue(loc.getStop() == 100); assertEquals(loc.getStop(), 100);
assertTrue(loc.getContigIndex() == 1); assertEquals(loc.getContigIndex(), 1);
} }
@Test @Test