Fixed the add functionality of GenomeLocSortedSet.
* Fixed GenomeLocSortedSet.add() to ensure that overlapping intervals are detected and an exception is thrown. * Fixed GenomeLocSortedSet.addRegion() by merging it with the add() method; it now produces sorted inputs in all cases. * Cleaned up duplicated code throughout the engine to create a list of intervals over all contigs. * Added more unit tests for add functionality of GLSS. * Resolves GSA-775.
This commit is contained in:
parent
6a77eee5f4
commit
ebd5404124
|
|
@ -558,7 +558,7 @@ public class GenomeAnalysisEngine {
|
|||
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
||||
if(intervals == null)
|
||||
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
|
||||
return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
|
||||
else
|
||||
return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer());
|
||||
}
|
||||
|
|
@ -566,7 +566,7 @@ public class GenomeAnalysisEngine {
|
|||
if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate)
|
||||
throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately.");
|
||||
if(intervals == null)
|
||||
return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer());
|
||||
return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer());
|
||||
else
|
||||
return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
|
@ -53,14 +52,15 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
private PeekableIterator<GenomeLoc> locusIterator;
|
||||
private GenomeLoc currentLocus;
|
||||
|
||||
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary referenceSequenceDictionary, final GenomeLocParser parser) {
|
||||
BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||
GenomeLocSortedSet intervals = new GenomeLocSortedSet(parser);
|
||||
for(SAMSequenceRecord sequence: referenceSequenceDictionary.getSequences()) {
|
||||
// Match only on sequence name; trust startup validation to make sure all the sequences match.
|
||||
if(dataSource.getHeader().getSequenceDictionary().getSequence(sequence.getSequenceName()) != null)
|
||||
intervals.add(parser.createOverEntireContig(sequence.getSequenceName()));
|
||||
}
|
||||
/*
|
||||
* Creates BAMScheduler using contigs from the given BAM data source.
|
||||
*
|
||||
* @param dataSource BAM source
|
||||
* @return non-null BAM scheduler
|
||||
*/
|
||||
public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) {
|
||||
final BAMScheduler scheduler = new BAMScheduler(dataSource);
|
||||
final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary());
|
||||
scheduler.populateFilteredIntervalList(intervals);
|
||||
return scheduler;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
|
|
@ -51,8 +50,8 @@ public class IntervalSharder implements Iterator<FilePointer> {
|
|||
return new IntervalSharder(BAMScheduler.createOverAllReads(dataSource,parser),parser);
|
||||
}
|
||||
|
||||
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final SAMSequenceDictionary sequenceDictionary, final GenomeLocParser parser) {
|
||||
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource,sequenceDictionary,parser),parser);
|
||||
public static IntervalSharder shardOverMappedReads(final SAMDataSource dataSource, final GenomeLocParser parser) {
|
||||
return new IntervalSharder(BAMScheduler.createOverMappedReads(dataSource),parser);
|
||||
}
|
||||
|
||||
public static IntervalSharder shardOverIntervals(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
|
|
|
|||
|
|
@ -1060,10 +1060,12 @@ public class SAMDataSource {
|
|||
/**
|
||||
* Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any
|
||||
* read that has been assigned
|
||||
* @return
|
||||
*
|
||||
* @param shardBalancer shard balancer object
|
||||
* @return non-null initialized version of the shard balancer
|
||||
*/
|
||||
public Iterable<Shard> createShardIteratorOverMappedReads(final SAMSequenceDictionary sequenceDictionary, final ShardBalancer shardBalancer) {
|
||||
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,sequenceDictionary,genomeLocParser),genomeLocParser);
|
||||
public Iterable<Shard> createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) {
|
||||
shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser);
|
||||
return shardBalancer;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -26,12 +26,10 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads.utilities;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.BAMScheduler;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.IntervalSharder;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
|
|
@ -98,14 +96,11 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser);
|
||||
|
||||
// intervals
|
||||
GenomeLocSortedSet intervalSortedSet = null;
|
||||
if(intervals != null)
|
||||
final GenomeLocSortedSet intervalSortedSet;
|
||||
if ( intervals != null )
|
||||
intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL);
|
||||
else {
|
||||
intervalSortedSet = new GenomeLocSortedSet(genomeLocParser);
|
||||
for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences())
|
||||
intervalSortedSet.add(genomeLocParser.createGenomeLoc(entry.getSequenceName(),1,entry.getSequenceLength()));
|
||||
}
|
||||
else
|
||||
intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary());
|
||||
|
||||
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||
|
||||
|
|
|
|||
|
|
@ -266,80 +266,96 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
}
|
||||
|
||||
/**
|
||||
* add a genomeLoc to the collection, simply inserting in order into the set
|
||||
* Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set.
|
||||
* Throws an exception if the loc overlaps another loc already in the set.
|
||||
*
|
||||
* TODO -- this may break the contract of the GenomeLocSortedSet if e overlaps or
|
||||
* TODO -- other locations already in the set. This code should check to see if
|
||||
* TODO -- e is overlapping with its nearby elements and merge them or alternatively
|
||||
* TODO -- throw an exception
|
||||
* @param loc the GenomeLoc to add
|
||||
*
|
||||
* @param e the GenomeLoc to add
|
||||
*
|
||||
* @return true
|
||||
* @return true if the loc was added or false otherwise (if the loc was null)
|
||||
*/
|
||||
public boolean add(GenomeLoc e) {
|
||||
// assuming that the intervals coming arrive in order saves us a fair amount of time (and it's most likely true)
|
||||
if (mArray.size() > 0 && e.isPast(mArray.get(mArray.size() - 1))) {
|
||||
mArray.add(e);
|
||||
return true;
|
||||
} else {
|
||||
final int loc = Collections.binarySearch(mArray,e);
|
||||
if (loc >= 0) {
|
||||
throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString());
|
||||
} else {
|
||||
mArray.add((loc+1) * -1,e);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
public boolean add(final GenomeLoc loc) {
|
||||
return add(loc, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a GenomeLoc to the collection, merging it if it overlaps another region.
|
||||
* If it's not overlapping then we add it in sorted order.
|
||||
* If it's not overlapping then we insert it at the correct sorted position into the set.
|
||||
*
|
||||
* TODO TODO TODO -- this function is buggy and will not properly create a sorted
|
||||
* TODO TODO TODO -- genome loc is addRegion is called sequentially where the second
|
||||
* TODO TODO TODO -- loc added is actually before the first. So when creating
|
||||
* TODO TODO TODO -- sets make sure to sort the input locations first!
|
||||
* @param loc the GenomeLoc to add
|
||||
*
|
||||
* @param e the GenomeLoc to add to the collection
|
||||
*
|
||||
* @return true, if the GenomeLoc could be added to the collection
|
||||
* @return true if the loc was added or false otherwise (if the loc was null)
|
||||
*/
|
||||
public boolean addRegion(GenomeLoc e) {
|
||||
if (e == null) {
|
||||
return false;
|
||||
}
|
||||
// have we added it to the collection?
|
||||
boolean haveAdded = false;
|
||||
public boolean addRegion(final GenomeLoc loc) {
|
||||
return add(loc, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* check if the specified element overlaps any current locations, if so
|
||||
* we should merge the two.
|
||||
*/
|
||||
for (GenomeLoc g : mArray) {
|
||||
if (g.contiguousP(e)) {
|
||||
GenomeLoc c = g.merge(e);
|
||||
mArray.set(mArray.indexOf(g), c);
|
||||
haveAdded = true;
|
||||
} else if ((g.getContigIndex() == e.getContigIndex()) &&
|
||||
(e.getStart() < g.getStart()) && !haveAdded) {
|
||||
mArray.add(mArray.indexOf(g), e);
|
||||
return true;
|
||||
} else if (haveAdded && ((e.getContigIndex() > e.getContigIndex()) ||
|
||||
(g.getContigIndex() == e.getContigIndex() && e.getStart() > g.getStart()))) {
|
||||
return true;
|
||||
}
|
||||
/**
|
||||
* Adds a GenomeLoc to the collection, inserting at the correct sorted position into the set.
|
||||
*
|
||||
* @param loc the GenomeLoc to add
|
||||
* @param mergeIfIntervalOverlaps if true we merge the interval if it overlaps another one already in the set, otherwise we throw an exception
|
||||
*
|
||||
* @return true if the loc was added or false otherwise (if the loc was null or an exact duplicate)
|
||||
*/
|
||||
public boolean add(final GenomeLoc loc, final boolean mergeIfIntervalOverlaps) {
|
||||
if ( loc == null )
|
||||
return false;
|
||||
|
||||
// if we have no other intervals yet or if the new loc is past the last one in the list (which is usually the
|
||||
// case because locs are generally added in order) then be extra efficient and just add the loc to the end
|
||||
if ( mArray.size() == 0 || loc.isPast(mArray.get(mArray.size() - 1)) ) {
|
||||
return mArray.add(loc);
|
||||
}
|
||||
/** we're at the end and we haven't found locations that should fall after it,
|
||||
* so we'll put it at the end
|
||||
*/
|
||||
if (!haveAdded) {
|
||||
mArray.add(e);
|
||||
|
||||
// find where in the list the new loc belongs
|
||||
final int binarySearchIndex = Collections.binarySearch(mArray,loc);
|
||||
|
||||
// if it already exists in the list, return or throw an exception as needed
|
||||
if ( binarySearchIndex >= 0 ) {
|
||||
if ( mergeIfIntervalOverlaps )
|
||||
return false;
|
||||
throw new IllegalArgumentException("GenomeLocSortedSet already contains the GenomeLoc " + loc);
|
||||
}
|
||||
|
||||
// if it overlaps a loc already in the list merge or throw an exception as needed
|
||||
final int insertionIndex = -1 * (binarySearchIndex + 1);
|
||||
if ( ! mergeOverlappingIntervalsFromAdd(loc, insertionIndex, !mergeIfIntervalOverlaps) ) {
|
||||
// it does not overlap any current intervals, so add it to the set
|
||||
mArray.add(insertionIndex, loc);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the provided GenomeLoc overlaps another already in the set, merge them (or throw an exception if requested)
|
||||
*
|
||||
* @param loc the GenomeLoc to add
|
||||
* @param insertionIndex the index in the sorted set to add the new loc
|
||||
* @param throwExceptionIfOverlapping if true we throw an exception if there's overlap, otherwise we merge them
|
||||
*
|
||||
* @return true if the loc was added or false otherwise
|
||||
*/
|
||||
private boolean mergeOverlappingIntervalsFromAdd(final GenomeLoc loc, final int insertionIndex, final boolean throwExceptionIfOverlapping) {
|
||||
// try merging with the previous index
|
||||
if ( insertionIndex != 0 && loc.overlapsP(mArray.get(insertionIndex - 1)) ) {
|
||||
if ( throwExceptionIfOverlapping )
|
||||
throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex - 1).toString(), loc.toString()));
|
||||
mArray.set(insertionIndex - 1, mArray.get(insertionIndex - 1).merge(loc));
|
||||
return true;
|
||||
}
|
||||
|
||||
// try merging with the following index
|
||||
if ( insertionIndex < mArray.size() && loc.overlapsP(mArray.get(insertionIndex)) ) {
|
||||
if ( throwExceptionIfOverlapping )
|
||||
throw new IllegalArgumentException(String.format("GenomeLocSortedSet contains a GenomeLoc (%s) that overlaps with the provided one (%s)", mArray.get(insertionIndex).toString(), loc.toString()));
|
||||
mArray.set(insertionIndex, mArray.get(insertionIndex).merge(loc));
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public GenomeLocSortedSet subtractRegions(GenomeLocSortedSet toRemoveSet) {
|
||||
LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
|
||||
Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
|
||||
|
|
@ -401,11 +417,11 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
|||
*
|
||||
* @return the GenomeLocSet of all references sequences as GenomeLoc's
|
||||
*/
|
||||
public static GenomeLocSortedSet createSetFromSequenceDictionary(SAMSequenceDictionary dict) {
|
||||
GenomeLocParser parser = new GenomeLocParser(dict);
|
||||
GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser);
|
||||
for (SAMSequenceRecord record : dict.getSequences()) {
|
||||
returnSortedSet.add(parser.createGenomeLoc(record.getSequenceName(), 1, record.getSequenceLength()));
|
||||
public static GenomeLocSortedSet createSetFromSequenceDictionary(final SAMSequenceDictionary dict) {
|
||||
final GenomeLocParser parser = new GenomeLocParser(dict);
|
||||
final GenomeLocSortedSet returnSortedSet = new GenomeLocSortedSet(parser);
|
||||
for ( final SAMSequenceRecord sequence : dict.getSequences() ) {
|
||||
returnSortedSet.add(parser.createOverEntireContig(sequence.getSequenceName()));
|
||||
}
|
||||
return returnSortedSet;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -111,7 +111,7 @@ public class SAMDataSourceUnitTest extends BaseTest {
|
|||
new ArrayList<ReadFilter>(),
|
||||
false);
|
||||
|
||||
Iterable<Shard> strat = data.createShardIteratorOverMappedReads(seq.getSequenceDictionary(),new LocusShardBalancer());
|
||||
Iterable<Shard> strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer());
|
||||
int count = 0;
|
||||
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -117,11 +117,31 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
|
|||
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80);
|
||||
mSortedSet.addRegion(f);
|
||||
assertTrue(mSortedSet.size() == 1);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void addRegionsOutOfOrder() {
|
||||
final String contigTwoName = header.getSequenceDictionary().getSequence(2).getSequenceName();
|
||||
assertTrue(mSortedSet.size() == 0);
|
||||
GenomeLoc g = genomeLocParser.createGenomeLoc(contigTwoName, 1, 50);
|
||||
mSortedSet.add(g);
|
||||
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80);
|
||||
mSortedSet.addRegion(f);
|
||||
assertTrue(mSortedSet.size() == 2);
|
||||
assertTrue(mSortedSet.toList().get(0).getContig().equals(contigOneName));
|
||||
assertTrue(mSortedSet.toList().get(1).getContig().equals(contigTwoName));
|
||||
}
|
||||
|
||||
@Test(expectedExceptions=ReviewedStingException.class)
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void addThrowsException() {
|
||||
assertTrue(mSortedSet.size() == 0);
|
||||
GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 1, 50);
|
||||
mSortedSet.add(g);
|
||||
GenomeLoc f = genomeLocParser.createGenomeLoc(contigOneName, 30, 80);
|
||||
mSortedSet.add(f);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions=IllegalArgumentException.class)
|
||||
public void testAddDuplicate() {
|
||||
assertTrue(mSortedSet.size() == 0);
|
||||
GenomeLoc g = genomeLocParser.createGenomeLoc(contigOneName, 0, 0);
|
||||
|
|
@ -141,9 +161,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
|
|||
assertTrue(mSortedSet.size() == 1);
|
||||
Iterator<GenomeLoc> iter = mSortedSet.iterator();
|
||||
GenomeLoc loc = iter.next();
|
||||
assertTrue(loc.getStart() == 0);
|
||||
assertTrue(loc.getStop() == 100);
|
||||
assertTrue(loc.getContigIndex() == 1);
|
||||
assertEquals(loc.getStart(), 0);
|
||||
assertEquals(loc.getStop(), 100);
|
||||
assertEquals(loc.getContigIndex(), 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -192,9 +212,9 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
|
|||
assertTrue(mSortedSet.size() == 1);
|
||||
Iterator<GenomeLoc> iter = mSortedSet.iterator();
|
||||
GenomeLoc loc = iter.next();
|
||||
assertTrue(loc.getStart() == 0);
|
||||
assertTrue(loc.getStop() == 100);
|
||||
assertTrue(loc.getContigIndex() == 1);
|
||||
assertEquals(loc.getStart(), 0);
|
||||
assertEquals(loc.getStop(), 100);
|
||||
assertEquals(loc.getContigIndex(), 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
Loading…
Reference in New Issue