Fix for GSA-649: GenomeLocSortedSet.overlaps is crazy slow. Also improved GenomeLocSortedSet.sizeBeforeLoc.

This commit is contained in:
Eric Banks 2012-11-27 01:07:00 -05:00
parent e27d677c13
commit 405f3c675d
3 changed files with 76 additions and 10 deletions

View File

@ -80,7 +80,6 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
}
// skip this location -- it's not part of our engine intervals
// TODO -- this is dangerously slow with current overlaps implementation : GSA-649 / GenomeLocSortedSet.overlaps is crazy slow
if ( outsideEngineIntervals(location) )
continue;

View File

@ -43,6 +43,9 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
// our private storage for the GenomeLoc's
private List<GenomeLoc> mArray = new ArrayList<GenomeLoc>();
// cache this to make overlap checking much more efficient
private int previousOverlapSearchIndex = -1;
/** default constructor */
public GenomeLocSortedSet(GenomeLocParser parser) {
this.genomeLocParser = parser;
@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
* Return the number of bps before loc in the sorted set
*
* @param loc the location before which we are counting bases
* @return
* @return the number of base pairs over all previous intervals
*/
public long sizeBeforeLoc(GenomeLoc loc) {
long s = 0;
@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
if ( e.isBefore(loc) )
s += e.size();
else if ( e.isPast(loc) )
; // don't do anything
break; // we are done
else // loc is inside of s
s += loc.getStart() - e.getStart();
}
@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
* Determine if the given loc overlaps any loc in the sorted set
*
* @param loc the location to test
* @return
* @return true if the location overlaps any loc
*/
public boolean overlaps(final GenomeLoc loc) {
for(final GenomeLoc e : mArray) {
if(e.overlapsP(loc)) {
return true;
}
// edge condition
if ( mArray.isEmpty() )
return false;
// use the cached version first
if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) )
return true;
// update the cached index
previousOverlapSearchIndex = Collections.binarySearch(mArray, loc);
// if it matches an interval exactly, we are done
if ( previousOverlapSearchIndex > 0 )
return true;
// check whether it overlaps the interval before or after the insertion point
previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2);
return overlapsAtOrImmediatelyAfterCachedIndex(loc, false);
}
private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) {
// check the cached entry
if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) )
return true;
// check the entry after the cached entry since we may have moved to it
boolean returnValue = false;
if ( previousOverlapSearchIndex < mArray.size() - 1 ) {
returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc);
if ( updateCachedIndex )
previousOverlapSearchIndex++;
}
return false;
return returnValue;
}
/**
@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
mArray.add(e);
return true;
} else {
int loc = Collections.binarySearch(mArray,e);
final int loc = Collections.binarySearch(mArray,e);
if (loc >= 0) {
throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString());
} else {

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.BeforeClass;
@ -117,6 +118,41 @@ public class GenomeLocSortedSetUnitTest extends BaseTest {
assertTrue(loc.getContigIndex() == 1);
}
@Test
public void overlap() {
    // seed the set with five two-base intervals: 10-11, 20-21, 30-31, 40-41 and 50-51
    for ( int start = 10; start <= 50; start += 10 )
        mSortedSet.add(genomeLocParser.createGenomeLoc(contigOneName, start, start + 1));

    // Each row is { query start, query stop, 1 if an overlap is expected / 0 if not }.
    // Rows are evaluated strictly in the order listed; the same query is repeated on
    // purpose so that it is issued both before and after other lookups.
    final int[][] queries = {
            // in and around the first interval
            {  9,  9, 0 },
            { 10, 10, 1 },
            { 11, 11, 1 },
            { 12, 12, 0 },
            // queries spanning intervals
            { 14, 20, 1 },
            { 11, 15, 1 },
            { 30, 40, 1 },
            { 51, 53, 1 },
            // a miss
            { 12, 19, 0 },
            // an exact match right after a miss
            { 40, 41, 1 },
            // around the beginning of the intervals
            {  5,  6, 0 },
            {  0, 10, 1 },
            // around the end of the intervals
            { 52, 53, 0 },
            { 51, 53, 1 },
            { 52, 53, 0 }
    };

    for ( final int[] query : queries ) {
        final boolean overlapFound =
                mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, query[0], query[1]));
        if ( query[2] == 1 )
            assertTrue(overlapFound);
        else
            assertFalse(overlapFound);
    }
}
@Test
public void mergingOverlappingAbove() {
GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50);