High performance, correct implementation of -XL exclusion lists. Enjoy.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2994 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-03-12 22:39:20 +00:00
parent 88a48821ea
commit e7eae9b61d
2 changed files with 217 additions and 65 deletions

View File

@ -161,7 +161,8 @@ public class GenomeAnalysisEngine {
// todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided
GenomeLocSortedSet excludeIntervals = null;
if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
excludeIntervals = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL));
List<GenomeLoc> rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL);
excludeIntervals = GenomeLocSortedSet.createSetFromList(rawExcludeIntervals);
}
if (argCollection.intervals != null && argCollection.intervalMerging.check()) {
@ -307,11 +308,12 @@ public class GenomeAnalysisEngine {
private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) {
logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size()));
for ( GenomeLoc exclude : toExclude )
toPrune.removeRegion(exclude);
logger.info(String.format("done pruning intervals == now have %d", toPrune.size()));
//for ( GenomeLoc exclude : toExclude )
// toPrune.removeRegion(exclude);
GenomeLocSortedSet x = toPrune.substractRegions(toExclude);
logger.info(String.format("done pruning intervals == now have %d", x.size()));
return toPrune;
return x;
}
/**

View File

@ -136,6 +136,112 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
return true;
}
public GenomeLocSortedSet substractRegions(GenomeLocSortedSet toRemoveSet) {
LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
Stack<GenomeLoc> toExclude = new Stack<GenomeLoc>();
// initialize the stacks
toProcess.addAll(mArray);
Collections.reverse(toProcess);
toExclude.addAll(toRemoveSet.mArray);
Collections.reverse(toExclude);
int i = 0;
while ( ! toProcess.empty() ) { // while there's still stuff to process
if ( toExclude.empty() ) {
good.addAll(toProcess); // no more excludes, all the processing stuff is good
break;
}
GenomeLoc p = toProcess.peek();
GenomeLoc e = toExclude.peek();
if ( p.overlapsP(e) ) {
toProcess.pop();
for ( GenomeLoc newP : subtractRegion(p, e) )
toProcess.push(newP);
} else if ( p.compareContigs(e) < 0 ) {
good.add(toProcess.pop()); // p is now good
} else if ( p.compareContigs(e) > 0 ) {
toExclude.pop(); // e can't effect anything
} else if ( p.getStop() < e.getStart() ) {
good.add(toProcess.pop()); // p stops before e starts, p is good
} else if ( e.getStop() < p.getStart() ) {
toExclude.pop(); // p starts after e stops, e is done
} else {
throw new StingException("BUG: unexpected condition: p=" + p + ", e=" + e);
}
if ( i++ % 10000 == 0 )
logger.debug("removeRegions operation: i = " + i);
}
return GenomeLocSortedSet.createSetFromList(good);
}
private static final List<GenomeLoc> EMPTY_LIST = new ArrayList<GenomeLoc>();
private List<GenomeLoc> subtractRegion(GenomeLoc g, GenomeLoc e) {
if (g.equals(e)) {
return EMPTY_LIST;
} else if (g.containsP(e)) {
List<GenomeLoc> l = new ArrayList<GenomeLoc>();
/**
* we have to create two new region, one for the before part, one for the after
* The old region:
* |----------------- old region (g) -------------|
* |----- to delete (e) ------|
*
* product (two new regions):
* |------| + |--------|
*
*/
GenomeLoc before = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
GenomeLoc after = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
if (after.getStop() - after.getStart() >= 0) {
l.add(after);
}
if (before.getStop() - before.getStart() >= 0) {
l.add(before);
}
return l;
} else if (e.containsP(g)) {
/**
* e completely contains g, delete g, but keep looking, there may be more regions
* i.e.:
* |--------------------- e --------------------|
* |--- g ---| |---- others ----|
*/
return EMPTY_LIST; // don't need to do anything
} else {
/**
* otherwise e overlaps some part of g
*
* figure out which region occurs first on the genome. I.e., is it:
* |------------- g ----------|
* |------------- e ----------|
*
* or:
* |------------- g ----------|
* |------------ e -----------|
*
*/
GenomeLoc n;
if (e.getStart() < g.getStart()) {
n = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
} else {
n = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
}
// replace g with the new region
return Arrays.asList(n);
}
}
/**
* remove an element from the set. Given a specific genome location, this function will
* remove all regions in the element set that overlap the specified region.
@ -145,84 +251,128 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
* @return true if a removal action was performed, false if the collection was unchanged.
*/
public boolean removeRegion(GenomeLoc e) {
// todo -- delete me
if (e == null) {
return false;
}
// sometimes we can't return right away, this holds the value for those cases
boolean returnValue = false;
/**
* check if the specified element overlaps any current locations, subtract the removed
* region and reinsert what is left.
*/
for (GenomeLoc g : mArray) {
if (g.overlapsP(e)) {
if (g.equals(e)) {
mArray.remove(mArray.indexOf(g));
return true;
} else if (g.containsP(e)) {
/**
* we have to create two new region, one for the before part, one for the after
* The old region:
* |----------------- old region (g) -------------|
* |----- to delete (e) ------|
*
* product (two new regions):
* |------| + |--------|
*
*/
GenomeLoc before = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
GenomeLoc after = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
int index = mArray.indexOf(g);
if (after.getStop() - after.getStart() >= 0) {
mArray.add(index, after);
}
if (before.getStop() - before.getStart() >= 0) {
mArray.add(index, before);
}
mArray.remove(mArray.indexOf(g));
return true;
} else if (e.containsP(g)) {
/**
* e completely contains g, delete g, but keep looking, there may be more regions
* i.e.:
* |--------------------- e --------------------|
* |--- g ---| |---- others ----|
*/
mArray.remove(mArray.indexOf(g));
returnValue = true;
} else {
returnValue = true;
/**
* otherwise e overlaps some part of g
*/
GenomeLoc l;
/**
* figure out which region occurs first on the genome. I.e., is it:
* |------------- g ----------|
* |------------- e ----------|
*
* or:
* |------------- g ----------|
* |------------ e -----------|
*
*/
if (e.getStart() < g.getStart()) {
l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
} else {
l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
}
// replace g with the new region
mArray.set(mArray.indexOf(g), l);
returnValue = true;
}
boolean finishEarly = removeOverlappingRegion(g, e);
if ( finishEarly )
break;
}
}
return returnValue;
}
// public boolean removeRegions(GenomeLocSortedSet toRemove) {
// int i = 0, j = 0;
//
// while ( true ) {
// if ( i >= mArray.size() || j >= toRemove.mArray.size() )
// return true;
//
// GenomeLoc g = mArray.get(i);
// GenomeLoc e = toRemove.mArray.get(j);
//
// if (g.overlapsP(e)) {
// boolean finishEarly = removeOverlappingRegion(g, e);
// if ( finishEarly ) {
// i++;
// }
// } else if ( g.compareContigs(e) < 0 ) {
// i++; // g is on an earlier contig, move g forward
// } else if ( g.compareContigs(e) > 0 ) {
// j++; // g is on a later contig, move e forward
// } else if ( g.getStop() < e.getStart() ) {
// i++; // g stops before e starts, move g forward
// } else if ( e.getStop() < g.getStart() ) {
// j++; // g starts after e stops, move e forward
// } else {
// throw new StingException("BUG: unexpected condition: g=" + g + ", e=" + e);
// }
//
// if ( (i + j) % 10000 == 0 )
// logger.debug("removeRegions operation: i + j = " + ( i + j ));
// }
// }
//
// todo -- delete me
private boolean removeOverlappingRegion(GenomeLoc g, GenomeLoc e) {
if (g.equals(e)) {
mArray.remove(mArray.indexOf(g));
return true;
} else if (g.containsP(e)) {
/**
* we have to create two new region, one for the before part, one for the after
* The old region:
* |----------------- old region (g) -------------|
* |----- to delete (e) ------|
*
* product (two new regions):
* |------| + |--------|
*
*/
GenomeLoc before = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
GenomeLoc after = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
int index = mArray.indexOf(g);
if (after.getStop() - after.getStart() >= 0) {
mArray.add(index, after);
}
if (before.getStop() - before.getStart() >= 0) {
mArray.add(index, before);
}
mArray.remove(mArray.indexOf(g));
return true;
} else if (e.containsP(g)) {
/**
* e completely contains g, delete g, but keep looking, there may be more regions
* i.e.:
* |--------------------- e --------------------|
* |--- g ---| |---- others ----|
*/
mArray.remove(mArray.indexOf(g));
return false;
} else {
/**
* otherwise e overlaps some part of g
*/
GenomeLoc l;
/**
* figure out which region occurs first on the genome. I.e., is it:
* |------------- g ----------|
* |------------- e ----------|
*
* or:
* |------------- g ----------|
* |------------ e -----------|
*
*/
if (e.getStart() < g.getStart()) {
l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
} else {
l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
}
// replace g with the new region
mArray.set(mArray.indexOf(g), l);
return false;
}
}
/**
* a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping)
* @param location the GenomeLoc to remove