High performance, correct implementation of -XL exclusion lists. Enjoy.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2994 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
88a48821ea
commit
e7eae9b61d
|
|
@ -161,7 +161,8 @@ public class GenomeAnalysisEngine {
|
||||||
// todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided
|
// todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided
|
||||||
GenomeLocSortedSet excludeIntervals = null;
|
GenomeLocSortedSet excludeIntervals = null;
|
||||||
if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
|
if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) {
|
||||||
excludeIntervals = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL));
|
List<GenomeLoc> rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL);
|
||||||
|
excludeIntervals = GenomeLocSortedSet.createSetFromList(rawExcludeIntervals);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argCollection.intervals != null && argCollection.intervalMerging.check()) {
|
if (argCollection.intervals != null && argCollection.intervalMerging.check()) {
|
||||||
|
|
@ -307,11 +308,12 @@ public class GenomeAnalysisEngine {
|
||||||
|
|
||||||
private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) {
|
private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) {
|
||||||
logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size()));
|
logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size()));
|
||||||
for ( GenomeLoc exclude : toExclude )
|
//for ( GenomeLoc exclude : toExclude )
|
||||||
toPrune.removeRegion(exclude);
|
// toPrune.removeRegion(exclude);
|
||||||
logger.info(String.format("done pruning intervals == now have %d", toPrune.size()));
|
GenomeLocSortedSet x = toPrune.substractRegions(toExclude);
|
||||||
|
logger.info(String.format("done pruning intervals == now have %d", x.size()));
|
||||||
|
|
||||||
return toPrune;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -136,6 +136,112 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public GenomeLocSortedSet substractRegions(GenomeLocSortedSet toRemoveSet) {
|
||||||
|
LinkedList<GenomeLoc> good = new LinkedList<GenomeLoc>();
|
||||||
|
Stack<GenomeLoc> toProcess = new Stack<GenomeLoc>();
|
||||||
|
Stack<GenomeLoc> toExclude = new Stack<GenomeLoc>();
|
||||||
|
|
||||||
|
// initialize the stacks
|
||||||
|
toProcess.addAll(mArray);
|
||||||
|
Collections.reverse(toProcess);
|
||||||
|
toExclude.addAll(toRemoveSet.mArray);
|
||||||
|
Collections.reverse(toExclude);
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
while ( ! toProcess.empty() ) { // while there's still stuff to process
|
||||||
|
if ( toExclude.empty() ) {
|
||||||
|
good.addAll(toProcess); // no more excludes, all the processing stuff is good
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
GenomeLoc p = toProcess.peek();
|
||||||
|
GenomeLoc e = toExclude.peek();
|
||||||
|
|
||||||
|
if ( p.overlapsP(e) ) {
|
||||||
|
toProcess.pop();
|
||||||
|
for ( GenomeLoc newP : subtractRegion(p, e) )
|
||||||
|
toProcess.push(newP);
|
||||||
|
} else if ( p.compareContigs(e) < 0 ) {
|
||||||
|
good.add(toProcess.pop()); // p is now good
|
||||||
|
} else if ( p.compareContigs(e) > 0 ) {
|
||||||
|
toExclude.pop(); // e can't effect anything
|
||||||
|
} else if ( p.getStop() < e.getStart() ) {
|
||||||
|
good.add(toProcess.pop()); // p stops before e starts, p is good
|
||||||
|
} else if ( e.getStop() < p.getStart() ) {
|
||||||
|
toExclude.pop(); // p starts after e stops, e is done
|
||||||
|
} else {
|
||||||
|
throw new StingException("BUG: unexpected condition: p=" + p + ", e=" + e);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( i++ % 10000 == 0 )
|
||||||
|
logger.debug("removeRegions operation: i = " + i);
|
||||||
|
}
|
||||||
|
|
||||||
|
return GenomeLocSortedSet.createSetFromList(good);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static final List<GenomeLoc> EMPTY_LIST = new ArrayList<GenomeLoc>();
|
||||||
|
private List<GenomeLoc> subtractRegion(GenomeLoc g, GenomeLoc e) {
|
||||||
|
if (g.equals(e)) {
|
||||||
|
return EMPTY_LIST;
|
||||||
|
} else if (g.containsP(e)) {
|
||||||
|
List<GenomeLoc> l = new ArrayList<GenomeLoc>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* we have to create two new region, one for the before part, one for the after
|
||||||
|
* The old region:
|
||||||
|
* |----------------- old region (g) -------------|
|
||||||
|
* |----- to delete (e) ------|
|
||||||
|
*
|
||||||
|
* product (two new regions):
|
||||||
|
* |------| + |--------|
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
GenomeLoc before = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
|
||||||
|
GenomeLoc after = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
||||||
|
if (after.getStop() - after.getStart() >= 0) {
|
||||||
|
l.add(after);
|
||||||
|
}
|
||||||
|
if (before.getStop() - before.getStart() >= 0) {
|
||||||
|
l.add(before);
|
||||||
|
}
|
||||||
|
|
||||||
|
return l;
|
||||||
|
} else if (e.containsP(g)) {
|
||||||
|
/**
|
||||||
|
* e completely contains g, delete g, but keep looking, there may be more regions
|
||||||
|
* i.e.:
|
||||||
|
* |--------------------- e --------------------|
|
||||||
|
* |--- g ---| |---- others ----|
|
||||||
|
*/
|
||||||
|
return EMPTY_LIST; // don't need to do anything
|
||||||
|
} else {
|
||||||
|
/**
|
||||||
|
* otherwise e overlaps some part of g
|
||||||
|
*
|
||||||
|
* figure out which region occurs first on the genome. I.e., is it:
|
||||||
|
* |------------- g ----------|
|
||||||
|
* |------------- e ----------|
|
||||||
|
*
|
||||||
|
* or:
|
||||||
|
* |------------- g ----------|
|
||||||
|
* |------------ e -----------|
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
GenomeLoc n;
|
||||||
|
if (e.getStart() < g.getStart()) {
|
||||||
|
n = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
||||||
|
} else {
|
||||||
|
n = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// replace g with the new region
|
||||||
|
return Arrays.asList(n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* remove an element from the set. Given a specific genome location, this function will
|
* remove an element from the set. Given a specific genome location, this function will
|
||||||
* remove all regions in the element set that overlap the specified region.
|
* remove all regions in the element set that overlap the specified region.
|
||||||
|
|
@ -145,84 +251,128 @@ public class GenomeLocSortedSet extends AbstractSet<GenomeLoc> {
|
||||||
* @return true if a removal action was performed, false if the collection was unchanged.
|
* @return true if a removal action was performed, false if the collection was unchanged.
|
||||||
*/
|
*/
|
||||||
public boolean removeRegion(GenomeLoc e) {
|
public boolean removeRegion(GenomeLoc e) {
|
||||||
|
// todo -- delete me
|
||||||
if (e == null) {
|
if (e == null) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// sometimes we can't return right away, this holds the value for those cases
|
// sometimes we can't return right away, this holds the value for those cases
|
||||||
boolean returnValue = false;
|
boolean returnValue = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* check if the specified element overlaps any current locations, subtract the removed
|
* check if the specified element overlaps any current locations, subtract the removed
|
||||||
* region and reinsert what is left.
|
* region and reinsert what is left.
|
||||||
*/
|
*/
|
||||||
for (GenomeLoc g : mArray) {
|
for (GenomeLoc g : mArray) {
|
||||||
if (g.overlapsP(e)) {
|
if (g.overlapsP(e)) {
|
||||||
if (g.equals(e)) {
|
returnValue = true;
|
||||||
mArray.remove(mArray.indexOf(g));
|
|
||||||
return true;
|
|
||||||
} else if (g.containsP(e)) {
|
|
||||||
/**
|
|
||||||
* we have to create two new region, one for the before part, one for the after
|
|
||||||
* The old region:
|
|
||||||
* |----------------- old region (g) -------------|
|
|
||||||
* |----- to delete (e) ------|
|
|
||||||
*
|
|
||||||
* product (two new regions):
|
|
||||||
* |------| + |--------|
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
GenomeLoc before = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
|
|
||||||
GenomeLoc after = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
|
||||||
int index = mArray.indexOf(g);
|
|
||||||
if (after.getStop() - after.getStart() >= 0) {
|
|
||||||
mArray.add(index, after);
|
|
||||||
}
|
|
||||||
if (before.getStop() - before.getStart() >= 0) {
|
|
||||||
mArray.add(index, before);
|
|
||||||
}
|
|
||||||
mArray.remove(mArray.indexOf(g));
|
|
||||||
return true;
|
|
||||||
} else if (e.containsP(g)) {
|
|
||||||
/**
|
|
||||||
* e completely contains g, delete g, but keep looking, there may be more regions
|
|
||||||
* i.e.:
|
|
||||||
* |--------------------- e --------------------|
|
|
||||||
* |--- g ---| |---- others ----|
|
|
||||||
*/
|
|
||||||
mArray.remove(mArray.indexOf(g));
|
|
||||||
returnValue = true;
|
|
||||||
} else {
|
|
||||||
|
|
||||||
/**
|
boolean finishEarly = removeOverlappingRegion(g, e);
|
||||||
* otherwise e overlaps some part of g
|
if ( finishEarly )
|
||||||
*/
|
break;
|
||||||
GenomeLoc l;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* figure out which region occurs first on the genome. I.e., is it:
|
|
||||||
* |------------- g ----------|
|
|
||||||
* |------------- e ----------|
|
|
||||||
*
|
|
||||||
* or:
|
|
||||||
* |------------- g ----------|
|
|
||||||
* |------------ e -----------|
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
if (e.getStart() < g.getStart()) {
|
|
||||||
l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
|
||||||
} else {
|
|
||||||
l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
|
|
||||||
}
|
|
||||||
// replace g with the new region
|
|
||||||
mArray.set(mArray.indexOf(g), l);
|
|
||||||
returnValue = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return returnValue;
|
return returnValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// public boolean removeRegions(GenomeLocSortedSet toRemove) {
|
||||||
|
// int i = 0, j = 0;
|
||||||
|
//
|
||||||
|
// while ( true ) {
|
||||||
|
// if ( i >= mArray.size() || j >= toRemove.mArray.size() )
|
||||||
|
// return true;
|
||||||
|
//
|
||||||
|
// GenomeLoc g = mArray.get(i);
|
||||||
|
// GenomeLoc e = toRemove.mArray.get(j);
|
||||||
|
//
|
||||||
|
// if (g.overlapsP(e)) {
|
||||||
|
// boolean finishEarly = removeOverlappingRegion(g, e);
|
||||||
|
// if ( finishEarly ) {
|
||||||
|
// i++;
|
||||||
|
// }
|
||||||
|
// } else if ( g.compareContigs(e) < 0 ) {
|
||||||
|
// i++; // g is on an earlier contig, move g forward
|
||||||
|
// } else if ( g.compareContigs(e) > 0 ) {
|
||||||
|
// j++; // g is on a later contig, move e forward
|
||||||
|
// } else if ( g.getStop() < e.getStart() ) {
|
||||||
|
// i++; // g stops before e starts, move g forward
|
||||||
|
// } else if ( e.getStop() < g.getStart() ) {
|
||||||
|
// j++; // g starts after e stops, move e forward
|
||||||
|
// } else {
|
||||||
|
// throw new StingException("BUG: unexpected condition: g=" + g + ", e=" + e);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// if ( (i + j) % 10000 == 0 )
|
||||||
|
// logger.debug("removeRegions operation: i + j = " + ( i + j ));
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
|
||||||
|
// todo -- delete me
|
||||||
|
private boolean removeOverlappingRegion(GenomeLoc g, GenomeLoc e) {
|
||||||
|
if (g.equals(e)) {
|
||||||
|
mArray.remove(mArray.indexOf(g));
|
||||||
|
return true;
|
||||||
|
} else if (g.containsP(e)) {
|
||||||
|
/**
|
||||||
|
* we have to create two new region, one for the before part, one for the after
|
||||||
|
* The old region:
|
||||||
|
* |----------------- old region (g) -------------|
|
||||||
|
* |----- to delete (e) ------|
|
||||||
|
*
|
||||||
|
* product (two new regions):
|
||||||
|
* |------| + |--------|
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
GenomeLoc before = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
|
||||||
|
GenomeLoc after = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
||||||
|
int index = mArray.indexOf(g);
|
||||||
|
if (after.getStop() - after.getStart() >= 0) {
|
||||||
|
mArray.add(index, after);
|
||||||
|
}
|
||||||
|
if (before.getStop() - before.getStart() >= 0) {
|
||||||
|
mArray.add(index, before);
|
||||||
|
}
|
||||||
|
mArray.remove(mArray.indexOf(g));
|
||||||
|
return true;
|
||||||
|
} else if (e.containsP(g)) {
|
||||||
|
/**
|
||||||
|
* e completely contains g, delete g, but keep looking, there may be more regions
|
||||||
|
* i.e.:
|
||||||
|
* |--------------------- e --------------------|
|
||||||
|
* |--- g ---| |---- others ----|
|
||||||
|
*/
|
||||||
|
mArray.remove(mArray.indexOf(g));
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
/**
|
||||||
|
* otherwise e overlaps some part of g
|
||||||
|
*/
|
||||||
|
GenomeLoc l;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* figure out which region occurs first on the genome. I.e., is it:
|
||||||
|
* |------------- g ----------|
|
||||||
|
* |------------- e ----------|
|
||||||
|
*
|
||||||
|
* or:
|
||||||
|
* |------------- g ----------|
|
||||||
|
* |------------ e -----------|
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (e.getStart() < g.getStart()) {
|
||||||
|
l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop());
|
||||||
|
} else {
|
||||||
|
l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1);
|
||||||
|
}
|
||||||
|
// replace g with the new region
|
||||||
|
mArray.set(mArray.indexOf(g), l);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping)
|
* a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping)
|
||||||
* @param location the GenomeLoc to remove
|
* @param location the GenomeLoc to remove
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue