From e7eae9b61deb49ca397beb59c9299dce2dc975ae Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 12 Mar 2010 22:39:20 +0000 Subject: [PATCH] High performance, correct implementation of -XL exclusion lists. Enjoy. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2994 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 12 +- .../sting/utils/GenomeLocSortedSet.java | 270 ++++++++++++++---- 2 files changed, 217 insertions(+), 65 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 9ae1271cd..0874e9f5e 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -161,7 +161,8 @@ public class GenomeAnalysisEngine { // todo -- call createSetFromList for -XL argument, and unify this with intervals, if provided GenomeLocSortedSet excludeIntervals = null; if (argCollection.excludeIntervals != null && argCollection.intervalMerging.check()) { - excludeIntervals = GenomeLocSortedSet.createSetFromList(parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL)); + List rawExcludeIntervals = parseIntervalRegion(argCollection.excludeIntervals, IntervalMergingRule.ALL); + excludeIntervals = GenomeLocSortedSet.createSetFromList(rawExcludeIntervals); } if (argCollection.intervals != null && argCollection.intervalMerging.check()) { @@ -307,11 +308,12 @@ public class GenomeAnalysisEngine { private GenomeLocSortedSet pruneIntervals( GenomeLocSortedSet toPrune, GenomeLocSortedSet toExclude) { logger.info(String.format("pruning intervals from %d against %d", toPrune.size(), toExclude.size())); - for ( GenomeLoc exclude : toExclude ) - toPrune.removeRegion(exclude); - logger.info(String.format("done pruning intervals == now have %d", toPrune.size())); + //for ( GenomeLoc exclude : toExclude ) + // toPrune.removeRegion(exclude); + GenomeLocSortedSet x = toPrune.substractRegions(toExclude); + logger.info(String.format("done pruning intervals == now have %d", x.size())); - return toPrune; + return x; } /** diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 5663dfc6e..13446806e 100755 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -136,6 +136,112 @@ public class GenomeLocSortedSet extends AbstractSet { return true; } + public GenomeLocSortedSet substractRegions(GenomeLocSortedSet toRemoveSet) { + LinkedList good = new LinkedList(); + Stack toProcess = new Stack(); + Stack toExclude = new Stack(); + + // initialize the stacks + toProcess.addAll(mArray); + Collections.reverse(toProcess); + toExclude.addAll(toRemoveSet.mArray); + Collections.reverse(toExclude); + + int i = 0; + while ( ! toProcess.empty() ) { // while there's still stuff to process + if ( toExclude.empty() ) { + good.addAll(toProcess); // no more excludes, all the processing stuff is good + break; + } + + GenomeLoc p = toProcess.peek(); + GenomeLoc e = toExclude.peek(); + + if ( p.overlapsP(e) ) { + toProcess.pop(); + for ( GenomeLoc newP : subtractRegion(p, e) ) + toProcess.push(newP); + } else if ( p.compareContigs(e) < 0 ) { + good.add(toProcess.pop()); // p is now good + } else if ( p.compareContigs(e) > 0 ) { + toExclude.pop(); // e can't effect anything + } else if ( p.getStop() < e.getStart() ) { + good.add(toProcess.pop()); // p stops before e starts, p is good + } else if ( e.getStop() < p.getStart() ) { + toExclude.pop(); // p starts after e stops, e is done + } else { + throw new StingException("BUG: unexpected condition: p=" + p + ", e=" + e); + } + + if ( i++ % 10000 == 0 ) + logger.debug("removeRegions operation: i = " + i); + } + + return GenomeLocSortedSet.createSetFromList(good); + } + + + private static final List EMPTY_LIST = new ArrayList(); + private List subtractRegion(GenomeLoc g, GenomeLoc e) { + if (g.equals(e)) { + return EMPTY_LIST; + } else if (g.containsP(e)) { + List l = new ArrayList(); + + /** + * we have to create two new region, one for the before part, one for the after + * The old region: + * |----------------- old region (g) -------------| + * |----- to delete (e) ------| + * + * product (two new regions): + * |------| + |--------| + * + */ + GenomeLoc before = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1); + GenomeLoc after = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop()); + if (after.getStop() - after.getStart() >= 0) { + l.add(after); + } + if (before.getStop() - before.getStart() >= 0) { + l.add(before); + } + + return l; + } else if (e.containsP(g)) { + /** + * e completely contains g, delete g, but keep looking, there may be more regions + * i.e.: + * |--------------------- e --------------------| + * |--- g ---| |---- others ----| + */ + return EMPTY_LIST; // don't need to do anything + } else { + /** + * otherwise e overlaps some part of g + * + * figure out which region occurs first on the genome. I.e., is it: + * |------------- g ----------| + * |------------- e ----------| + * + * or: + * |------------- g ----------| + * |------------ e -----------| + * + */ + + GenomeLoc n; + if (e.getStart() < g.getStart()) { + n = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop()); + } else { + n = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1); + } + + // replace g with the new region + return Arrays.asList(n); + } + } + /** * remove an element from the set. Given a specific genome location, this function will * remove all regions in the element set that overlap the specified region. @@ -145,84 +251,128 @@ public class GenomeLocSortedSet extends AbstractSet { * @return true if a removal action was performed, false if the collection was unchanged. */ public boolean removeRegion(GenomeLoc e) { + // todo -- delete me if (e == null) { return false; } // sometimes we can't return right away, this holds the value for those cases boolean returnValue = false; + /** * check if the specified element overlaps any current locations, subtract the removed * region and reinsert what is left. */ for (GenomeLoc g : mArray) { if (g.overlapsP(e)) { - if (g.equals(e)) { - mArray.remove(mArray.indexOf(g)); - return true; - } else if (g.containsP(e)) { - /** - * we have to create two new region, one for the before part, one for the after - * The old region: - * |----------------- old region (g) -------------| - * |----- to delete (e) ------| - * - * product (two new regions): - * |------| + |--------| - * - */ - GenomeLoc before = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1); - GenomeLoc after = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop()); - int index = mArray.indexOf(g); - if (after.getStop() - after.getStart() >= 0) { - mArray.add(index, after); - } - if (before.getStop() - before.getStart() >= 0) { - mArray.add(index, before); - } - mArray.remove(mArray.indexOf(g)); - return true; - } else if (e.containsP(g)) { - /** - * e completely contains g, delete g, but keep looking, there may be more regions - * i.e.: - * |--------------------- e --------------------| - * |--- g ---| |---- others ----| - */ - mArray.remove(mArray.indexOf(g)); - returnValue = true; - } else { + returnValue = true; - /** - * otherwise e overlaps some part of g - */ - GenomeLoc l; - - /** - * figure out which region occurs first on the genome. I.e., is it: - * |------------- g ----------| - * |------------- e ----------| - * - * or: - * |------------- g ----------| - * |------------ e -----------| - * - */ - - if (e.getStart() < g.getStart()) { - l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop()); - } else { - l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1); - } - // replace g with the new region - mArray.set(mArray.indexOf(g), l); - returnValue = true; - } + boolean finishEarly = removeOverlappingRegion(g, e); + if ( finishEarly ) + break; } } + return returnValue; } +// public boolean removeRegions(GenomeLocSortedSet toRemove) { +// int i = 0, j = 0; +// +// while ( true ) { +// if ( i >= mArray.size() || j >= toRemove.mArray.size() ) +// return true; +// +// GenomeLoc g = mArray.get(i); +// GenomeLoc e = toRemove.mArray.get(j); +// +// if (g.overlapsP(e)) { +// boolean finishEarly = removeOverlappingRegion(g, e); +// if ( finishEarly ) { +// i++; +// } +// } else if ( g.compareContigs(e) < 0 ) { +// i++; // g is on an earlier contig, move g forward +// } else if ( g.compareContigs(e) > 0 ) { +// j++; // g is on a later contig, move e forward +// } else if ( g.getStop() < e.getStart() ) { +// i++; // g stops before e starts, move g forward +// } else if ( e.getStop() < g.getStart() ) { +// j++; // g starts after e stops, move e forward +// } else { +// throw new StingException("BUG: unexpected condition: g=" + g + ", e=" + e); +// } +// +// if ( (i + j) % 10000 == 0 ) +// logger.debug("removeRegions operation: i + j = " + ( i + j )); +// } +// } +// + + // todo -- delete me + private boolean removeOverlappingRegion(GenomeLoc g, GenomeLoc e) { + if (g.equals(e)) { + mArray.remove(mArray.indexOf(g)); + return true; + } else if (g.containsP(e)) { + /** + * we have to create two new region, one for the before part, one for the after + * The old region: + * |----------------- old region (g) -------------| + * |----- to delete (e) ------| + * + * product (two new regions): + * |------| + |--------| + * + */ + GenomeLoc before = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1); + GenomeLoc after = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop()); + int index = mArray.indexOf(g); + if (after.getStop() - after.getStart() >= 0) { + mArray.add(index, after); + } + if (before.getStop() - before.getStart() >= 0) { + mArray.add(index, before); + } + mArray.remove(mArray.indexOf(g)); + return true; + } else if (e.containsP(g)) { + /** + * e completely contains g, delete g, but keep looking, there may be more regions + * i.e.: + * |--------------------- e --------------------| + * |--- g ---| |---- others ----| + */ + mArray.remove(mArray.indexOf(g)); + return false; + } else { + /** + * otherwise e overlaps some part of g + */ + GenomeLoc l; + + /** + * figure out which region occurs first on the genome. I.e., is it: + * |------------- g ----------| + * |------------- e ----------| + * + * or: + * |------------- g ----------| + * |------------ e -----------| + * + */ + + if (e.getStart() < g.getStart()) { + l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), e.getStop() + 1, g.getStop()); + } else { + l = GenomeLocParser.createGenomeLoc(g.getContigIndex(), g.getStart(), e.getStart() - 1); + } + // replace g with the new region + mArray.set(mArray.indexOf(g), l); + return false; + } + } + /** * a simple removal of an interval contained in this list. The interval must be identical to one in the list (no partial locations or overlapping) * @param location the GenomeLoc to remove