diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 15aca4162..14a8e6dcb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -49,6 +49,14 @@ public class ActivityProfile { protected GenomeLoc regionStartLoc = null; protected GenomeLoc regionStopLoc = null; + /** + * Optimization variable. Keeps track of the right-most state we looked at in our + * last unsuccessful call to findEndOfRegion. This variable allows us to + * avoid an O(N^2) algorithm to find the end of the current region in the profile. It + * must be reset to 0 when a region is popped off the stack. + */ + int startSearchForEndOfRegionHere = 0; + /** * A cached value of the regionStartLoc contig length, to make calls to * getCurrentContigLength efficient @@ -77,10 +85,10 @@ public class ActivityProfile { /** * How far away can probability mass be moved around in this profile? * - * This distance puts an upper limit on how far, in bp, we will ever propogate probability max around + * This distance puts an upper limit on how far, in bp, we will ever propagate probability max around * when adding a new ActivityProfileState. For example, if the value of this function is * 10, and you are looking at a state at bp 5, and we know that no states beyond 5 + 10 will have - * their probability propograted back to that state. + * their probability propagated back to that state. * * @return a positive integer distance in bp */ @@ -344,6 +352,9 @@ public class ActivityProfile { // couldn't find a valid ending offset, so we return null return null; + // reset the start site of findEndOfRegion to the first element + startSearchForEndOfRegionHere = 0; + // we need to create the active region, and clip out the states we're extracting from this profile final List sub = stateList.subList(0, offsetOfNextRegionEnd + 1); final List supportingStates = new ArrayList(sub); @@ -384,8 +395,72 @@ public class ActivityProfile { "result == -1 || result < maxRegionSize", "! (result == -1 && forceConversion)"}) private int findEndOfRegion(final boolean isActiveRegion, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) { + int endOfActiveRegion = findFirstActivityBoundary(isActiveRegion, maxRegionSize, startSearchForEndOfRegionHere); + startSearchForEndOfRegionHere = Math.max(endOfActiveRegion - getMaxProbPropagationDistance(), 0); + + if ( isActiveRegion && endOfActiveRegion == maxRegionSize ) + // we've run to the end of the region, let's find a good place to cut + endOfActiveRegion = findBestCutSite(endOfActiveRegion, minRegionSize); + + // we're one past the end, so i must be decremented + return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size() + ? endOfActiveRegion - 1 + : -1; + } + + /** + * Find the the local minimum within 0 - endOfActiveRegion where we should divide region + * + * This algorithm finds the global minimum probability state within the region [minRegionSize, endOfActiveRegion) + * (exclusive of endOfActiveRegion), and returns the state index of that state. + * that it + * + * @param endOfActiveRegion the last state of the current active region (exclusive) + * @param minRegionSize the minimum of the left-most region, after cutting + * @return the index of state after the cut site (just like endOfActiveRegion) + */ + @Requires({"endOfActiveRegion >= minRegionSize", "minRegionSize >= 0"}) + @Ensures({"result >= minRegionSize", "result <= endOfActiveRegion"}) + private int findBestCutSite(final int endOfActiveRegion, final int minRegionSize) { + int minI = endOfActiveRegion - 1; + double minP = Double.MAX_VALUE; + + for ( int i = minI; i >= minRegionSize - 1; i-- ) { + double cur = getProb(i); + if ( cur < minP && isMinimum(i) ) { + minP = cur; + minI = i; + } + } + + return minI + 1; + } + + /** + * Find the first index into the state list where the state is considered ! isActiveRegion + * + * Note that each state has a probability of being active, and this function thresholds that + * value on ACTIVE_PROB_THRESHOLD, coloring each state as active or inactive. Finds the + * largest contiguous stretch of states starting at startSearchAt with the same isActive + * state as isActiveRegion. If the entire state list has the same isActive value, then returns + * maxRegionSize + * + * @param isActiveRegion are we looking for a stretch of active states, or inactive ones? + * @param maxRegionSize don't look for a boundary that would yield a region of size > maxRegionSize + * @param startSearchAt start looking not at 0 but rather at this offset into the state list. This + * parameter allows us to remember where we looked before across calls, so that + * we don't keep searching from 0 -> 1, then 0 -> 2, then 0 -> 3 on each subsequent + * call to this function. Use with caution, as an incorrect value could result in + * skipping a true boundary + * @return the index of the first state in the state list with isActive value != isActiveRegion, or maxRegionSize + * if no such element exists + */ + @Requires({"maxRegionSize > 0", "startSearchAt >= 0", "startSearchAt <= maxRegionSize", "startSearchAt <= stateList.size()"}) + @Ensures({"result >= 0", "result <= stateList.size()"}) + private int findFirstActivityBoundary(final boolean isActiveRegion, final int maxRegionSize, final int startSearchAt) { final int nStates = stateList.size(); - int endOfActiveRegion = 0; + int endOfActiveRegion = startSearchAt; + while ( endOfActiveRegion < nStates && endOfActiveRegion < maxRegionSize ) { if ( getProb(endOfActiveRegion) > ACTIVE_PROB_THRESHOLD != isActiveRegion ) { break; @@ -393,23 +468,7 @@ public class ActivityProfile { endOfActiveRegion++; } - if ( isActiveRegion && endOfActiveRegion == maxRegionSize ) { - // we've run to the end of the region, let's find a good place to cut - int minI = endOfActiveRegion - 1; - double minP = Double.MAX_VALUE; - for ( int i = minI; i >= minRegionSize - 1; i-- ) { - double cur = getProb(i); - if ( cur < minP && isMinimum(i) ) { - minP = cur; - minI = i; - } - } - - endOfActiveRegion = minI + 1; - } - - // we're one past the end, so i must be decremented - return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size() ? endOfActiveRegion - 1 : -1; + return endOfActiveRegion; } /**