Linearize the findEndOfRegion algorithm in ActivityProfile, radically improving its performance

-- Previous algorithm was O(N^2) -- #resolve GSA-723 https://jira.broadinstitute.org/browse/GSA-723
2013-01-26 15:23:40 -05:00 · 2013-01-26 15:23:40 -05:00 · 72b2e77eed
parent 0fb238b61e
commit 72b2e77eed
1 changed files with 79 additions and 20 deletions
--- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java
+++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java
@ -49,6 +49,14 @@ public class ActivityProfile {
    protected GenomeLoc regionStartLoc = null;
    protected GenomeLoc regionStopLoc = null;

+    /**
+     * Optimization variable.  Keeps track of the right-most state we looked at in our
+     * last unsuccessful call to findEndOfRegion.  This variable allows us to
+     * avoid an O(N^2) algorithm to find the end of the current region in the profile.  It
+     * must be reset to 0 when a region is popped off the stack.
+     */
+    int startSearchForEndOfRegionHere = 0;
+
    /**
     * A cached value of the regionStartLoc contig length, to make calls to
     * getCurrentContigLength efficient
@ -77,10 +85,10 @@ public class ActivityProfile {
    /**
     * How far away can probability mass be moved around in this profile?
     *
-     * This distance puts an upper limit on how far, in bp, we will ever propogate probability max around
+     * This distance puts an upper limit on how far, in bp, we will ever propagate probability max around
     * when adding a new ActivityProfileState.  For example, if the value of this function is
     * 10, and you are looking at a state at bp 5, and we know that no states beyond 5 + 10 will have
-     * their probability propograted back to that state.
+     * their probability propagated back to that state.
     *
     * @return a positive integer distance in bp
     */
@ -344,6 +352,9 @@ public class ActivityProfile {
            // couldn't find a valid ending offset, so we return null
            return null;

+        // reset the start site of findEndOfRegion to the first element
+        startSearchForEndOfRegionHere = 0;
+
        // we need to create the active region, and clip out the states we're extracting from this profile
        final List<ActivityProfileState> sub = stateList.subList(0, offsetOfNextRegionEnd + 1);
        final List<ActivityProfileState> supportingStates = new ArrayList<ActivityProfileState>(sub);
@ -384,8 +395,72 @@ public class ActivityProfile {
            "result == -1 || result < maxRegionSize",
            "! (result == -1 && forceConversion)"})
    private int findEndOfRegion(final boolean isActiveRegion, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) {
+        int endOfActiveRegion = findFirstActivityBoundary(isActiveRegion, maxRegionSize, startSearchForEndOfRegionHere);
+        startSearchForEndOfRegionHere = Math.max(endOfActiveRegion - getMaxProbPropagationDistance(), 0);
+
+        if ( isActiveRegion && endOfActiveRegion == maxRegionSize )
+            // we've run to the end of the region, let's find a good place to cut
+            endOfActiveRegion = findBestCutSite(endOfActiveRegion, minRegionSize);
+
+        // we're one past the end, so i must be decremented
+        return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size()
+                ? endOfActiveRegion - 1
+                : -1;
+    }
+
+    /**
+     * Find the the local minimum within 0 - endOfActiveRegion where we should divide region
+     *
+     * This algorithm finds the global minimum probability state within the region [minRegionSize, endOfActiveRegion)
+     * (exclusive of endOfActiveRegion), and returns the state index of that state.
+     * that it
+     *
+     * @param endOfActiveRegion the last state of the current active region (exclusive)
+     * @param minRegionSize the minimum of the left-most region, after cutting
+     * @return the index of state after the cut site (just like endOfActiveRegion)
+     */
+    @Requires({"endOfActiveRegion >= minRegionSize", "minRegionSize >= 0"})
+    @Ensures({"result >= minRegionSize", "result <= endOfActiveRegion"})
+    private int findBestCutSite(final int endOfActiveRegion, final int minRegionSize) {
+        int minI = endOfActiveRegion - 1;
+        double minP = Double.MAX_VALUE;
+
+        for ( int i = minI; i >= minRegionSize - 1; i-- ) {
+            double cur = getProb(i);
+            if ( cur < minP && isMinimum(i) ) {
+                minP = cur;
+                minI = i;
+            }
+        }
+
+        return minI + 1;
+    }
+
+    /**
+     * Find the first index into the state list where the state is considered ! isActiveRegion
+     *
+     * Note that each state has a probability of being active, and this function thresholds that
+     * value on ACTIVE_PROB_THRESHOLD, coloring each state as active or inactive.  Finds the
+     * largest contiguous stretch of states starting at startSearchAt with the same isActive
+     * state as isActiveRegion.  If the entire state list has the same isActive value, then returns
+     * maxRegionSize
+     *
+     * @param isActiveRegion are we looking for a stretch of active states, or inactive ones?
+     * @param maxRegionSize don't look for a boundary that would yield a region of size > maxRegionSize
+     * @param startSearchAt start looking not at 0 but rather at this offset into the state list.  This
+     *                      parameter allows us to remember where we looked before across calls, so that
+     *                      we don't keep searching from 0 -> 1, then 0 -> 2, then 0 -> 3 on each subsequent
+     *                      call to this function.  Use with caution, as an incorrect value could result in
+     *                      skipping a true boundary
+     * @return the index of the first state in the state list with isActive value != isActiveRegion, or maxRegionSize
+     *         if no such element exists
+     */
+    @Requires({"maxRegionSize > 0", "startSearchAt >= 0", "startSearchAt <= maxRegionSize", "startSearchAt <= stateList.size()"})
+    @Ensures({"result >= 0", "result <= stateList.size()"})
+    private int findFirstActivityBoundary(final boolean isActiveRegion, final int maxRegionSize, final int startSearchAt) {
        final int nStates = stateList.size();
-        int endOfActiveRegion = 0;
+        int endOfActiveRegion = startSearchAt;
+
        while ( endOfActiveRegion < nStates && endOfActiveRegion < maxRegionSize ) {
            if ( getProb(endOfActiveRegion) > ACTIVE_PROB_THRESHOLD != isActiveRegion ) {
                break;
@ -393,23 +468,7 @@ public class ActivityProfile {
            endOfActiveRegion++;
        }

-        if ( isActiveRegion && endOfActiveRegion == maxRegionSize ) {
-            // we've run to the end of the region, let's find a good place to cut
-            int minI = endOfActiveRegion - 1;
-            double minP = Double.MAX_VALUE;
-            for ( int i = minI; i >= minRegionSize - 1; i-- ) {
-                double cur = getProb(i);
-                if ( cur < minP && isMinimum(i) ) {
-                    minP = cur;
-                    minI = i;
-                }
-            }
-
-            endOfActiveRegion = minI + 1;
-        }
-
-        // we're one past the end, so i must be decremented
-        return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size() ? endOfActiveRegion - 1 : -1;
+        return endOfActiveRegion;
    }

    /**