Linearize the findEndOfRegion algorithm in ActivityProfile, radically improving its performance
-- Previous algorithm was O(N^2) -- #resolve GSA-723 https://jira.broadinstitute.org/browse/GSA-723
This commit is contained in:
parent
0fb238b61e
commit
72b2e77eed
|
|
@ -49,6 +49,14 @@ public class ActivityProfile {
|
||||||
protected GenomeLoc regionStartLoc = null;
|
protected GenomeLoc regionStartLoc = null;
|
||||||
protected GenomeLoc regionStopLoc = null;
|
protected GenomeLoc regionStopLoc = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optimization variable. Keeps track of the right-most state we looked at in our
|
||||||
|
* last unsuccessful call to findEndOfRegion. This variable allows us to
|
||||||
|
* avoid an O(N^2) algorithm to find the end of the current region in the profile. It
|
||||||
|
* must be reset to 0 when a region is popped off the stack.
|
||||||
|
*/
|
||||||
|
int startSearchForEndOfRegionHere = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A cached value of the regionStartLoc contig length, to make calls to
|
* A cached value of the regionStartLoc contig length, to make calls to
|
||||||
* getCurrentContigLength efficient
|
* getCurrentContigLength efficient
|
||||||
|
|
@ -77,10 +85,10 @@ public class ActivityProfile {
|
||||||
/**
|
/**
|
||||||
* How far away can probability mass be moved around in this profile?
|
* How far away can probability mass be moved around in this profile?
|
||||||
*
|
*
|
||||||
* This distance puts an upper limit on how far, in bp, we will ever propogate probability max around
|
* This distance puts an upper limit on how far, in bp, we will ever propagate probability max around
|
||||||
* when adding a new ActivityProfileState. For example, if the value of this function is
|
* when adding a new ActivityProfileState. For example, if the value of this function is
|
||||||
* 10, and you are looking at a state at bp 5, and we know that no states beyond 5 + 10 will have
|
* 10, and you are looking at a state at bp 5, and we know that no states beyond 5 + 10 will have
|
||||||
* their probability propograted back to that state.
|
* their probability propagated back to that state.
|
||||||
*
|
*
|
||||||
* @return a positive integer distance in bp
|
* @return a positive integer distance in bp
|
||||||
*/
|
*/
|
||||||
|
|
@ -344,6 +352,9 @@ public class ActivityProfile {
|
||||||
// couldn't find a valid ending offset, so we return null
|
// couldn't find a valid ending offset, so we return null
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
|
// reset the start site of findEndOfRegion to the first element
|
||||||
|
startSearchForEndOfRegionHere = 0;
|
||||||
|
|
||||||
// we need to create the active region, and clip out the states we're extracting from this profile
|
// we need to create the active region, and clip out the states we're extracting from this profile
|
||||||
final List<ActivityProfileState> sub = stateList.subList(0, offsetOfNextRegionEnd + 1);
|
final List<ActivityProfileState> sub = stateList.subList(0, offsetOfNextRegionEnd + 1);
|
||||||
final List<ActivityProfileState> supportingStates = new ArrayList<ActivityProfileState>(sub);
|
final List<ActivityProfileState> supportingStates = new ArrayList<ActivityProfileState>(sub);
|
||||||
|
|
@ -384,8 +395,72 @@ public class ActivityProfile {
|
||||||
"result == -1 || result < maxRegionSize",
|
"result == -1 || result < maxRegionSize",
|
||||||
"! (result == -1 && forceConversion)"})
|
"! (result == -1 && forceConversion)"})
|
||||||
private int findEndOfRegion(final boolean isActiveRegion, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) {
|
private int findEndOfRegion(final boolean isActiveRegion, final int minRegionSize, final int maxRegionSize, final boolean forceConversion) {
|
||||||
|
int endOfActiveRegion = findFirstActivityBoundary(isActiveRegion, maxRegionSize, startSearchForEndOfRegionHere);
|
||||||
|
startSearchForEndOfRegionHere = Math.max(endOfActiveRegion - getMaxProbPropagationDistance(), 0);
|
||||||
|
|
||||||
|
if ( isActiveRegion && endOfActiveRegion == maxRegionSize )
|
||||||
|
// we've run to the end of the region, let's find a good place to cut
|
||||||
|
endOfActiveRegion = findBestCutSite(endOfActiveRegion, minRegionSize);
|
||||||
|
|
||||||
|
// we're one past the end, so i must be decremented
|
||||||
|
return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size()
|
||||||
|
? endOfActiveRegion - 1
|
||||||
|
: -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the the local minimum within 0 - endOfActiveRegion where we should divide region
|
||||||
|
*
|
||||||
|
* This algorithm finds the global minimum probability state within the region [minRegionSize, endOfActiveRegion)
|
||||||
|
* (exclusive of endOfActiveRegion), and returns the state index of that state.
|
||||||
|
* that it
|
||||||
|
*
|
||||||
|
* @param endOfActiveRegion the last state of the current active region (exclusive)
|
||||||
|
* @param minRegionSize the minimum of the left-most region, after cutting
|
||||||
|
* @return the index of state after the cut site (just like endOfActiveRegion)
|
||||||
|
*/
|
||||||
|
@Requires({"endOfActiveRegion >= minRegionSize", "minRegionSize >= 0"})
|
||||||
|
@Ensures({"result >= minRegionSize", "result <= endOfActiveRegion"})
|
||||||
|
private int findBestCutSite(final int endOfActiveRegion, final int minRegionSize) {
|
||||||
|
int minI = endOfActiveRegion - 1;
|
||||||
|
double minP = Double.MAX_VALUE;
|
||||||
|
|
||||||
|
for ( int i = minI; i >= minRegionSize - 1; i-- ) {
|
||||||
|
double cur = getProb(i);
|
||||||
|
if ( cur < minP && isMinimum(i) ) {
|
||||||
|
minP = cur;
|
||||||
|
minI = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return minI + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the first index into the state list where the state is considered ! isActiveRegion
|
||||||
|
*
|
||||||
|
* Note that each state has a probability of being active, and this function thresholds that
|
||||||
|
* value on ACTIVE_PROB_THRESHOLD, coloring each state as active or inactive. Finds the
|
||||||
|
* largest contiguous stretch of states starting at startSearchAt with the same isActive
|
||||||
|
* state as isActiveRegion. If the entire state list has the same isActive value, then returns
|
||||||
|
* maxRegionSize
|
||||||
|
*
|
||||||
|
* @param isActiveRegion are we looking for a stretch of active states, or inactive ones?
|
||||||
|
* @param maxRegionSize don't look for a boundary that would yield a region of size > maxRegionSize
|
||||||
|
* @param startSearchAt start looking not at 0 but rather at this offset into the state list. This
|
||||||
|
* parameter allows us to remember where we looked before across calls, so that
|
||||||
|
* we don't keep searching from 0 -> 1, then 0 -> 2, then 0 -> 3 on each subsequent
|
||||||
|
* call to this function. Use with caution, as an incorrect value could result in
|
||||||
|
* skipping a true boundary
|
||||||
|
* @return the index of the first state in the state list with isActive value != isActiveRegion, or maxRegionSize
|
||||||
|
* if no such element exists
|
||||||
|
*/
|
||||||
|
@Requires({"maxRegionSize > 0", "startSearchAt >= 0", "startSearchAt <= maxRegionSize", "startSearchAt <= stateList.size()"})
|
||||||
|
@Ensures({"result >= 0", "result <= stateList.size()"})
|
||||||
|
private int findFirstActivityBoundary(final boolean isActiveRegion, final int maxRegionSize, final int startSearchAt) {
|
||||||
final int nStates = stateList.size();
|
final int nStates = stateList.size();
|
||||||
int endOfActiveRegion = 0;
|
int endOfActiveRegion = startSearchAt;
|
||||||
|
|
||||||
while ( endOfActiveRegion < nStates && endOfActiveRegion < maxRegionSize ) {
|
while ( endOfActiveRegion < nStates && endOfActiveRegion < maxRegionSize ) {
|
||||||
if ( getProb(endOfActiveRegion) > ACTIVE_PROB_THRESHOLD != isActiveRegion ) {
|
if ( getProb(endOfActiveRegion) > ACTIVE_PROB_THRESHOLD != isActiveRegion ) {
|
||||||
break;
|
break;
|
||||||
|
|
@ -393,23 +468,7 @@ public class ActivityProfile {
|
||||||
endOfActiveRegion++;
|
endOfActiveRegion++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( isActiveRegion && endOfActiveRegion == maxRegionSize ) {
|
return endOfActiveRegion;
|
||||||
// we've run to the end of the region, let's find a good place to cut
|
|
||||||
int minI = endOfActiveRegion - 1;
|
|
||||||
double minP = Double.MAX_VALUE;
|
|
||||||
for ( int i = minI; i >= minRegionSize - 1; i-- ) {
|
|
||||||
double cur = getProb(i);
|
|
||||||
if ( cur < minP && isMinimum(i) ) {
|
|
||||||
minP = cur;
|
|
||||||
minI = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
endOfActiveRegion = minI + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// we're one past the end, so i must be decremented
|
|
||||||
return forceConversion || endOfActiveRegion + getMaxProbPropagationDistance() < stateList.size() ? endOfActiveRegion - 1 : -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue