Cleanup of VariantEval, diatribe about performance problems with StateKey

-- Minor refactoring of state key iteration in VEW.map to make the dependencies more clear
-- Long discussion about the performance problems with StateKey, and how to fix it, which I have run out of time to address before ESP meeting.
This commit is contained in:
Mark DePristo 2012-03-27 11:56:24 -04:00
parent 679bb03014
commit a638996fe2
2 changed files with 104 additions and 13 deletions

View File

@ -371,18 +371,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// find the comp
final VariantContext comp = findMatchingComp(eval, compSet);
HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>();
for ( VariantStratifier vs : stratificationObjects ) {
List<String> states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
stateMap.put(vs, states);
}
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);
for ( StateKey stateKey : stateKeysHash ) {
for ( StateKey stateKey : getApplicableStates(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) {
NewEvaluationContext nec = evaluationContexts.get(stateKey);
// eval against the comp
@ -410,6 +399,73 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
return null;
}
// private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
// final ReferenceContext ref,
// final VariantContext eval,
// final String evalName,
// final VariantContext comp,
// final String compName,
// final String sampleName ) {
// Set<StateKey> oldKeys = new HashSet<StateKey>(Utils.makeCollection(getApplicableStatesOld(tracker, ref, eval, evalName, comp, compName, sampleName)));
//
// int n = 0;
// for ( final StateKey newKey : getApplicableStatesNew(tracker, ref, eval, evalName, comp, compName, sampleName) ) {
// n++;
// if ( ! oldKeys.contains(newKey) )
// throw new ReviewedStingException("New key " + newKey + " missing from previous algorithm");
// }
//
// if ( n != oldKeys.size() )
// throw new ReviewedStingException("New keyset has " + n + " elements but previous algorithm had " + oldKeys.size());
//
// return oldKeys;
// }
// private Iterable<StateKey> getApplicableStatesNew(final RefMetaDataTracker tracker,
// final ReferenceContext ref,
// final VariantContext eval,
// final String evalName,
// final VariantContext comp,
// final String compName,
// final String sampleName ) {
// // todo -- implement optimized version
// }
/**
* Given specific eval and comp VCs and the sample name, return an iterable
* over all of the applicable state keys.
*
* See header of StateKey for performance problems...
*
* @param tracker
* @param ref
* @param eval
* @param evalName
* @param comp
* @param compName
* @param sampleName
* @return
*/
private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final VariantContext eval,
final String evalName,
final VariantContext comp,
final String compName,
final String sampleName ) {
final HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>(stratificationObjects.size());
for ( final VariantStratifier vs : stratificationObjects ) {
List<String> states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);
stateMap.put(vs, states);
}
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
return new HashSet<StateKey>(stateKeys);
}
@Requires({"comp != null", "evals != null"})
private boolean compHasMatchingEval(final VariantContext comp, final Collection<VariantContext> evals) {
// find all of the matching comps

View File

@ -7,7 +7,42 @@ import java.util.TreeMap;
* A final constant class representing the specific state configuration
* for a VariantEvaluator instance.
*
* TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted tostring function.
* The way this is currently implemented is by a map from the name of a VariantStratification to a
* specific state string. For example, the stratification Novelty has states all, known, novel. A
* specific variant and comp would be tagged as "known" by the stratification, and this could be
* represented here by the map (Novelty -> known).
*
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
*
* I've been staring at this state key code for a while. It's just not right, and expensive to boot.
* Here are my thoughts for future work. The state key is both a key with specific state values for
* every stratification. For example, (known, sample1, ac=1). This capability is used in some places,
* such as below, to return a set of all states that should be updated given the eval and comp
* VCs. In principle there are a finite set of such combinations (the product of all states for all active
* stratifications at initialization). We could represent such keys as integers into the set of all combinations.
*
* Note that all of the code that manipulates these things is just terrible. It's all string manipulation and
* HashMaps. Since we are effectively always squaring off our VE analyses (i.e., we have a table with
* all variable values for all stratification combinations) it doesn't make sense to allow so much dynamicism. Instead
* we should just upfront create a giant table indexed by integer keys, and manage data via a simple map from
* specific strat state to this key.
*
* The reason this is so important is that >80% of the runtime of VE with VCFs with >1000 samples is spent in
* the initializeStateKey function. Instead, we should have code that looks like:
*
* init:
* allStates <- initializeCombinationalStateSpace
*
* map:
* for each eval / comp pair:
* for each relevantState based on eval / comp:
* allStates[relevantState].update(eval, comp)
*
*
*/
public final class StateKey {
/** High-performance cache of the toString operation for a constant class */