Cleanup of VariantEval, diatribe about performance problems with StateKey
-- Minor refactoring of state key iteration in VEW.map to make the dependencies more clear -- Long discussion about the performance problems with StateKey, and how to fix it, which I have run out of time to address before ESP meeting.
This commit is contained in:
parent
679bb03014
commit
a638996fe2
|
|
@ -371,18 +371,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
||||||
// find the comp
|
// find the comp
|
||||||
final VariantContext comp = findMatchingComp(eval, compSet);
|
final VariantContext comp = findMatchingComp(eval, compSet);
|
||||||
|
|
||||||
HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>();
|
for ( StateKey stateKey : getApplicableStates(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) {
|
||||||
for ( VariantStratifier vs : stratificationObjects ) {
|
|
||||||
List<String> states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
|
|
||||||
stateMap.put(vs, states);
|
|
||||||
}
|
|
||||||
|
|
||||||
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
|
|
||||||
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
|
|
||||||
|
|
||||||
HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);
|
|
||||||
|
|
||||||
for ( StateKey stateKey : stateKeysHash ) {
|
|
||||||
NewEvaluationContext nec = evaluationContexts.get(stateKey);
|
NewEvaluationContext nec = evaluationContexts.get(stateKey);
|
||||||
|
|
||||||
// eval against the comp
|
// eval against the comp
|
||||||
|
|
@ -410,6 +399,73 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
|
||||||
|
// final ReferenceContext ref,
|
||||||
|
// final VariantContext eval,
|
||||||
|
// final String evalName,
|
||||||
|
// final VariantContext comp,
|
||||||
|
// final String compName,
|
||||||
|
// final String sampleName ) {
|
||||||
|
// Set<StateKey> oldKeys = new HashSet<StateKey>(Utils.makeCollection(getApplicableStatesOld(tracker, ref, eval, evalName, comp, compName, sampleName)));
|
||||||
|
//
|
||||||
|
// int n = 0;
|
||||||
|
// for ( final StateKey newKey : getApplicableStatesNew(tracker, ref, eval, evalName, comp, compName, sampleName) ) {
|
||||||
|
// n++;
|
||||||
|
// if ( ! oldKeys.contains(newKey) )
|
||||||
|
// throw new ReviewedStingException("New key " + newKey + " missing from previous algorithm");
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// if ( n != oldKeys.size() )
|
||||||
|
// throw new ReviewedStingException("New keyset has " + n + " elements but previous algorithm had " + oldKeys.size());
|
||||||
|
//
|
||||||
|
// return oldKeys;
|
||||||
|
// }
|
||||||
|
|
||||||
|
// private Iterable<StateKey> getApplicableStatesNew(final RefMetaDataTracker tracker,
|
||||||
|
// final ReferenceContext ref,
|
||||||
|
// final VariantContext eval,
|
||||||
|
// final String evalName,
|
||||||
|
// final VariantContext comp,
|
||||||
|
// final String compName,
|
||||||
|
// final String sampleName ) {
|
||||||
|
// // todo -- implement optimized version
|
||||||
|
// }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given specific eval and comp VCs and the sample name, return an iterable
|
||||||
|
* over all of the applicable state keys.
|
||||||
|
*
|
||||||
|
* See header of StateKey for performance problems...
|
||||||
|
*
|
||||||
|
* @param tracker
|
||||||
|
* @param ref
|
||||||
|
* @param eval
|
||||||
|
* @param evalName
|
||||||
|
* @param comp
|
||||||
|
* @param compName
|
||||||
|
* @param sampleName
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
|
||||||
|
final ReferenceContext ref,
|
||||||
|
final VariantContext eval,
|
||||||
|
final String evalName,
|
||||||
|
final VariantContext comp,
|
||||||
|
final String compName,
|
||||||
|
final String sampleName ) {
|
||||||
|
final HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>(stratificationObjects.size());
|
||||||
|
for ( final VariantStratifier vs : stratificationObjects ) {
|
||||||
|
List<String> states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);
|
||||||
|
stateMap.put(vs, states);
|
||||||
|
}
|
||||||
|
|
||||||
|
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
|
||||||
|
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
|
||||||
|
|
||||||
|
return new HashSet<StateKey>(stateKeys);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Requires({"comp != null", "evals != null"})
|
@Requires({"comp != null", "evals != null"})
|
||||||
private boolean compHasMatchingEval(final VariantContext comp, final Collection<VariantContext> evals) {
|
private boolean compHasMatchingEval(final VariantContext comp, final Collection<VariantContext> evals) {
|
||||||
// find all of the matching comps
|
// find all of the matching comps
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,42 @@ import java.util.TreeMap;
|
||||||
* A final constant class representing the specific state configuration
|
* A final constant class representing the specific state configuration
|
||||||
* for a VariantEvaluator instance.
|
* for a VariantEvaluator instance.
|
||||||
*
|
*
|
||||||
* TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted tostring function.
|
* The way this is currently implemented is by a map from the name of a VariantStratification to a
|
||||||
|
* specific state string. For example, the stratification Novelty has states all, known, novel. A
|
||||||
|
* specific variant and comp would be tagged as "known" by the stratification, and this could be
|
||||||
|
* represented here by the map (Novelty -> known).
|
||||||
|
*
|
||||||
|
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
|
||||||
|
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
|
||||||
|
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
|
||||||
|
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
|
||||||
|
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
|
||||||
|
*
|
||||||
|
* I've been staring at this state key code for a while. It's just not right, and expensive to boot.
|
||||||
|
* Here are my thoughts for future work. The state key is both a key with specific state values for
|
||||||
|
* every stratification. For example, (known, sample1, ac=1). This capability is used in some places,
|
||||||
|
* such as below, to return a set of all states that should be updated given the eval and comp
|
||||||
|
* VCs. In principle there are a finite set of such combinations (the product of all states for all active
|
||||||
|
* stratifications at initialization). We could represent such keys as integers into the set of all combinations.
|
||||||
|
*
|
||||||
|
* Note that all of the code that manipulates these things is just terrible. It's all string manipulation and
|
||||||
|
* HashMaps. Since we are effectively always squaring off our VE analyses (i.e., we have a table with
|
||||||
|
* all variable values for all stratification combinations) it doesn't make sense to allow so much dynamicism. Instead
|
||||||
|
* we should just upfront create a giant table indexed by integer keys, and manage data via a simple map from
|
||||||
|
* specific strat state to this key.
|
||||||
|
*
|
||||||
|
* The reason this is so important is that >80% of the runtime of VE with VCFs with >1000 samples is spent in
|
||||||
|
* the initializeStateKey function. Instead, we should have code that looks like:
|
||||||
|
*
|
||||||
|
* init:
|
||||||
|
* allStates <- initializeCombinationalStateSpace
|
||||||
|
*
|
||||||
|
* map:
|
||||||
|
* for each eval / comp pair:
|
||||||
|
* for each relevantState based on eval / comp:
|
||||||
|
* allStates[relevantState].update(eval, comp)
|
||||||
|
*
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public final class StateKey {
|
public final class StateKey {
|
||||||
/** High-performance cache of the toString operation for a constant class */
|
/** High-performance cache of the toString operation for a constant class */
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue