Cleanup of VariantEval, diatribe about performance problems with StateKey

-- Minor refactoring of state key iteration in VEW.map to make the dependencies more clear -- Long discussion about the performance problems with StateKey, and how to fix it, which I have run out of time to address before ESP meeting.
2012-03-27 11:56:24 -04:00 · 2012-03-27 11:56:24 -04:00 · a638996fe2
parent 679bb03014
commit a638996fe2
2 changed files with 104 additions and 13 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@ -371,18 +371,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
                            // find the comp
                            final VariantContext comp = findMatchingComp(eval, compSet);

-                            HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>();
-                            for ( VariantStratifier vs : stratificationObjects ) {
-                                List<String> states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
-                                stateMap.put(vs, states);
-                            }
-
-                            ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
-                            variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
-
-                            HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);
-
-                            for ( StateKey stateKey : stateKeysHash ) {
+                            for ( StateKey stateKey : getApplicableStates(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) {
                                NewEvaluationContext nec = evaluationContexts.get(stateKey);

                                // eval against the comp
@ -410,6 +399,73 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
        return null;
    }

+//    private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
+//                                                   final ReferenceContext ref,
+//                                                   final VariantContext eval,
+//                                                   final String evalName,
+//                                                   final VariantContext comp,
+//                                                   final String compName,
+//                                                   final String sampleName ) {
+//        Set<StateKey> oldKeys = new HashSet<StateKey>(Utils.makeCollection(getApplicableStatesOld(tracker, ref, eval, evalName, comp, compName, sampleName)));
+//
+//        int n = 0;
+//        for ( final StateKey newKey : getApplicableStatesNew(tracker, ref, eval, evalName, comp, compName, sampleName) ) {
+//            n++;
+//            if ( ! oldKeys.contains(newKey) )
+//                throw new ReviewedStingException("New key " + newKey + " missing from previous algorithm");
+//        }
+//
+//        if ( n != oldKeys.size() )
+//            throw new ReviewedStingException("New keyset has " + n + " elements but previous algorithm had " + oldKeys.size());
+//
+//        return oldKeys;
+//    }
+
+//    private Iterable<StateKey> getApplicableStatesNew(final RefMetaDataTracker tracker,
+//                                                   final ReferenceContext ref,
+//                                                   final VariantContext eval,
+//                                                   final String evalName,
+//                                                   final VariantContext comp,
+//                                                   final String compName,
+//                                                   final String sampleName ) {
+//        // todo -- implement optimized version
+//    }
+
+    /**
+     * Given specific eval and comp VCs and the sample name, return an iterable
+     * over all of the applicable state keys.
+     *
+     * See header of StateKey for performance problems...
+     *
+     * @param tracker
+     * @param ref
+     * @param eval
+     * @param evalName
+     * @param comp
+     * @param compName
+     * @param sampleName
+     * @return
+     */
+    private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
+                                                   final ReferenceContext ref,
+                                                   final VariantContext eval,
+                                                   final String evalName,
+                                                   final VariantContext comp,
+                                                   final String compName,
+                                                   final String sampleName ) {
+        final HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>(stratificationObjects.size());
+        for ( final VariantStratifier vs : stratificationObjects ) {
+            List<String> states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);
+            stateMap.put(vs, states);
+        }
+
+        ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
+        variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
+
+        return new HashSet<StateKey>(stateKeys);
+    }
+
+
    @Requires({"comp != null", "evals != null"})
    private boolean compHasMatchingEval(final VariantContext comp, final Collection<VariantContext> evals) {
        // find all of the matching comps
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java
@ -7,7 +7,42 @@ import java.util.TreeMap;
 * A final constant class representing the specific state configuration
 * for a VariantEvaluator instance.
 *
- * TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted tostring function.
+ * The way this is currently implemented is by a map from the name of a VariantStratification to a
+ * specific state string.  For example, the stratification Novelty has states all, known, novel. A
+ * specific variant and comp would be tagged as "known" by the stratification, and this could be
+ * represented here by the map (Novelty -> known).
+ *
+ * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
+ * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
+ * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
+ * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
+ * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
+ *
+ * I've been staring at this state key code for a while.  It's just not right, and expensive to boot.
+ * Here are my thoughts for future work.  The state key is both a key with specific state values for
+ * every stratification.  For example, (known, sample1, ac=1).  This capability is used in some places,
+ * such as below, to return a set of all states that should be updated given the eval and comp
+ * VCs.  In principle there are a finite set of such combinations (the product of all states for all active
+ * stratifications at initialization).  We could represent such keys as integers into the set of all combinations.
+ *
+ * Note that all of the code that manipulates these things is just terrible. It's all string manipulation and
+ * HashMaps.  Since we are effectively always squaring off our VE analyses (i.e., we have a table with
+ * all variable values for all stratification combinations) it doesn't make sense to allow so much dynamicism.  Instead
+ * we should just upfront create a giant table indexed by integer keys, and manage data via a simple map from
+ * specific strat state to this key.
+ *
+ * The reason this is so important is that >80% of the runtime of VE with VCFs with >1000 samples is spent in
+ * the initializeStateKey function.  Instead, we should have code that looks like:
+ *
+ * init:
+ * allStates <- initializeCombinationalStateSpace
+ *
+ * map:
+ * for each eval / comp pair:
+ *   for each relevantState based on eval / comp:
+ *     allStates[relevantState].update(eval, comp)
+ *
+ *
 */
 public final class StateKey {
    /** High-performance cache of the toString operation for a constant class */