diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 4bfd90eac..ebd2500fd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -371,18 +371,7 @@ public class VariantEvalWalker extends RodWalker implements Tr // find the comp final VariantContext comp = findMatchingComp(eval, compSet); - HashMap> stateMap = new HashMap>(); - for ( VariantStratifier vs : stratificationObjects ) { - List states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName); - stateMap.put(vs, states); - } - - ArrayList stateKeys = new ArrayList(); - variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); - - HashSet stateKeysHash = new HashSet(stateKeys); - - for ( StateKey stateKey : stateKeysHash ) { + for ( StateKey stateKey : getApplicableStates(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) { NewEvaluationContext nec = evaluationContexts.get(stateKey); // eval against the comp @@ -410,6 +399,73 @@ public class VariantEvalWalker extends RodWalker implements Tr return null; } +// private Iterable getApplicableStates(final RefMetaDataTracker tracker, +// final ReferenceContext ref, +// final VariantContext eval, +// final String evalName, +// final VariantContext comp, +// final String compName, +// final String sampleName ) { +// Set oldKeys = new HashSet(Utils.makeCollection(getApplicableStatesOld(tracker, ref, eval, evalName, comp, compName, sampleName))); +// +// int n = 0; +// for ( final StateKey newKey : getApplicableStatesNew(tracker, ref, eval, evalName, comp, compName, sampleName) ) { +// n++; +// if ( ! oldKeys.contains(newKey) ) +// throw new ReviewedStingException("New key " + newKey + " missing from previous algorithm"); +// } +// +// if ( n != oldKeys.size() ) +// throw new ReviewedStingException("New keyset has " + n + " elements but previous algorithm had " + oldKeys.size()); +// +// return oldKeys; +// } + +// private Iterable getApplicableStatesNew(final RefMetaDataTracker tracker, +// final ReferenceContext ref, +// final VariantContext eval, +// final String evalName, +// final VariantContext comp, +// final String compName, +// final String sampleName ) { +// // todo -- implement optimized version +// } + + /** + * Given specific eval and comp VCs and the sample name, return an iterable + * over all of the applicable state keys. + * + * See header of StateKey for performance problems... + * + * @param tracker + * @param ref + * @param eval + * @param evalName + * @param comp + * @param compName + * @param sampleName + * @return + */ + private Iterable getApplicableStates(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final VariantContext eval, + final String evalName, + final VariantContext comp, + final String compName, + final String sampleName ) { + final HashMap> stateMap = new HashMap>(stratificationObjects.size()); + for ( final VariantStratifier vs : stratificationObjects ) { + List states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName); + stateMap.put(vs, states); + } + + ArrayList stateKeys = new ArrayList(); + variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys); + + return new HashSet(stateKeys); + } + + @Requires({"comp != null", "evals != null"}) private boolean compHasMatchingEval(final VariantContext comp, final Collection evals) { // find all of the matching comps diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java index 36b09300b..f62de17a5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/StateKey.java @@ -7,7 +7,42 @@ import java.util.TreeMap; * A final constant class representing the specific state configuration * for a VariantEvaluator instance. * - * TODO optimizations to entirely remove the TreeMap and just store the HashMap for performance and use the tree for the sorted tostring function. + * The way this is currently implemented is by a map from the name of a VariantStratification to a + * specific state string. For example, the stratification Novelty has states all, known, novel. A + * specific variant and comp would be tagged as "known" by the stratification, and this could be + * represented here by the map (Novelty -> known). + * + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12 + * + * I've been staring at this state key code for a while. It's just not right, and expensive to boot. + * Here are my thoughts for future work. The state key is both a key with specific state values for + * every stratification. For example, (known, sample1, ac=1). This capability is used in some places, + * such as below, to return a set of all states that should be updated given the eval and comp + * VCs. In principle there are a finite set of such combinations (the product of all states for all active + * stratifications at initialization). We could represent such keys as integers into the set of all combinations. + * + * Note that all of the code that manipulates these things is just terrible. It's all string manipulation and + * HashMaps. Since we are effectively always squaring off our VE analyses (i.e., we have a table with + * all variable values for all stratification combinations) it doesn't make sense to allow so much dynamicism. Instead + * we should just upfront create a giant table indexed by integer keys, and manage data via a simple map from + * specific strat state to this key. + * + * The reason this is so important is that >80% of the runtime of VE with VCFs with >1000 samples is spent in + * the initializeStateKey function. Instead, we should have code that looks like: + * + * init: + * allStates <- initializeCombinationalStateSpace + * + * map: + * for each eval / comp pair: + * for each relevantState based on eval / comp: + * allStates[relevantState].update(eval, comp) + * + * */ public final class StateKey { /** High-performance cache of the toString operation for a constant class */