Fixing VariantEval logic and having it use the new rod system.

2011-08-11 13:39:34 -04:00 · 2011-08-11 13:39:34 -04:00 · 265c3d744b
parent f1b09db39e
commit 265c3d744b
3 changed files with 148 additions and 127 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@ -60,14 +60,16 @@ import java.util.*;
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
 public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGenotyper.UGStatistics> implements TreeReducible<UnifiedGenotyper.UGStatistics>, AnnotatorCompatibleWalker {

-    @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
+    @ArgumentCollection
+    private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();

    /**
      * A dbSNP VCF file from which to annotate.
      *
      * rsIDs from this file are used to populate the ID column of the output.  Also, the DB INFO flag will be set when appropriate.
      */
-    @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
+    @ArgumentCollection
+    protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
    public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
    public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return null; }
    public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@ -3,8 +3,8 @@ package org.broadinstitute.sting.gatk.walkers.varianteval;
 import net.sf.picard.reference.IndexedFastaSequenceFile;
 import net.sf.samtools.SAMSequenceRecord;
 import org.apache.log4j.Logger;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
@ -27,7 +27,6 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.StingException;
-import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;

@ -46,6 +45,15 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
    @Output
    protected PrintStream out;

+    @Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true)
+    public List<RodBinding<VariantContext>> evals;
+
+    @Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false)
+    public List<RodBinding<VariantContext>> comps = Collections.emptyList();
+
+    @ArgumentCollection
+    protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
+
    // Help arguments
    @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit")
    protected Boolean LIST = false;
@ -61,7 +69,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
    protected Set<String> SAMPLE_EXPRESSIONS;

    @Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
-    protected String[] KNOWN_NAMES = {DbSNPHelper.STANDARD_DBSNP_TRACK_NAME};
+    protected String[] KNOWN_NAMES = {dbsnp.dbsnp.getName()};

    // Stratification arguments
    @Argument(fullName="stratificationModule", shortName="ST", doc="One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required=false)
@ -115,6 +123,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
    // The set of all possible evaluation contexts
    private HashMap<StateKey, NewEvaluationContext> evaluationContexts = null;

+    // important stratifications
+    private boolean byFilterIsEnabled = false;
+    private boolean perSampleIsEnabled = false;
+
    // Output report
    private GATKReport report = null;

@ -134,27 +146,21 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
        // Just list the modules, and exit quickly.
        if (LIST) { variantEvalUtils.listModulesAndExit(); }

-        // Categorize each rod as an eval or a comp rod.
-        for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
-            if ( d.getName().startsWith("eval") ) {
-                evalNames.add(d.getName());
-            } else if ( d.getName().startsWith("comp") || d.getName().startsWith(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
-                compNames.add(d.getName());
-            } else {
-                logger.info(String.format("Not evaluating ROD binding '%s' because the name did not start with %s, comp, or eval", d.getName(), Utils.join(", ", KNOWN_NAMES)));
-            }
-        }
-
-        // Barf if we don't have any eval tracks.
-        if (evalNames.size() == 0) {
-            throw new UserException("No evaluation tracks were specified.  Please bind one or more callsets to evaluate using the -B argument with a trackname that starts with the word 'eval'.");
-        }
-
        // Add a dummy comp track if none exists
-        if (compNames.size() == 0) {
-            compNames.add("none");
+        if ( comps.size() == 0 ) {
+            comps.add(new RodBinding(VariantContext.class, "none", "UNBOUND", "", new Tags()));
        }

+        // Cache the rod names
+        for ( RodBinding<VariantContext> compRod : comps )
+            compNames.add(compRod.getName());
+
+        for ( RodBinding<VariantContext> evalRod : evals )
+            evalNames.add(evalRod.getName());
+
+        if ( dbsnp.dbsnp.isBound() )
+            compNames.add(dbsnp.dbsnp.getName());
+
        // Set up set of known names
        knownNames.addAll(Arrays.asList(KNOWN_NAMES));

@ -190,6 +196,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
        // Initialize the set of stratifications and evaluations to use
        stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
        Set<Class<? extends VariantEvaluator>> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
+        for ( VariantStratifier vs : getStratificationObjects() ) {
+            if ( vs.getClass().getSimpleName().equals("Filter") )
+                byFilterIsEnabled = true;
+            else if ( vs.getClass().getSimpleName().equals("Sample") )
+                perSampleIsEnabled = true;
+        }

        // Initialize the evaluation contexts
        evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null);
@ -221,15 +233,68 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
        if (tracker != null) {
            String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases());

-            //      track           sample  vc
-            HashMap<String, HashMap<String, VariantContext>> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames, typesToUse != null);
+            //      --------- track ---------           sample  - VariantContexts -
+            HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled);
+            HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false);

-            for ( String compName : compNames ) {
-                VariantContext comp = vcs.containsKey(compName) && vcs.get(compName) != null && vcs.get(compName).containsKey(ALL_SAMPLE_NAME) ? vcs.get(compName).get(ALL_SAMPLE_NAME) : null;
+            // for each eval track
+            for ( final RodBinding<VariantContext> evalRod : evals ) {
+                final HashMap<String, Set<VariantContext>> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : new HashMap<String, Set<VariantContext>>(0);

-                for ( String evalName : evalNames ) {
-                    for ( String sampleName : sampleNamesForStratification ) {
-                        VariantContext eval = vcs.containsKey(evalName) && vcs.get(evalName) != null ? vcs.get(evalName).get(sampleName) : null;
+                // for each sample stratifier
+                for ( final String sampleName : sampleNamesForStratification ) {
+                    Set<VariantContext> evalSetBySample = evalSet.get(sampleName);
+                    if ( evalSetBySample == null ) {
+                        evalSetBySample = new HashSet<VariantContext>(1);
+                        evalSetBySample.add(null);
+                    }
+
+                    // for each eval in the track
+                    for ( VariantContext eval : evalSetBySample ) {
+                        // deal with ancestral alleles if requested
+                        if ( eval != null && aastr != null ) {
+                            HashMap<String, Object> newAts = new HashMap<String, Object>(eval.getAttributes());
+                            newAts.put("ANCESTRALALLELE", aastr);
+                            eval = VariantContext.modifyAttributes(eval, newAts);
+                        }
+
+                        // for each comp track
+                        for ( final RodBinding<VariantContext> compRod : comps ) {
+                            // no sample stratification for comps
+                            final Set<VariantContext> compSet = compVCs.get(compRod) == null ? new HashSet<VariantContext>(0) : compVCs.get(compRod).values().iterator().next();
+
+                            // find the comp
+                            final VariantContext comp = findMatchingComp(eval, compSet);
+
+                            HashMap<VariantStratifier, ArrayList<String>> stateMap = new HashMap<VariantStratifier, ArrayList<String>>();
+                            for ( VariantStratifier vs : stratificationObjects ) {
+                                ArrayList<String> states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
+                                stateMap.put(vs, states);
+                            }
+
+                            ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
+                            variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
+
+                            HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);
+
+                            for ( StateKey stateKey : stateKeysHash ) {
+                                NewEvaluationContext nec = evaluationContexts.get(stateKey);
+
+                                // eval against the comp
+                                synchronized (nec) {
+                                    nec.apply(tracker, ref, context, comp, eval);
+                                }
+
+                                // eval=null against all comps of different type
+                                for ( VariantContext otherComp : compSet ) {
+                                    if ( otherComp != comp ) {
+                                        synchronized (nec) {
+                                            nec.apply(tracker, ref, context, otherComp, null);
+                                        }
+                                    }
+                                }
+                            }
+                        }

                        // todo: Eric, this is really the problem.  We select single eval and comp VCs independently
                        // todo: discarding multiple eval tracks at the sites and not providing matched comps
@ -247,37 +312,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
                        // todo: like subset to sample, etc.  So you probably will want a master map that maps
                        // todo: from special eval bindings to the digested VC for efficiency.

-                        if ( typesToUse != null ) {
-                            if ( eval != null && ! typesToUse.contains(eval.getType()) ) eval = null;
-                            if ( comp != null && ! typesToUse.contains(comp.getType()) ) comp = null;
-//                            if ( eval != null ) logger.info("Keeping " + eval);
-                        }
-
-                        if (eval != null && aastr != null) {
-                            HashMap<String, Object> newAts = new HashMap<String, Object>(eval.getAttributes());
-                            newAts.put("ANCESTRALALLELE", aastr);
-
-                            eval = VariantContext.modifyAttributes(eval, newAts);
-                        }
-
-                        HashMap<VariantStratifier, ArrayList<String>> stateMap = new HashMap<VariantStratifier, ArrayList<String>>();
-                        for ( VariantStratifier vs : stratificationObjects ) {
-                            ArrayList<String> states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);
-                            stateMap.put(vs, states);
-                        }
-
-                        ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
-                        variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
-
-                        HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);
-
-                        for ( StateKey stateKey : stateKeysHash ) {
-                            NewEvaluationContext nec = evaluationContexts.get(stateKey);
-
-                            synchronized (nec) {
-                                nec.apply(tracker, ref, context, comp, eval);
-                            }
-                        }
                    }
                }
            }
@ -286,6 +320,25 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
        return null;
    }

+    private VariantContext findMatchingComp(final VariantContext eval, final Set<VariantContext> comps) {
+        // if no comps, return null
+        if ( comps == null || comps.isEmpty() )
+            return null;
+
+        // if no eval, return any comp
+        if ( eval == null )
+            return comps.iterator().next();
+
+        // find a matching comp
+        for ( VariantContext comp : comps ) {
+            if ( comp.getType() == eval.getType() )
+                return comp;
+        }
+
+        // if no matching comp, return null
+        return null;
+    }
+
    public Integer treeReduce(Integer lhs, Integer rhs) { return null; }

    @Override
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java
@ -1,6 +1,7 @@
 package org.broadinstitute.sting.gatk.walkers.varianteval.util;

 import org.apache.log4j.Logger;
+import org.broadinstitute.sting.commandline.RodBinding;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.report.GATKReport;
@ -56,8 +57,9 @@ public class VariantEvalUtils {
    /**
     * Initialize required, standard and user-specified stratification objects
     *
-     * @param noStandardStrats don't use the standard stratifications
-     * @param modulesToUse     the list of stratification modules to use
+     * @param variantEvalWalker  the parent walker
+     * @param noStandardStrats   don't use the standard stratifications
+     * @param modulesToUse       the list of stratification modules to use
     * @return set of stratifications to use
     */
    public TreeSet<VariantStratifier> initializeStratificationObjects(VariantEvalWalker variantEvalWalker, boolean noStandardStrats, String[] modulesToUse) {
@ -256,23 +258,6 @@ public class VariantEvalUtils {
        return report;
    }

-    /**
-     * Figure out what the allowable variation types are based on the eval context
-     *
-     * @param tracker   the reference metadata tracker
-     * @param ref       the reference context
-     * @param compNames the comp track names
-     * @param evalNames the evaluation track names
-     * @return the set of allowable variation types
-     */
-    public EnumSet<VariantContext.Type> getAllowableVariationTypes(RefMetaDataTracker tracker,
-                                                                   ReferenceContext ref,
-                                                                   Set<String> compNames,
-                                                                   Set<String> evalNames,
-                                                                   boolean dynamicSelectTypes ) {
-        return EnumSet.allOf(VariantContext.Type.class);
-    }
-
    /**
     * Subset a VariantContext to a single sample
     *
@ -321,78 +306,59 @@ public class VariantEvalUtils {
     *
     * @param tracker        the metadata tracker
     * @param ref            the reference context
-     * @param trackNames     the list of track names to process
-     * @param allowableTypes a set of allowable variation types
+     * @param tracks         the list of tracks to process
     * @param byFilter       if false, only accept PASSing VariantContexts.  Otherwise, accept both PASSing and filtered
     *                       sites
     * @param subsetBySample if false, do not separate the track into per-sample VCs
     * @param trackPerSample if false, don't stratify per sample (and don't cut up the VariantContext like we would need
     *                       to do this)
-     * @return a mapping of track names to a list of VariantContext objects
+     *
+     * @return the mapping of track to VC list that should be populated
     */
-    protected void bindVariantContexts(HashMap<String, HashMap<String, VariantContext>> bindings, RefMetaDataTracker tracker, ReferenceContext ref, Set<String> trackNames, EnumSet<VariantContext.Type> allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample) {
-        for (String trackName : trackNames) {
-            HashMap<String, VariantContext> vcs = new HashMap<String, VariantContext>();
+    public HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List<RodBinding<VariantContext>> tracks, boolean byFilter, boolean subsetBySample, boolean trackPerSample) {
+        if ( tracker == null )
+            return null;

-            VariantContext vc = tracker == null ? null : tracker.getFirstValue(VariantContext.class, trackName, ref.getLocus());
+        HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> bindings = new HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>>();

-            // First, filter the VariantContext to represent only the samples for evaluation
-            if (vc != null) {
+        for ( RodBinding<VariantContext> track : tracks ) {
+            HashMap<String, Set<VariantContext>> mapping = new HashMap<String, Set<VariantContext>>();
+
+            for ( VariantContext vc : tracker.getValues(track, ref.getLocus()) ) {
+
+                // First, filter the VariantContext to represent only the samples for evaluation
                VariantContext vcsub = vc;

-                if (subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation())) {
+                if ( subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation()) ) {
                    vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
                }

-                if ((byFilter || !vcsub.isFiltered())) {
-                    vcs.put(VariantEvalWalker.getAllSampleName(), vcsub);
+                if ( (byFilter || !vcsub.isFiltered()) ) {
+                    addMapping(mapping, VariantEvalWalker.getAllSampleName(), vcsub);
                }

                // Now, if stratifying, split the subsetted vc per sample and add each as a new context
-                if (vc.hasGenotypes() && trackPerSample) {
-                    for (String sampleName : variantEvalWalker.getSampleNamesForEvaluation()) {
+                if ( vc.hasGenotypes() && trackPerSample ) {
+                    for ( String sampleName : variantEvalWalker.getSampleNamesForEvaluation() ) {
                        VariantContext samplevc = getSubsetOfVariantContext(vc, sampleName);

-                        if ((byFilter || !samplevc.isFiltered())) {
-                            vcs.put(sampleName, samplevc);
+                        if ( byFilter || !samplevc.isFiltered() ) {
+                            addMapping(mapping, sampleName, samplevc);
                        }
                    }
                }

-                bindings.put(trackName, vcs);
+                bindings.put(track, mapping);
            }
        }
+
+        return bindings;
    }

-    /**
-     * Maps track names to sample name to VariantContext objects.  For eval tracks, VariantContexts per specified sample
-     * are also included.
-     *
-     * @param tracker   the metadata tracker
-     * @param ref       the reference context
-     * @param compNames the list of comp names to process
-     * @param evalNames the list of eval names to process
-     * @return a mapping of track names to a list of VariantContext objects
-     */
-    public HashMap<String, HashMap<String, VariantContext>> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames, boolean dynamicSelectTypes) {
-        HashMap<String, HashMap<String, VariantContext>> vcs = new HashMap<String, HashMap<String, VariantContext>>();
-
-        EnumSet<VariantContext.Type> allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames, dynamicSelectTypes);
-
-        boolean byFilter = false;
-        boolean perSampleIsEnabled = false;
-        for (VariantStratifier vs : variantEvalWalker.getStratificationObjects()) {
-            if (vs.getClass().getSimpleName().equals("Filter")) {
-                byFilter = true;
-            } else if (vs.getClass().getSimpleName().equals("Sample")) {
-                perSampleIsEnabled = true;
-            }
-        }
-
-        bindVariantContexts(vcs, tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled);
-        bindVariantContexts(vcs, tracker, ref, compNames, allowableTypes, byFilter, false, false);
-
-        return vcs;
+    private void addMapping(HashMap<String, Set<VariantContext>> mappings, String sample, VariantContext vc) {
+        if ( !mappings.containsKey(sample) )
+            mappings.put(sample, new HashSet<VariantContext>());
+        mappings.get(sample).add(vc);
    }

    /**