managed to actually move the files too! Damn you svn

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2785 348d0f76-0448-11de-a6fe-93d51630548a
2010-02-05 13:47:19 +00:00 · 2010-02-05 13:47:19 +00:00 · 777617b6c7
parent 8938a4146d
commit 777617b6c7
6 changed files with 944 additions and 0 deletions
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/CountVariants.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/CountVariants.java
@ -0,0 +1,108 @@
+package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
+
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
+import org.broadinstitute.sting.utils.StingException;
+
+import java.util.List;
+import java.util.Arrays;
+
+public class CountVariants extends VariantEvaluator {
+    long nProcessedLoci = 0;
+    long nCalledLoci = 0;
+    long nVariantLoci = 0;
+    long nRefLoci = 0;
+
+    long nSNPs = 0;
+    long nInsertions = 0;
+    long nDeletions = 0;
+    long nComplex = 0;
+
+    long nNoCalls = 0;
+    long nHets = 0;
+    long nHomRef = 0;
+    long nHomVar = 0;
+
+    public CountVariants(VariantEval2Walker parent) {
+        // don't do anything
+    }
+
+    private double perLocusRate(long n)      { return rate(n, nProcessedLoci); }
+    private long perLocusRInverseRate(long n) { return inverseRate(n, nProcessedLoci); }
+
+    public String getName() {
+        return "counter";
+    }
+
+    public int getComparisonOrder() {
+        return 1;   // we only need to see each eval track
+    }
+
+    public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { 
+        nProcessedLoci += context.getSkippedBases() + 1;
+    }
+
+    public void update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        //nProcessedLoci++;
+        nCalledLoci++;
+
+        if ( vc1.isVariant() ) nVariantLoci++;
+        switch ( vc1.getType() ) {
+            case NO_VARIATION: nRefLoci++; break;
+            case SNP: nSNPs++; break;
+            case INDEL:
+                if ( vc1.isInsertion() ) nInsertions++; else nDeletions++;
+                break;
+            case MIXED: nComplex++; break;
+        }
+
+        for ( Genotype g : vc1.getGenotypes().values()  ) {
+            switch ( g.getType() ) {
+                case NO_CALL: nNoCalls++; break;
+                case HOM_REF: nHomRef++; break;
+                case HET:     nHets++; break;
+                case HOM_VAR: nHomVar++; break;
+                default: throw new StingException("BUG: Unexpected genotype type: " + g);
+            }
+        }
+    }
+
+    public String toString() {
+        return getName() + ": " + summaryLine();
+    }
+
+    private String summaryLine() {
+        return String.format("%d %d %d %d " +
+                "%.2e %d " +
+                "%d %d %d %d " +
+                "%d %d %d " +
+                "%.2e %d %.2f " +
+                "%.2f %d %.2f",
+                nProcessedLoci, nCalledLoci, nRefLoci, nVariantLoci,
+                perLocusRate(nVariantLoci), perLocusRInverseRate(nVariantLoci),
+                nSNPs, nDeletions, nInsertions, nComplex,
+                nHomRef, nHets, nHomVar,
+                perLocusRate(nHets), perLocusRInverseRate(nHets), ratio(nHets, nHomVar),
+                perLocusRate(nDeletions + nInsertions), perLocusRInverseRate(nDeletions + nInsertions), ratio(nDeletions, nInsertions));
+    }
+
+    private static List<String> HEADER =
+            Arrays.asList("nProcessedLoci", "nCalledLoci", "nRefLoci", "nVariantLoci",
+                    "variantRate", "variantRatePerBp",
+                    "nSNPs", "nDeletions", "nInsertions", "nComplex",
+                    "nHomRefGenotypes", "nHetGenotypes", "nHomVarGenotypes",
+                    "heterozygosity", "heterozygosityPerBp", "hetHomRatio",
+                    "indelRate", "indelRatePerBp", "deletionInsertionRatio");
+
+    // making it a table
+    public List<String> getTableHeader() {
+        return HEADER;
+    }
+
+    public List<List<String>> getTableRows() {
+        return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
+    }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/DbSNPPercentage.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/DbSNPPercentage.java
@ -0,0 +1,110 @@
+package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.*;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.StingException;
+import org.broadinstitute.sting.utils.genotype.Variation;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Arrays;
+
+/**
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2009 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ * <p/>
+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither
+ * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
+ */
+public class DbSNPPercentage extends VariantEvaluator {
+    private long nDBSNPs       = 0;
+    private long nEvalSNPs     = 0;
+    private long nSNPsAtdbSNPs = 0;
+    private long nConcordant   = 0;
+
+    public DbSNPPercentage(VariantEval2Walker parent) {
+        // don't do anything
+    }
+
+    public String getName() {
+        return "dbsnp_percentage";
+    }
+
+    public int getComparisonOrder() {
+        return 2;   // we need to see each eval track and each comp track
+    }
+
+    public long nDBSNPs()        { return nDBSNPs; }
+    public long nEvalSNPs()      { return nEvalSNPs; }
+    public long nSNPsAtdbSNPs()  { return nSNPsAtdbSNPs; }
+    public long nConcordant()    { return nConcordant; }
+    public long nNovelSites()    { return Math.abs(nEvalSNPs() - nSNPsAtdbSNPs()); }
+
+    public String toString() {
+        return getName() + ": " + summaryLine();
+    }
+
+    private String summaryLine() {
+        return String.format("%d %d %d %d %d %.2f %.2f",
+                nDBSNPs(), nEvalSNPs(), nSNPsAtdbSNPs(), nConcordant(), nNovelSites(), 100 * dbSNPRate(), 100 * concordanceRate());
+    }
+
+    private static List<String> HEADER =
+            Arrays.asList("n_dbsnps", "n_eval_snps", "n_overlapping_snps", "n_concordant", "n_novel_snps", "dbsnp_rate", "concordance_rate");
+
+    // making it a table
+    public List<String> getTableHeader() {
+        return HEADER;
+    }
+
+    public List<List<String>> getTableRows() {
+        return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
+    }
+
+    /**
+     * Returns true if every allele in eval is also in dbsnp
+     *
+     * @param eval
+     * @param dbsnp
+     * @return
+     */
+    public boolean discordantP(VariantContext eval, VariantContext dbsnp ) {
+        for ( Allele a : eval.getAlleles() ) {
+            if ( ! dbsnp.hasAllele(a, true) )
+                return true;
+        }
+
+        return false;
+    }
+
+
+    /**
+     * What fraction of the evaluated site variants were also found in the db?
+     *
+     * @return
+     */
+    public double dbSNPRate()           { return rate(nSNPsAtdbSNPs(), nEvalSNPs()); }
+    public double concordanceRate()     { return rate(nConcordant(), nSNPsAtdbSNPs()); }
+
+    public void update2(VariantContext eval, VariantContext dbsnp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        boolean dbSNPIsGood = dbsnp != null && dbsnp.isSNP() && dbsnp.isNotFiltered();
+        boolean evalIsGood = eval != null && eval.isSNP();
+
+        if ( dbSNPIsGood ) nDBSNPs++;             // count the number of dbSNP events
+        if ( evalIsGood )  nEvalSNPs++;           // count the number of dbSNP events
+
+        if ( dbSNPIsGood && evalIsGood )  {
+            nSNPsAtdbSNPs++;
+
+            if ( ! discordantP(eval, dbsnp) )     // count whether we're concordant or not with the dbSNP value
+                nConcordant++;
+        }
+    }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java
@ -0,0 +1,191 @@
+package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
+
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele;
+import org.broadinstitute.sting.utils.StingException;
+
+import java.util.List;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+/**
+ * Mendelian violation detection and counting
+ *
+ * a violation looks like:
+ * Suppose dad = A/B and mom = C/D
+ * The child can be [A or B] / [C or D].
+ * If the child doesn't match this, the site is a violation
+ *
+ * Some examples:
+ *
+ * mom = A/A, dad = C/C
+ * child can be A/C only
+ *
+ * mom = A/C, dad = C/C
+ * child can be A/C or C/C
+ *
+ * mom = A/C, dad = A/C
+ * child can be A/A, A/C, C/C
+ *
+ * The easiest way to do this calculation is to:
+ *
+ * Get alleles for mom => A/B
+ * Get alleles for dad => C/D
+ * Make allowed genotypes for child: A/C, A/D, B/C, B/D
+ * Check that the child is one of these.
+ */
+public class MendelianViolationEvaluator extends VariantEvaluator {
+    long nVariants, nViolations, nOverCalls, nUnderCalls;
+    String mom, dad, child;
+    VariantEval2Walker parent;
+
+    private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
+
+    public MendelianViolationEvaluator(VariantEval2Walker parent) {
+        this.parent = parent;
+
+        if ( enabled() ) {
+            Matcher m = FAMILY_PATTERN.matcher(parent.FAMILY_STRUCTURE);
+            if ( m.matches() ) {
+                //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE);
+                mom = m.group(1);
+                dad = m.group(2);
+                child = m.group(3);
+                parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s", parent.FAMILY_STRUCTURE, mom, dad, child));
+
+            } else {
+                throw new IllegalArgumentException("Malformatted family structure string: " + parent.FAMILY_STRUCTURE + " required format is mom+dad=child");
+            }
+        }
+    }
+
+    private boolean enabled() {
+        return parent.FAMILY_STRUCTURE != null;
+    }
+
+    private double getQThreshold() {
+        return parent.MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10;  // we aren't 10x scaled in the GATK a la phred
+    }
+
+    public String getName() {
+        return "mendelian_violations";
+    }
+
+    public int getComparisonOrder() {
+        return 1;   // we only need to see each eval track
+    }
+
+    public void update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        if ( vc.isBiallelic() && vc.hasGenotypes() ) { // todo -- currently limited to biallelic loci
+            nVariants++;
+
+            Genotype momG   = vc.getGenotype(mom);
+            Genotype dadG   = vc.getGenotype(dad);
+            Genotype childG = vc.getGenotype(child);
+
+            if ( momG.getNegLog10PError() > getQThreshold() && dadG.getNegLog10PError() > getQThreshold() && childG.getNegLog10PError() > getQThreshold() ) {
+                // all genotypes are good, so let's see if child is a violation
+
+                if ( isViolation(vc, momG, dadG, childG) ) {
+                    nViolations++;
+
+                    String label = null;
+                    switch ( getViolationType( vc, momG, dadG, childG ) ) {
+                        case UNDER_CALL:
+                            nUnderCalls++;
+                            label = "under_called";
+                            break;
+                        case OVER_CALL:
+                            nOverCalls++;
+                            label = "over_called";
+                            break;
+                        default:
+                            throw new StingException("BUG: unexpected violation type at " + vc);
+
+                    }
+
+                    String why = String.format("Mendelian violation %s: at %s m=%s d=%s c=%s", label, vc.getLocation(), momG.toBriefString(), dadG.toBriefString(), childG.toBriefString());
+                    addInterestingSite(why , vc);
+                }
+            }
+        }
+    }
+
+    /**
+     * Are we under or over calling?
+     *
+     * Assuming this is a bialleic locus, we then have 2 alleles A and B.  There are really two types of violations:
+     *
+     * Undercall: where the child is A/A but parent genotypes imply that child must carry at least one B allele
+     * Overall:   where the child carries a B allele but this B allele couldn't have been inherited from either parent
+     *
+     * The way to determine this is to look at mom and dad separately. If the child doesn't carry at least one
+     * allele from each parent, it's an under calls.  Otherwise it's an overcall.
+     */
+    public ViolationType getViolationType(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) {
+        switch ( childG.getType() ) {
+            case HOM_REF:
+                return ViolationType.UNDER_CALL;  // if you have to undercalled as a hom ref child
+            case HET:
+                // the only two violations of a het is where both parents are hom.  If they are hom ref, you overcalled,
+                // otherwise you undercalled
+                return momG.isHomRef() ? ViolationType.OVER_CALL : ViolationType.UNDER_CALL;
+            case HOM_VAR:
+                return ViolationType.OVER_CALL;  // if you have to overcalled as a hom var child
+            default:
+                throw new StingException("BUG: unexpected child genotype class " + childG);
+        }
+    }
+
+    private enum ViolationType {
+        UNDER_CALL, OVER_CALL
+    }
+
+    public boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) {
+        VariantContext momVC = vc.subContextFromGenotypes(momG);
+        VariantContext dadVC = vc.subContextFromGenotypes(dadG);
+        int i = 0;
+        for ( Allele momAllele : momVC.getAlleles() ) {
+            for ( Allele dadAllele : dadVC.getAlleles() ) {
+                Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele));
+                if ( childG.sameGenotype(possibleChild, false) ) {
+                    return false;
+                }
+            }
+        }
+
+        return true;
+    }
+
+    public String toString() {
+        return getName() + ": " + summaryLine();
+    }
+
+    private String summaryLine() {
+        return String.format("%d %d %.2e %d %.2e %.2f %d %.2e %.2f",
+                nVariants, nViolations, rate(nViolations, nVariants),
+                nOverCalls, rate(nOverCalls, nVariants), ratio(nOverCalls, nViolations),
+                nUnderCalls, rate(nUnderCalls, nVariants), ratio(nUnderCalls, nViolations));
+    }
+
+    private static List<String> HEADER =
+            Arrays.asList("nVariants",
+                    "nViolations", "violationRate",
+                    "nOverCalls", "overCallRate", "overCallFraction",
+                    "nUnderCalls", "underCallRate", "underCallFraction");
+
+    // making it a table
+    public List<String> getTableHeader() {
+        return HEADER;
+    }
+
+    public List<List<String>> getTableRows() {
+        return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
+    }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/TiTvVariantEvaluator.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/TiTvVariantEvaluator.java
@ -0,0 +1,65 @@
+package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
+
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype;
+import org.broadinstitute.sting.utils.StingException;
+
+import java.util.List;
+import java.util.Arrays;
+
+public class TiTvVariantEvaluator extends VariantEvaluator {
+    long nTi = 0, nTv = 0;
+    long nTiInStd = 0, nTvInStd = 0;
+
+    public TiTvVariantEvaluator(VariantEval2Walker parent) {
+        // don't do anything
+    }
+
+    public String getName() {
+        return "titv";
+    }
+
+    public int getComparisonOrder() {
+        return 2;   // we only need to see each eval track
+    }
+
+    public void updateTiTv(VariantContext vc, boolean updateStandard) {
+        if ( vc != null && vc.isSNP() && vc.isBiallelic() ) {
+            if ( vc.isTransition() ) {
+                if ( updateStandard ) nTiInStd++; else nTi++;
+            } else {
+                if ( updateStandard ) nTvInStd++; else nTv++;
+            }
+        }
+    }
+
+    public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        if ( vc1 != null ) updateTiTv(vc1, false);
+        if ( vc2 != null ) updateTiTv(vc2, true);
+    }
+
+    public String toString() {
+        return getName() + ": " + summaryLine();
+    }
+
+    private String summaryLine() {
+        return String.format("%d %d %.2f %d %d %.2f",
+                nTi, nTv, ratio(nTi, nTv),
+                nTiInStd, nTvInStd, ratio(nTiInStd, nTvInStd));
+    }
+
+    private static List<String> HEADER =
+            Arrays.asList("nTi", "nTv", "TiTvRatio", "nTiStandard", "nTvStandard", "TiTvRatioStandard");
+
+    // making it a table
+    public List<String> getTableHeader() {
+        return HEADER;
+    }
+
+    public List<List<String>> getTableRows() {
+        return Arrays.asList(Arrays.asList(summaryLine().split(" ")));
+    }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java
@ -0,0 +1,398 @@
+package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.*;
+import org.broadinstitute.sting.gatk.walkers.RodWalker;
+import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.cmdLine.Argument;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContextAdaptors;
+import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContextUtils;
+import org.apache.log4j.Logger;
+
+import java.util.*;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+
+
+//
+// todo -- write a simple column table system and have the evaluators return this instead of the list<list<string>> objects
+// todo -- site frequency spectrum eval (freq. of variants in eval as a function of their AC and AN numbers)
+// todo -- multiple sample concordance tool (genotypes in eval vs. genotypes in truth)
+// todo -- allele freqeuncy discovery tool (FREQ in true vs. discovery counts in eval).  Needs to process subset of samples in true (pools)
+// todo -- clustered SNP counter
+// todo -- HWEs
+// todo -- Validation data analysis from VE1?  What is it and should we transition it over?
+// todo -- indel metrics [count of sizes in/del should be in CountVariants]
+// todo -- create JEXL context implementing object that simply looks up values for JEXL evaluations.  Throws error for unknown fields
+//
+
+//
+// Todo -- should really include argument parsing @annotations from subclass in this walker.  Very
+// todo -- useful general capability.  Right now you need to add arguments to VariantEval2 to handle new
+// todo -- evaluation arguments (which is better than passing a string!)
+//
+
+//
+// todo -- the whole organization only supports a single eval x comp evaluation.  We need to instantiate
+// todo -- new contexts for each comparison object too!
+//
+
+/**
+ * Test routine for new VariantContext object
+ */
+public class VariantEval2Walker extends RodWalker<Integer, Integer> {
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // walker arguments
+    //
+    // --------------------------------------------------------------------------------------------------------------
+
+    // todo -- add doc string
+    @Argument(shortName="select", doc="", required=false)
+    protected String[] SELECT_EXPS = {"QUAL > 500.0", "HARD_TO_VALIDATE==1", "GATK_STANDARD==1"};
+
+    // todo -- add doc string
+    @Argument(shortName="selectName", doc="", required=false)
+    protected String[] SELECT_NAMES = {"q500plus", "low_mapq", "gatk_std_filters"};
+
+    @Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
+    protected String[] KNOWN_NAMES = {"dbsnp"};
+
+    //
+    // Arguments for Mendelian Violation calculations
+    //
+    @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
+    protected String FAMILY_STRUCTURE;
+
+    @Argument(shortName="MVQ", fullName="MendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
+    protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50;
+
+    @Argument(shortName="PI", fullName="PrintInterestingSites", doc="If provided, interesting sites in the unselected, called set will be printed", required=false)
+    protected boolean PRINT_INTERESTING_SITES = false;
+
+//    @Argument(shortName="PIA", fullName="PrintAllInterestingSites", doc="If provided, interesting sites will be printed for all sets.  Verbose", required=false)
+//    protected boolean PRINT_ALL_INTERESTING_SITES = false;
+    
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // private walker data
+    //
+    // --------------------------------------------------------------------------------------------------------------
+
+    /** private class holding all of the information about a single evaluation group (e.g., for eval ROD) */
+    private class EvaluationContext extends HashMap<String, Set<VariantEvaluator>> {
+        // useful for typing
+        public String trackName, contextName;
+        VariantContextUtils.MatchExp selectExp;
+
+        public EvaluationContext(String trackName, String contextName, VariantContextUtils.MatchExp selectExp) {
+            this.trackName = trackName;
+            this.contextName = contextName;
+            this.selectExp = selectExp;
+        }
+    }
+
+    private HashMap<String, EvaluationContext> contexts = new HashMap<String, EvaluationContext>();
+    private Set<String> compNames = new HashSet<String>();
+
+    private static String RAW_SET_NAME      = "raw";
+    private static String RETAINED_SET_NAME = "called";
+    private static String FILTERED_SET_NAME = "filtered";
+    private static String ALL_SET_NAME      = "all";
+    private static String KNOWN_SET_NAME    = "known";
+    private static String NOVEL_SET_NAME    = "novel";
+
+    // Dynamically determined variantEvaluation classes
+    private List<String> variantEvaluationNames = new ArrayList<String>();
+    private List<Class<? extends VariantEvaluator>> evaluationClasses = null;
+
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // initialize
+    //
+    // --------------------------------------------------------------------------------------------------------------
+
+    public void initialize() {
+        determineAllEvalations();
+        List<VariantContextUtils.MatchExp> selectExps = VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS);
+
+        for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
+            if ( d.getName().startsWith("eval") ) {
+                for ( VariantContextUtils.MatchExp e : selectExps ) {
+                    addNewContext(d.getName(), d.getName() + "." + e.name, e);
+                }
+                addNewContext(d.getName(), d.getName() + ".all", null);
+            } else if ( d.getName().startsWith("dbsnp") || d.getName().startsWith("hapmap") || d.getName().startsWith("comp") ) {
+                compNames.add(d.getName());
+            } else {
+                logger.info("Not evaluating ROD binding " + d.getName());
+            }
+        }
+
+        determineContextNamePartSizes();
+    }
+
+    private void determineAllEvalations() {
+        evaluationClasses = PackageUtils.getClassesImplementingInterface(VariantEvaluator.class);
+        for ( VariantEvaluator e : instantiateEvalationsSet(false, null) ) {
+            // for collecting purposes
+            variantEvaluationNames.add(e.getName());
+            logger.debug("Including VariantEvaluator " + e.getName() + " of class " + e.getClass());
+        }
+
+        Collections.sort(variantEvaluationNames);
+    }
+
+    private Set<VariantEvaluator> instantiateEvalationsSet(boolean baseline, String name) {
+        Set<VariantEvaluator> evals = new HashSet<VariantEvaluator>();
+        Object[] args = new Object[]{this};
+        Class[] argTypes = new Class[]{this.getClass()};
+
+        for ( Class c : evaluationClasses ) {
+            try {
+                Constructor constructor = c.getConstructor(argTypes);
+                VariantEvaluator eval = (VariantEvaluator)constructor.newInstance(args);
+                if ( baseline ) eval.printInterestingSites(name);
+                evals.add(eval);
+            } catch (InstantiationException e) {
+                throw new StingException(String.format("Cannot instantiate class '%s': must be concrete class", c.getSimpleName()));
+            } catch (NoSuchMethodException e) {
+                throw new StingException(String.format("Cannot find expected constructor for class '%s': must have constructor accepting a single VariantEval2Walker object", c.getSimpleName()));
+            } catch (IllegalAccessException e) {
+                throw new StingException(String.format("Cannot instantiate class '%s':", c.getSimpleName()));
+            } catch (InvocationTargetException e) {
+                throw new StingException(String.format("Cannot instantiate class '%s':", c.getSimpleName()));
+            }
+        }
+
+        return evals;
+    }
+
+    private void addNewContext(String trackName, String contextName, VariantContextUtils.MatchExp selectExp) {
+        EvaluationContext group = new EvaluationContext(trackName, contextName, selectExp);
+
+        for ( String filteredName : Arrays.asList(RAW_SET_NAME, RETAINED_SET_NAME, FILTERED_SET_NAME) ) {
+            for ( String subname : Arrays.asList(ALL_SET_NAME, KNOWN_SET_NAME, NOVEL_SET_NAME) ) {
+                String name = subname + "." + filteredName;
+                group.put(name, instantiateEvalationsSet(subname == ALL_SET_NAME && filteredName == RETAINED_SET_NAME, trackName + "." + (selectExp == null ? "all" : selectExp.name) + "." + name));
+            }
+        }
+
+        contexts.put(contextName, group);
+    }
+
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // map
+    //
+    // --------------------------------------------------------------------------------------------------------------
+
+    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+        //System.out.printf("map at %s with %d skipped%n", context.getLocation(), context.getSkippedBases());
+
+        Map<String, VariantContext> comps = getCompVariantContexts(tracker, context);
+
+        // to enable walking over pairs where eval or comps have no elements
+        for ( EvaluationContext group : contexts.values() ) {
+            VariantContext vc = getVariantContext(group.trackName, tracker, context);
+
+            //logger.debug(String.format("Updating %s of %s with variant", group.name, vc));
+            for ( Map.Entry<String, Set<VariantEvaluator>> namedEvaluations : group.entrySet() ) {
+                String evaluationName = namedEvaluations.getKey();
+                Set<VariantEvaluator> evaluations = namedEvaluations.getValue();
+                boolean evalWantsVC = applyVCtoEvaluation(evaluationName, vc, comps, group);
+
+                for ( VariantEvaluator evaluation : evaluations ) {
+                    // we always call update0 in case the evaluation tracks things like number of bases covered
+                    evaluation.update0(tracker, ref, context);
+
+                    // now call the single or paired update function
+                    switch ( evaluation.getComparisonOrder() ) {
+                        case 1:
+                            if ( evalWantsVC && vc != null )
+                                evaluation.update1(vc, tracker, ref, context);
+                            break;
+                        case 2:
+                            for ( VariantContext comp : comps.values() ) {
+                                evaluation.update2( evalWantsVC ? vc : null, comp, tracker, ref, context);
+                            }
+                            break;
+                        default:
+                            throw new StingException("BUG: Unexpected evaluation order " + evaluation);
+                    }
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    private boolean applyVCtoEvaluation(String evaluationName, VariantContext vc, Map<String, VariantContext> comps, EvaluationContext group) {
+        if ( vc == null )
+            return true;
+        
+        if ( evaluationName.contains(FILTERED_SET_NAME) && vc.isNotFiltered() )
+            return false;
+
+        if ( evaluationName.contains(RETAINED_SET_NAME) && vc.isFiltered() )
+            return false;
+
+        boolean vcKnown = vcIsKnown(vc, comps, KNOWN_NAMES);
+        if ( evaluationName.contains(KNOWN_SET_NAME) && ! vcKnown )
+            return false;
+        else if ( evaluationName.contains(NOVEL_SET_NAME) && vcKnown )
+            return false;
+
+        if ( group.selectExp != null && ! VariantContextUtils.match(vc, group.selectExp) )
+            return false;
+
+        // nothing invalidated our membership in this set
+        return true;
+    }
+
+    private Map<String, VariantContext> getCompVariantContexts(RefMetaDataTracker tracker, AlignmentContext context) {
+        Map<String, VariantContext> comps = new HashMap<String, VariantContext>();
+
+        for ( String compName : compNames ) {
+            comps.put(compName, getVariantContext(compName, tracker, context));
+        }
+
+        // todo -- remove me when the loop works correctly for comparisons of eval x comp for each comp
+        if ( comps.size() > 1 ) throw new StingException("VariantEval2 currently only supports comparisons of N eval tracks vs. a single comparison track.  Yes, I know...");
+        return comps;
+    }
+
+    private boolean vcIsKnown(VariantContext vc, Map<String, VariantContext> comps, String[] knownNames ) {
+        for ( String knownName : knownNames ) {
+            VariantContext known = comps.get(knownName);
+            if ( known != null && known.isNotFiltered() && known.getType() == vc.getType() )
+                return true;
+        }
+
+        return false;
+    }
+
+// can't handle this situation
+// todo -- warning, this leads to some missing SNPs at complex loci, such as:
+// todo -- 591     1       841619  841620  rs4970464       0       -       A       A       -/C/T   genomic mixed   unknown 0       0       near-gene-3     exact   1
+// todo -- 591     1       841619  841620  rs62677860      0       +       A       A       C/T     genomic single  unknown 0       0       near-gene-3     exact   1
+//
+//logger.info(String.format("Ignore second+ events at locus %s in rod %s => rec is %s", context.getLocation(), rodList.getName(), rec));
+
+    private VariantContext getVariantContext(String name, RefMetaDataTracker tracker, AlignmentContext context) {
+        if ( tracker != null ) {
+            RODRecordList<ReferenceOrderedDatum> rodList = tracker.getTrackData(name, null);
+            if ( rodList != null ) {
+                for ( ReferenceOrderedDatum rec : rodList.getRecords() ) {
+                    if ( rec.getLocation().getStart() == context.getLocation().getStart() ) {
+                        VariantContext vc = VariantContextAdaptors.convertToVariantContext(name, rec);
+                        if ( vc != null ) {
+                            return vc;
+                        }
+                    }
+                }
+            }
+        }
+
+        return null;
+    }
+
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // reduce
+    //
+    // --------------------------------------------------------------------------------------------------------------
+    public Integer reduceInit() {
+        return 0;
+    }
+
+    public Integer reduce(Integer point, Integer sum) {
+        return point + sum;
+    }
+
+    public VariantEvaluator getEvalByName(String name, Set<VariantEvaluator> s) {
+        for ( VariantEvaluator e : s )
+            if ( e.getName().equals(name) )
+                return e;
+        return null;
+    }
+
+    private <T extends Comparable<T>> List<T> sorted(Collection<T> c ) {
+        List<T> l = new ArrayList<T>(c);
+        Collections.sort(l);
+        return l;
+    }
+
+    private final static String CONTEXT_HEADER = "track.subset.novelty.filter";
+    private final static int N_CONTEXT_NAME_PARTS = CONTEXT_HEADER.split("\\.").length;
+    private static int[] nameSizes = new int[N_CONTEXT_NAME_PARTS];
+    static {
+        int i = 0;
+        for ( String elt : CONTEXT_HEADER.split("\\.") )
+            nameSizes[i++] = elt.length();
+    }
+
+    private void determineContextNamePartSizes() {
+        for ( String contextName : sorted(contexts.keySet()) ) {
+            EvaluationContext group = contexts.get(contextName);
+            for ( String evalSubgroupName : sorted(group.keySet()) ) {
+                String keyWord = contextName + "." + evalSubgroupName;
+                String[] parts = keyWord.split("\\.");
+                if ( parts.length != N_CONTEXT_NAME_PARTS ) {
+                    throw new StingException("Unexpected number of eval name parts " + keyWord + " length = " + parts.length + ", expected " + N_CONTEXT_NAME_PARTS);
+                } else {
+                    for ( int i = 0; i < parts.length; i++ )
+                        nameSizes[i] = Math.max(nameSizes[i], parts[i].length());
+                }
+            }
+        }
+    }
+    
+    private String formatKeyword(String keyWord) {
+        //System.out.printf("keyword %s%n", keyWord);
+
+        StringBuilder s = new StringBuilder();
+        int i = 0;
+        for ( String part : keyWord.split("\\.") ) {
+            //System.out.printf("part %s %d%n", part, nameSizes[i]);
+            s.append(String.format("%"+nameSizes[i]+"s ", part));
+            i++;
+        }
+
+        return s.toString();
+    }
+
+    public void onTraversalDone(Integer result) {
+        // todo -- this really needs to be pretty printed; use some kind of table organization
+        // todo -- so that we can load up all of the data in one place, analyze the widths of the columns
+        // todo -- and pretty print it
+        for ( String evalName : variantEvaluationNames ) {
+            boolean first = true;
+            out.printf("%n%n");
+            // todo -- show that comp is dbsnp, etc. is columns
+            for ( String contextName : sorted(contexts.keySet()) ) {
+                EvaluationContext group = contexts.get(contextName);
+
+                out.printf("%s%n", Utils.dupString('-', 80));
+                for ( String evalSubgroupName : sorted(group.keySet()) ) {
+                    Set<VariantEvaluator> evalSet = group.get(evalSubgroupName);
+                    VariantEvaluator eval = getEvalByName(evalName, evalSet);
+                    String keyWord = contextName + "." + evalSubgroupName;
+
+                    if ( first ) {
+                        out.printf("%20s %s %s%n", evalName, formatKeyword(CONTEXT_HEADER), Utils.join("\t", eval.getTableHeader()));
+                        first = false;
+                    }
+
+                    for ( List<String> row : eval.getTableRows() )
+                        out.printf("%20s %s %s%n", evalName, formatKeyword(keyWord), Utils.join("\t", row));
+                }
+            }
+        }
+    }
+
+    protected Logger getLogger() { return logger; }
+}
--- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEvaluator.java
+++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEvaluator.java
@ -0,0 +1,72 @@
+package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2;
+
+import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: depristo
+ * Date: Jan 29, 2010
+ * Time: 3:38:02 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public abstract class VariantEvaluator {
+    protected boolean accumulateInterestingSites = false, printInterestingSites = false;
+    protected String interestingSitePrefix = null;
+    protected boolean processedASite = false;
+    protected List<VariantContext> interestingSites = new ArrayList<VariantContext>();
+
+    public abstract String getName();
+
+    // do we want to keep track of things that are interesting
+    public void accumulateInterestingSites(boolean enable)              { accumulateInterestingSites = enable; }
+    public void printInterestingSites(String prefix)                    { printInterestingSites = true; interestingSitePrefix = prefix; }
+    public boolean isAccumulatingInterestingSites()                     { return accumulateInterestingSites; }
+    public List<VariantContext> getInterestingSites()                   { return interestingSites; }
+    protected void addInterestingSite(String why, VariantContext vc) {
+        if ( accumulateInterestingSites )
+            interestingSites.add(vc);
+        if ( printInterestingSites )
+            System.out.printf("%40s %s%n", interestingSitePrefix, why);
+    }
+
+    public boolean processedAnySites()                                  { return processedASite; }
+    protected void markSiteAsProcessed()                                { processedASite = true; }
+
+    /** Should return the number of VariantContexts expected as inputs to update.  Can be 1 or 2 */
+    public abstract int getComparisonOrder();
+
+    /** called at all sites, regardless of eval context itself; useful for counting processed bases */
+    public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { }
+    public void update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { }
+    public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { }
+
+    public void finalize() {}
+
+    public abstract String toString();
+
+    // making it a table
+    public abstract List<String> getTableHeader();
+    public abstract List<List<String>> getTableRows();
+
+    //
+    // useful common utility routines
+    //
+    protected double rate(long n, long d) {
+        return n / (1.0 * Math.max(d, 1));
+    }
+
+    protected long inverseRate(long n, long d) {
+        return n == 0 ? 0 : d / Math.max(n, 1);
+    }
+
+    protected double ratio(long num, long denom) {
+        return ((double)num) / (Math.max(denom, 1));
+    }
+
+}