diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/CountVariants.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/CountVariants.java new file mode 100755 index 000000000..87c636a8b --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/CountVariants.java @@ -0,0 +1,108 @@ +package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2; + +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext; +import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype; +import org.broadinstitute.sting.utils.StingException; + +import java.util.List; +import java.util.Arrays; + +public class CountVariants extends VariantEvaluator { + long nProcessedLoci = 0; + long nCalledLoci = 0; + long nVariantLoci = 0; + long nRefLoci = 0; + + long nSNPs = 0; + long nInsertions = 0; + long nDeletions = 0; + long nComplex = 0; + + long nNoCalls = 0; + long nHets = 0; + long nHomRef = 0; + long nHomVar = 0; + + public CountVariants(VariantEval2Walker parent) { + // don't do anything + } + + private double perLocusRate(long n) { return rate(n, nProcessedLoci); } + private long perLocusRInverseRate(long n) { return inverseRate(n, nProcessedLoci); } + + public String getName() { + return "counter"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nProcessedLoci += context.getSkippedBases() + 1; + } + + public void update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + //nProcessedLoci++; + nCalledLoci++; + + if ( vc1.isVariant() ) nVariantLoci++; + switch ( vc1.getType() ) { + case NO_VARIATION: nRefLoci++; break; + case SNP: nSNPs++; break; + case INDEL: + if ( vc1.isInsertion() ) nInsertions++; else nDeletions++; + break; + case MIXED: nComplex++; break; + } + + for ( Genotype g : vc1.getGenotypes().values() ) { + switch ( g.getType() ) { + case NO_CALL: nNoCalls++; break; + case HOM_REF: nHomRef++; break; + case HET: nHets++; break; + case HOM_VAR: nHomVar++; break; + default: throw new StingException("BUG: Unexpected genotype type: " + g); + } + } + } + + public String toString() { + return getName() + ": " + summaryLine(); + } + + private String summaryLine() { + return String.format("%d %d %d %d " + + "%.2e %d " + + "%d %d %d %d " + + "%d %d %d " + + "%.2e %d %.2f " + + "%.2f %d %.2f", + nProcessedLoci, nCalledLoci, nRefLoci, nVariantLoci, + perLocusRate(nVariantLoci), perLocusRInverseRate(nVariantLoci), + nSNPs, nDeletions, nInsertions, nComplex, + nHomRef, nHets, nHomVar, + perLocusRate(nHets), perLocusRInverseRate(nHets), ratio(nHets, nHomVar), + perLocusRate(nDeletions + nInsertions), perLocusRInverseRate(nDeletions + nInsertions), ratio(nDeletions, nInsertions)); + } + + private static List HEADER = + Arrays.asList("nProcessedLoci", "nCalledLoci", "nRefLoci", "nVariantLoci", + "variantRate", "variantRatePerBp", + "nSNPs", "nDeletions", "nInsertions", "nComplex", + "nHomRefGenotypes", "nHetGenotypes", "nHomVarGenotypes", + "heterozygosity", "heterozygosityPerBp", "hetHomRatio", + "indelRate", "indelRatePerBp", "deletionInsertionRatio"); + + // making it a table + public List getTableHeader() { + return HEADER; + } + + public List> getTableRows() { + return Arrays.asList(Arrays.asList(summaryLine().split(" "))); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/DbSNPPercentage.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/DbSNPPercentage.java new file mode 100755 index 000000000..319a58a6f --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/DbSNPPercentage.java @@ -0,0 +1,110 @@ +package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.genotype.Variation; +import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext; +import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype; +import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele; + +import java.util.ArrayList; +import java.util.List; +import java.util.Arrays; + +/** + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + *

+ * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + */ +public class DbSNPPercentage extends VariantEvaluator { + private long nDBSNPs = 0; + private long nEvalSNPs = 0; + private long nSNPsAtdbSNPs = 0; + private long nConcordant = 0; + + public DbSNPPercentage(VariantEval2Walker parent) { + // don't do anything + } + + public String getName() { + return "dbsnp_percentage"; + } + + public int getComparisonOrder() { + return 2; // we need to see each eval track and each comp track + } + + public long nDBSNPs() { return nDBSNPs; } + public long nEvalSNPs() { return nEvalSNPs; } + public long nSNPsAtdbSNPs() { return nSNPsAtdbSNPs; } + public long nConcordant() { return nConcordant; } + public long nNovelSites() { return Math.abs(nEvalSNPs() - nSNPsAtdbSNPs()); } + + public String toString() { + return getName() + ": " + summaryLine(); + } + + private String summaryLine() { + return String.format("%d %d %d %d %d %.2f %.2f", + nDBSNPs(), nEvalSNPs(), nSNPsAtdbSNPs(), nConcordant(), nNovelSites(), 100 * dbSNPRate(), 100 * concordanceRate()); + } + + private static List HEADER = + Arrays.asList("n_dbsnps", "n_eval_snps", "n_overlapping_snps", "n_concordant", "n_novel_snps", "dbsnp_rate", "concordance_rate"); + + // making it a table + public List getTableHeader() { + return HEADER; + } + + public List> getTableRows() { + return Arrays.asList(Arrays.asList(summaryLine().split(" "))); + } + + /** + * Returns true if every allele in eval is also in dbsnp + * + * @param eval + * @param dbsnp + * @return + */ + public boolean discordantP(VariantContext eval, VariantContext dbsnp ) { + for ( Allele a : eval.getAlleles() ) { + if ( ! dbsnp.hasAllele(a, true) ) + return true; + } + + return false; + } + + + /** + * What fraction of the evaluated site variants were also found in the db? + * + * @return + */ + public double dbSNPRate() { return rate(nSNPsAtdbSNPs(), nEvalSNPs()); } + public double concordanceRate() { return rate(nConcordant(), nSNPsAtdbSNPs()); } + + public void update2(VariantContext eval, VariantContext dbsnp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + boolean dbSNPIsGood = dbsnp != null && dbsnp.isSNP() && dbsnp.isNotFiltered(); + boolean evalIsGood = eval != null && eval.isSNP(); + + if ( dbSNPIsGood ) nDBSNPs++; // count the number of dbSNP events + if ( evalIsGood ) nEvalSNPs++; // count the number of dbSNP events + + if ( dbSNPIsGood && evalIsGood ) { + nSNPsAtdbSNPs++; + + if ( ! discordantP(eval, dbsnp) ) // count whether we're concordant or not with the dbSNP value + nConcordant++; + } + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java new file mode 100755 index 000000000..274a447cd --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/MendelianViolationEvaluator.java @@ -0,0 +1,191 @@ +package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2; + +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext; +import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype; +import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele; +import org.broadinstitute.sting.utils.StingException; + +import java.util.List; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +/** + * Mendelian violation detection and counting + * + * a violation looks like: + * Suppose dad = A/B and mom = C/D + * The child can be [A or B] / [C or D]. + * If the child doesn't match this, the site is a violation + * + * Some examples: + * + * mom = A/A, dad = C/C + * child can be A/C only + * + * mom = A/C, dad = C/C + * child can be A/C or C/C + * + * mom = A/C, dad = A/C + * child can be A/A, A/C, C/C + * + * The easiest way to do this calculation is to: + * + * Get alleles for mom => A/B + * Get alleles for dad => C/D + * Make allowed genotypes for child: A/C, A/D, B/C, B/D + * Check that the child is one of these. + */ +public class MendelianViolationEvaluator extends VariantEvaluator { + long nVariants, nViolations, nOverCalls, nUnderCalls; + String mom, dad, child; + VariantEval2Walker parent; + + private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); + + public MendelianViolationEvaluator(VariantEval2Walker parent) { + this.parent = parent; + + if ( enabled() ) { + Matcher m = FAMILY_PATTERN.matcher(parent.FAMILY_STRUCTURE); + if ( m.matches() ) { + //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE); + mom = m.group(1); + dad = m.group(2); + child = m.group(3); + parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s", parent.FAMILY_STRUCTURE, mom, dad, child)); + + } else { + throw new IllegalArgumentException("Malformatted family structure string: " + parent.FAMILY_STRUCTURE + " required format is mom+dad=child"); + } + } + } + + private boolean enabled() { + return parent.FAMILY_STRUCTURE != null; + } + + private double getQThreshold() { + return parent.MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10; // we aren't 10x scaled in the GATK a la phred + } + + public String getName() { + return "mendelian_violations"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public void update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( vc.isBiallelic() && vc.hasGenotypes() ) { // todo -- currently limited to biallelic loci + nVariants++; + + Genotype momG = vc.getGenotype(mom); + Genotype dadG = vc.getGenotype(dad); + Genotype childG = vc.getGenotype(child); + + if ( momG.getNegLog10PError() > getQThreshold() && dadG.getNegLog10PError() > getQThreshold() && childG.getNegLog10PError() > getQThreshold() ) { + // all genotypes are good, so let's see if child is a violation + + if ( isViolation(vc, momG, dadG, childG) ) { + nViolations++; + + String label = null; + switch ( getViolationType( vc, momG, dadG, childG ) ) { + case UNDER_CALL: + nUnderCalls++; + label = "under_called"; + break; + case OVER_CALL: + nOverCalls++; + label = "over_called"; + break; + default: + throw new StingException("BUG: unexpected violation type at " + vc); + + } + + String why = String.format("Mendelian violation %s: at %s m=%s d=%s c=%s", label, vc.getLocation(), momG.toBriefString(), dadG.toBriefString(), childG.toBriefString()); + addInterestingSite(why , vc); + } + } + } + } + + /** + * Are we under or over calling? + * + * Assuming this is a bialleic locus, we then have 2 alleles A and B. There are really two types of violations: + * + * Undercall: where the child is A/A but parent genotypes imply that child must carry at least one B allele + * Overall: where the child carries a B allele but this B allele couldn't have been inherited from either parent + * + * The way to determine this is to look at mom and dad separately. If the child doesn't carry at least one + * allele from each parent, it's an under calls. Otherwise it's an overcall. + */ + public ViolationType getViolationType(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) { + switch ( childG.getType() ) { + case HOM_REF: + return ViolationType.UNDER_CALL; // if you have to undercalled as a hom ref child + case HET: + // the only two violations of a het is where both parents are hom. If they are hom ref, you overcalled, + // otherwise you undercalled + return momG.isHomRef() ? ViolationType.OVER_CALL : ViolationType.UNDER_CALL; + case HOM_VAR: + return ViolationType.OVER_CALL; // if you have to overcalled as a hom var child + default: + throw new StingException("BUG: unexpected child genotype class " + childG); + } + } + + private enum ViolationType { + UNDER_CALL, OVER_CALL + } + + public boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG ) { + VariantContext momVC = vc.subContextFromGenotypes(momG); + VariantContext dadVC = vc.subContextFromGenotypes(dadG); + int i = 0; + for ( Allele momAllele : momVC.getAlleles() ) { + for ( Allele dadAllele : dadVC.getAlleles() ) { + Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele)); + if ( childG.sameGenotype(possibleChild, false) ) { + return false; + } + } + } + + return true; + } + + public String toString() { + return getName() + ": " + summaryLine(); + } + + private String summaryLine() { + return String.format("%d %d %.2e %d %.2e %.2f %d %.2e %.2f", + nVariants, nViolations, rate(nViolations, nVariants), + nOverCalls, rate(nOverCalls, nVariants), ratio(nOverCalls, nViolations), + nUnderCalls, rate(nUnderCalls, nVariants), ratio(nUnderCalls, nViolations)); + } + + private static List HEADER = + Arrays.asList("nVariants", + "nViolations", "violationRate", + "nOverCalls", "overCallRate", "overCallFraction", + "nUnderCalls", "underCallRate", "underCallFraction"); + + // making it a table + public List getTableHeader() { + return HEADER; + } + + public List> getTableRows() { + return Arrays.asList(Arrays.asList(summaryLine().split(" "))); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/TiTvVariantEvaluator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/TiTvVariantEvaluator.java new file mode 100755 index 000000000..1fab3ccc0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/TiTvVariantEvaluator.java @@ -0,0 +1,65 @@ +package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2; + +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext; +import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype; +import org.broadinstitute.sting.utils.StingException; + +import java.util.List; +import java.util.Arrays; + +public class TiTvVariantEvaluator extends VariantEvaluator { + long nTi = 0, nTv = 0; + long nTiInStd = 0, nTvInStd = 0; + + public TiTvVariantEvaluator(VariantEval2Walker parent) { + // don't do anything + } + + public String getName() { + return "titv"; + } + + public int getComparisonOrder() { + return 2; // we only need to see each eval track + } + + public void updateTiTv(VariantContext vc, boolean updateStandard) { + if ( vc != null && vc.isSNP() && vc.isBiallelic() ) { + if ( vc.isTransition() ) { + if ( updateStandard ) nTiInStd++; else nTi++; + } else { + if ( updateStandard ) nTvInStd++; else nTv++; + } + } + } + + public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( vc1 != null ) updateTiTv(vc1, false); + if ( vc2 != null ) updateTiTv(vc2, true); + } + + public String toString() { + return getName() + ": " + summaryLine(); + } + + private String summaryLine() { + return String.format("%d %d %.2f %d %d %.2f", + nTi, nTv, ratio(nTi, nTv), + nTiInStd, nTvInStd, ratio(nTiInStd, nTvInStd)); + } + + private static List HEADER = + Arrays.asList("nTi", "nTv", "TiTvRatio", "nTiStandard", "nTvStandard", "TiTvRatioStandard"); + + // making it a table + public List getTableHeader() { + return HEADER; + } + + public List> getTableRows() { + return Arrays.asList(Arrays.asList(summaryLine().split(" "))); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java new file mode 100755 index 000000000..561086f15 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEval2Walker.java @@ -0,0 +1,398 @@ +package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext; +import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContextAdaptors; +import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContextUtils; +import org.apache.log4j.Logger; + +import java.util.*; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; + + +// +// todo -- write a simple column table system and have the evaluators return this instead of the list> objects +// todo -- site frequency spectrum eval (freq. of variants in eval as a function of their AC and AN numbers) +// todo -- multiple sample concordance tool (genotypes in eval vs. genotypes in truth) +// todo -- allele freqeuncy discovery tool (FREQ in true vs. discovery counts in eval). Needs to process subset of samples in true (pools) +// todo -- clustered SNP counter +// todo -- HWEs +// todo -- Validation data analysis from VE1? What is it and should we transition it over? +// todo -- indel metrics [count of sizes in/del should be in CountVariants] +// todo -- create JEXL context implementing object that simply looks up values for JEXL evaluations. Throws error for unknown fields +// + +// +// Todo -- should really include argument parsing @annotations from subclass in this walker. Very +// todo -- useful general capability. Right now you need to add arguments to VariantEval2 to handle new +// todo -- evaluation arguments (which is better than passing a string!) +// + +// +// todo -- the whole organization only supports a single eval x comp evaluation. We need to instantiate +// todo -- new contexts for each comparison object too! +// + +/** + * Test routine for new VariantContext object + */ +public class VariantEval2Walker extends RodWalker { + // -------------------------------------------------------------------------------------------------------------- + // + // walker arguments + // + // -------------------------------------------------------------------------------------------------------------- + + // todo -- add doc string + @Argument(shortName="select", doc="", required=false) + protected String[] SELECT_EXPS = {"QUAL > 500.0", "HARD_TO_VALIDATE==1", "GATK_STANDARD==1"}; + + // todo -- add doc string + @Argument(shortName="selectName", doc="", required=false) + protected String[] SELECT_NAMES = {"q500plus", "low_mapq", "gatk_std_filters"}; + + @Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false) + protected String[] KNOWN_NAMES = {"dbsnp"}; + + // + // Arguments for Mendelian Violation calculations + // + @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) + protected String FAMILY_STRUCTURE; + + @Argument(shortName="MVQ", fullName="MendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) + protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; + + @Argument(shortName="PI", fullName="PrintInterestingSites", doc="If provided, interesting sites in the unselected, called set will be printed", required=false) + protected boolean PRINT_INTERESTING_SITES = false; + +// @Argument(shortName="PIA", fullName="PrintAllInterestingSites", doc="If provided, interesting sites will be printed for all sets. Verbose", required=false) +// protected boolean PRINT_ALL_INTERESTING_SITES = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // private walker data + // + // -------------------------------------------------------------------------------------------------------------- + + /** private class holding all of the information about a single evaluation group (e.g., for eval ROD) */ + private class EvaluationContext extends HashMap> { + // useful for typing + public String trackName, contextName; + VariantContextUtils.MatchExp selectExp; + + public EvaluationContext(String trackName, String contextName, VariantContextUtils.MatchExp selectExp) { + this.trackName = trackName; + this.contextName = contextName; + this.selectExp = selectExp; + } + } + + private HashMap contexts = new HashMap(); + private Set compNames = new HashSet(); + + private static String RAW_SET_NAME = "raw"; + private static String RETAINED_SET_NAME = "called"; + private static String FILTERED_SET_NAME = "filtered"; + private static String ALL_SET_NAME = "all"; + private static String KNOWN_SET_NAME = "known"; + private static String NOVEL_SET_NAME = "novel"; + + // Dynamically determined variantEvaluation classes + private List variantEvaluationNames = new ArrayList(); + private List> evaluationClasses = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // initialize + // + // -------------------------------------------------------------------------------------------------------------- + + public void initialize() { + determineAllEvalations(); + List selectExps = VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS); + + for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) { + if ( d.getName().startsWith("eval") ) { + for ( VariantContextUtils.MatchExp e : selectExps ) { + addNewContext(d.getName(), d.getName() + "." + e.name, e); + } + addNewContext(d.getName(), d.getName() + ".all", null); + } else if ( d.getName().startsWith("dbsnp") || d.getName().startsWith("hapmap") || d.getName().startsWith("comp") ) { + compNames.add(d.getName()); + } else { + logger.info("Not evaluating ROD binding " + d.getName()); + } + } + + determineContextNamePartSizes(); + } + + private void determineAllEvalations() { + evaluationClasses = PackageUtils.getClassesImplementingInterface(VariantEvaluator.class); + for ( VariantEvaluator e : instantiateEvalationsSet(false, null) ) { + // for collecting purposes + variantEvaluationNames.add(e.getName()); + logger.debug("Including VariantEvaluator " + e.getName() + " of class " + e.getClass()); + } + + Collections.sort(variantEvaluationNames); + } + + private Set instantiateEvalationsSet(boolean baseline, String name) { + Set evals = new HashSet(); + Object[] args = new Object[]{this}; + Class[] argTypes = new Class[]{this.getClass()}; + + for ( Class c : evaluationClasses ) { + try { + Constructor constructor = c.getConstructor(argTypes); + VariantEvaluator eval = (VariantEvaluator)constructor.newInstance(args); + if ( baseline ) eval.printInterestingSites(name); + evals.add(eval); + } catch (InstantiationException e) { + throw new StingException(String.format("Cannot instantiate class '%s': must be concrete class", c.getSimpleName())); + } catch (NoSuchMethodException e) { + throw new StingException(String.format("Cannot find expected constructor for class '%s': must have constructor accepting a single VariantEval2Walker object", c.getSimpleName())); + } catch (IllegalAccessException e) { + throw new StingException(String.format("Cannot instantiate class '%s':", c.getSimpleName())); + } catch (InvocationTargetException e) { + throw new StingException(String.format("Cannot instantiate class '%s':", c.getSimpleName())); + } + } + + return evals; + } + + private void addNewContext(String trackName, String contextName, VariantContextUtils.MatchExp selectExp) { + EvaluationContext group = new EvaluationContext(trackName, contextName, selectExp); + + for ( String filteredName : Arrays.asList(RAW_SET_NAME, RETAINED_SET_NAME, FILTERED_SET_NAME) ) { + for ( String subname : Arrays.asList(ALL_SET_NAME, KNOWN_SET_NAME, NOVEL_SET_NAME) ) { + String name = subname + "." + filteredName; + group.put(name, instantiateEvalationsSet(subname == ALL_SET_NAME && filteredName == RETAINED_SET_NAME, trackName + "." + (selectExp == null ? "all" : selectExp.name) + "." + name)); + } + } + + contexts.put(contextName, group); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // map + // + // -------------------------------------------------------------------------------------------------------------- + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + //System.out.printf("map at %s with %d skipped%n", context.getLocation(), context.getSkippedBases()); + + Map comps = getCompVariantContexts(tracker, context); + + // to enable walking over pairs where eval or comps have no elements + for ( EvaluationContext group : contexts.values() ) { + VariantContext vc = getVariantContext(group.trackName, tracker, context); + + //logger.debug(String.format("Updating %s of %s with variant", group.name, vc)); + for ( Map.Entry> namedEvaluations : group.entrySet() ) { + String evaluationName = namedEvaluations.getKey(); + Set evaluations = namedEvaluations.getValue(); + boolean evalWantsVC = applyVCtoEvaluation(evaluationName, vc, comps, group); + + for ( VariantEvaluator evaluation : evaluations ) { + // we always call update0 in case the evaluation tracks things like number of bases covered + evaluation.update0(tracker, ref, context); + + // now call the single or paired update function + switch ( evaluation.getComparisonOrder() ) { + case 1: + if ( evalWantsVC && vc != null ) + evaluation.update1(vc, tracker, ref, context); + break; + case 2: + for ( VariantContext comp : comps.values() ) { + evaluation.update2( evalWantsVC ? vc : null, comp, tracker, ref, context); + } + break; + default: + throw new StingException("BUG: Unexpected evaluation order " + evaluation); + } + } + } + } + + return 0; + } + + private boolean applyVCtoEvaluation(String evaluationName, VariantContext vc, Map comps, EvaluationContext group) { + if ( vc == null ) + return true; + + if ( evaluationName.contains(FILTERED_SET_NAME) && vc.isNotFiltered() ) + return false; + + if ( evaluationName.contains(RETAINED_SET_NAME) && vc.isFiltered() ) + return false; + + boolean vcKnown = vcIsKnown(vc, comps, KNOWN_NAMES); + if ( evaluationName.contains(KNOWN_SET_NAME) && ! vcKnown ) + return false; + else if ( evaluationName.contains(NOVEL_SET_NAME) && vcKnown ) + return false; + + if ( group.selectExp != null && ! VariantContextUtils.match(vc, group.selectExp) ) + return false; + + // nothing invalidated our membership in this set + return true; + } + + private Map getCompVariantContexts(RefMetaDataTracker tracker, AlignmentContext context) { + Map comps = new HashMap(); + + for ( String compName : compNames ) { + comps.put(compName, getVariantContext(compName, tracker, context)); + } + + // todo -- remove me when the loop works correctly for comparisons of eval x comp for each comp + if ( comps.size() > 1 ) throw new StingException("VariantEval2 currently only supports comparisons of N eval tracks vs. a single comparison track. Yes, I know..."); + return comps; + } + + private boolean vcIsKnown(VariantContext vc, Map comps, String[] knownNames ) { + for ( String knownName : knownNames ) { + VariantContext known = comps.get(knownName); + if ( known != null && known.isNotFiltered() && known.getType() == vc.getType() ) + return true; + } + + return false; + } + +// can't handle this situation +// todo -- warning, this leads to some missing SNPs at complex loci, such as: +// todo -- 591 1 841619 841620 rs4970464 0 - A A -/C/T genomic mixed unknown 0 0 near-gene-3 exact 1 +// todo -- 591 1 841619 841620 rs62677860 0 + A A C/T genomic single unknown 0 0 near-gene-3 exact 1 +// +//logger.info(String.format("Ignore second+ events at locus %s in rod %s => rec is %s", context.getLocation(), rodList.getName(), rec)); + + private VariantContext getVariantContext(String name, RefMetaDataTracker tracker, AlignmentContext context) { + if ( tracker != null ) { + RODRecordList rodList = tracker.getTrackData(name, null); + if ( rodList != null ) { + for ( ReferenceOrderedDatum rec : rodList.getRecords() ) { + if ( rec.getLocation().getStart() == context.getLocation().getStart() ) { + VariantContext vc = VariantContextAdaptors.convertToVariantContext(name, rec); + if ( vc != null ) { + return vc; + } + } + } + } + } + + return null; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // reduce + // + // -------------------------------------------------------------------------------------------------------------- + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer point, Integer sum) { + return point + sum; + } + + public VariantEvaluator getEvalByName(String name, Set s) { + for ( VariantEvaluator e : s ) + if ( e.getName().equals(name) ) + return e; + return null; + } + + private > List sorted(Collection c ) { + List l = new ArrayList(c); + Collections.sort(l); + return l; + } + + private final static String CONTEXT_HEADER = "track.subset.novelty.filter"; + private final static int N_CONTEXT_NAME_PARTS = CONTEXT_HEADER.split("\\.").length; + private static int[] nameSizes = new int[N_CONTEXT_NAME_PARTS]; + static { + int i = 0; + for ( String elt : CONTEXT_HEADER.split("\\.") ) + nameSizes[i++] = elt.length(); + } + + private void determineContextNamePartSizes() { + for ( String contextName : sorted(contexts.keySet()) ) { + EvaluationContext group = contexts.get(contextName); + for ( String evalSubgroupName : sorted(group.keySet()) ) { + String keyWord = contextName + "." + evalSubgroupName; + String[] parts = keyWord.split("\\."); + if ( parts.length != N_CONTEXT_NAME_PARTS ) { + throw new StingException("Unexpected number of eval name parts " + keyWord + " length = " + parts.length + ", expected " + N_CONTEXT_NAME_PARTS); + } else { + for ( int i = 0; i < parts.length; i++ ) + nameSizes[i] = Math.max(nameSizes[i], parts[i].length()); + } + } + } + } + + private String formatKeyword(String keyWord) { + //System.out.printf("keyword %s%n", keyWord); + + StringBuilder s = new StringBuilder(); + int i = 0; + for ( String part : keyWord.split("\\.") ) { + //System.out.printf("part %s %d%n", part, nameSizes[i]); + s.append(String.format("%"+nameSizes[i]+"s ", part)); + i++; + } + + return s.toString(); + } + + public void onTraversalDone(Integer result) { + // todo -- this really needs to be pretty printed; use some kind of table organization + // todo -- so that we can load up all of the data in one place, analyze the widths of the columns + // todo -- and pretty print it + for ( String evalName : variantEvaluationNames ) { + boolean first = true; + out.printf("%n%n"); + // todo -- show that comp is dbsnp, etc. is columns + for ( String contextName : sorted(contexts.keySet()) ) { + EvaluationContext group = contexts.get(contextName); + + out.printf("%s%n", Utils.dupString('-', 80)); + for ( String evalSubgroupName : sorted(group.keySet()) ) { + Set evalSet = group.get(evalSubgroupName); + VariantEvaluator eval = getEvalByName(evalName, evalSet); + String keyWord = contextName + "." + evalSubgroupName; + + if ( first ) { + out.printf("%20s %s %s%n", evalName, formatKeyword(CONTEXT_HEADER), Utils.join("\t", eval.getTableHeader())); + first = false; + } + + for ( List row : eval.getTableRows() ) + out.printf("%20s %s %s%n", evalName, formatKeyword(keyWord), Utils.join("\t", row)); + } + } + } + } + + protected Logger getLogger() { return logger; } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEvaluator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEvaluator.java new file mode 100755 index 000000000..522d1a538 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval2/VariantEvaluator.java @@ -0,0 +1,72 @@ +package org.broadinstitute.sting.oneoffprojects.variantcontext.varianteval2; + +import org.broadinstitute.sting.oneoffprojects.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; + +import java.util.List; +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Jan 29, 2010 + * Time: 3:38:02 PM + * To change this template use File | Settings | File Templates. + */ +public abstract class VariantEvaluator { + protected boolean accumulateInterestingSites = false, printInterestingSites = false; + protected String interestingSitePrefix = null; + protected boolean processedASite = false; + protected List interestingSites = new ArrayList(); + + public abstract String getName(); + + // do we want to keep track of things that are interesting + public void accumulateInterestingSites(boolean enable) { accumulateInterestingSites = enable; } + public void printInterestingSites(String prefix) { printInterestingSites = true; interestingSitePrefix = prefix; } + public boolean isAccumulatingInterestingSites() { return accumulateInterestingSites; } + public List getInterestingSites() { return interestingSites; } + protected void addInterestingSite(String why, VariantContext vc) { + if ( accumulateInterestingSites ) + interestingSites.add(vc); + if ( printInterestingSites ) + System.out.printf("%40s %s%n", interestingSitePrefix, why); + } + + public boolean processedAnySites() { return processedASite; } + protected void markSiteAsProcessed() { processedASite = true; } + + /** Should return the number of VariantContexts expected as inputs to update. Can be 1 or 2 */ + public abstract int getComparisonOrder(); + + /** called at all sites, regardless of eval context itself; useful for counting processed bases */ + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { } + public void update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { } + public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { } + + public void finalize() {} + + public abstract String toString(); + + // making it a table + public abstract List getTableHeader(); + public abstract List> getTableRows(); + + // + // useful common utility routines + // + protected double rate(long n, long d) { + return n / (1.0 * Math.max(d, 1)); + } + + protected long inverseRate(long n, long d) { + return n == 0 ? 0 : d / Math.max(n, 1); + } + + protected double ratio(long num, long denom) { + return ((double)num) / (Math.max(denom, 1)); + } + +}