From bcf80cc7b3bd527a3f0eab6f863e4c75fcec72dd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 22 Mar 2012 21:14:44 -0400 Subject: [PATCH] Cleanup in VariantEval. Example of molten VariantEval output -- Moved a variety of useful formatting routines for ratios, percentages, etc., into VariantEvaluator.java so everyone can share. Code updated to use these routines where appropriate -- Added variantWasSingleton() to VariantEvaluator, which can be used to determine if a site, even after subsetting to specific samples, was a singleton in the original full VCF -- TableType, which used to be an interface, is now an abstract class, allowing us to implement some general functionality and avoid duplication. -- This included creating a getRowName() function that used to be hardcoded as "row" but now can be overridden. -- #### This allows us to implement molten tables, which are vastly easier to use than multi-row data sets. See IndelHistogram class (in later commit) for example of molten VE output --- .../varianteval/VariantEvalWalker.java | 31 ++++---------- .../varianteval/evaluators/CountVariants.java | 12 +++--- .../evaluators/GenotypeConcordance.java | 8 ++-- .../evaluators/GenotypePhasingEvaluator.java | 2 +- .../evaluators/MultiallelicSummary.java | 17 ++------ .../evaluators/VariantEvaluator.java | 42 +++++++++++++++++++ .../evaluators/VariantQualityScore.java | 12 +----- .../evaluators/VariantSummary.java | 4 +- .../walkers/varianteval/util/TableType.java | 12 +++--- .../varianteval/util/VariantEvalUtils.java | 2 +- 10 files changed, 75 insertions(+), 67 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index d18c7e10a..3a67fd5d7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -93,6 +93,7 @@ import java.util.*; */ @Reference(window=@Window(start=-50, stop=50)) public class VariantEvalWalker extends RodWalker implements TreeReducible { + public static final String IS_SINGLETON_KEY = "ISSINGLETON"; @Output protected PrintStream out; @@ -494,7 +495,7 @@ public class VariantEvalWalker extends RodWalker implements Tr if (field.get(ve) instanceof TableType) { TableType t = (TableType) field.get(ve); - String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); + final String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); final DataPoint dataPointAnn = datamap.get(field); GATKReportTable table; @@ -509,17 +510,10 @@ public class VariantEvalWalker extends RodWalker implements Tr table.addColumn(vs.getName(), "unknown"); } - table.addColumn("row", "unknown"); - - for ( Object o : t.getColumnKeys() ) { - String c; - - if (o instanceof String) { - c = (String) o; - } else { - c = o.toString(); - } + table.addColumn(t.getRowName(), "unknown"); + for ( final Object o : t.getColumnKeys() ) { + final String c = o.toString(); table.addColumn(c, 0.0); } } else { @@ -527,7 +521,7 @@ public class VariantEvalWalker extends RodWalker implements Tr } for (int row = 0; row < t.getRowKeys().length; row++) { - String r = (String) t.getRowKeys()[row]; + final String r = t.getRowKeys()[row].toString(); for ( VariantStratifier vs : stratificationObjects ) { final String columnName = vs.getName(); @@ -535,17 +529,10 @@ public class VariantEvalWalker extends RodWalker implements Tr } for (int col = 0; col < t.getColumnKeys().length; col++) { - String c; - if (t.getColumnKeys()[col] instanceof String) { - c = (String) t.getColumnKeys()[col]; - } else { - c = t.getColumnKeys()[col].toString(); - } - - String newStateKey = stateKey.toString() + r; + final String c = t.getColumnKeys()[col].toString(); + final String newStateKey = 
stateKey.toString() + r; table.set(newStateKey, c, t.getCell(row, col)); - - table.set(newStateKey, "row", r); + table.set(newStateKey, t.getRowName(), r); } } } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 9a97b005c..6fc4208ee 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -30,7 +30,6 @@ public class CountVariants extends VariantEvaluator implements StandardEval { @DataPoint(description = "Number of variants per base", format = "%.8f") public double variantRatePerBp = 0; - @DataPoint(description = "Number of snp loci", format = "%d") public long nSNPs = 0; @DataPoint(description = "Number of mnp loci", format = "%d") @@ -47,7 +46,6 @@ public class CountVariants extends VariantEvaluator implements StandardEval { @DataPoint(description = "Number of mixed loci (loci that can't be classified as a SNP, Indel or MNP)", format = "%d") public long nMixed = 0; - @DataPoint(description = "Number of no calls loci", format = "%d") public long nNoCalls = 0; @DataPoint(description = "Number of het loci", format = "%d") @@ -72,8 +70,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public double indelRate = 0; @DataPoint(description = "indel rate per base pair", format = "%.2f") public double indelRatePerBp = 0; - @DataPoint(description = "deletion to insertion ratio", format = "%.2f") - public double deletionInsertionRatio = 0; + @DataPoint(description = "insertion to deletion ratio", format = "%.2f") + public double insertionDeletionRatio = 0; private double perLocusRate(long n) { return rate(n, nProcessedLoci); @@ -113,12 +111,12 @@ public class CountVariants extends VariantEvaluator implements StandardEval 
{ case SNP: nVariantLoci++; nSNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; + if (variantWasSingleton(vc1)) nSingletons++; break; case MNP: nVariantLoci++; nMNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; + if (variantWasSingleton(vc1)) nSingletons++; break; case INDEL: nVariantLoci++; @@ -201,6 +199,6 @@ public class CountVariants extends VariantEvaluator implements StandardEval { hetHomRatio = ratio(nHets, nHomVar); indelRate = perLocusRate(nDeletions + nInsertions + nComplex); indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions + nComplex); - deletionInsertionRatio = ratio(nDeletions, nInsertions); + insertionDeletionRatio = ratio(nInsertions, nDeletions); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java index 4f5aeed61..75aacf5ba 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java @@ -59,7 +59,7 @@ public class GenotypeConcordance extends VariantEvaluator { private boolean discordantInteresting = false; - static class FrequencyStats implements TableType { + static class FrequencyStats extends TableType { class Stats { public Stats(int found, int missed) { nFound = found; nMissed = missed; } public long nFound = 0; @@ -103,7 +103,7 @@ public class GenotypeConcordance extends VariantEvaluator { } } - static class QualityScoreHistograms implements TableType { + static class QualityScoreHistograms extends TableType { final static int NUM_BINS = 20; final HashMap truePositiveQualityScoreMap = new HashMap(); // A HashMap holds all the quality scores until we are able to bin them appropriately final HashMap 
falsePositiveQualityScoreMap = new HashMap(); @@ -362,7 +362,7 @@ public class GenotypeConcordance extends VariantEvaluator { /** * a table of sample names to genotype concordance figures */ -class SampleStats implements TableType { +class SampleStats extends TableType { private final int nGenotypeTypes; // sample to concordance stats object @@ -448,7 +448,7 @@ class SampleStats implements TableType { /** * a table of sample names to genotype concordance summary statistics */ -class SampleSummaryStats implements TableType { +class SampleSummaryStats extends TableType { protected final static String ALL_SAMPLES_KEY = "allSamples"; protected final static String[] COLUMN_KEYS = new String[]{ "percent_comp_ref_called_ref", diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java index f4369401b..2f9671d90 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java @@ -376,7 +376,7 @@ class PhaseStats { /** * a table of sample names to genotype phasing statistics */ -class SamplePhasingStatistics implements TableType { +class SamplePhasingStatistics extends TableType { private HashMap sampleStats = null; private double minPhaseQuality; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java index 5cea0322f..1c34be4a1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MultiallelicSummary.java @@ -87,13 +87,8 @@ public class 
MultiallelicSummary extends VariantEvaluator implements StandardEva public String indelNoveltyRate = "NA"; - public void initialize(VariantEvalWalker walker) {} - @Override public boolean enabled() { return true; } - - public int getComparisonOrder() { - return 2; - } + @Override public int getComparisonOrder() { return 2; } public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); @@ -156,12 +151,6 @@ public class MultiallelicSummary extends VariantEvaluator implements StandardEva // TODO -- implement me } - private final String noveltyRate(final int all, final int known) { - final int novel = all - known; - final double rate = (novel / (1.0 * all)); - return all == 0 ? "NA" : String.format("%.2f", rate); - } - public void finalizeEvaluation() { processedMultiSnpRatio = (double)nMultiSNPs / (double)nProcessedLoci; variantMultiSnpRatio = (double)nMultiSNPs / (double)nSNPs; @@ -170,7 +159,7 @@ public class MultiallelicSummary extends VariantEvaluator implements StandardEva TiTvRatio = (double)nTi / (double)nTv; - SNPNoveltyRate = noveltyRate(nMultiSNPs, knownSNPsPartial + knownSNPsComplete); - indelNoveltyRate = noveltyRate(nMultiSNPs, knownIndelsPartial + knownIndelsComplete); + SNPNoveltyRate = formattedNoveltyRate(knownSNPsPartial + knownSNPsComplete, nMultiSNPs); + indelNoveltyRate = formattedNoveltyRate(knownIndelsPartial + knownIndelsComplete, nMultiSNPs); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java index 83a1c2f3b..7e5cf37ff 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantEvaluator.java @@ -49,4 +49,46 @@ public abstract class 
VariantEvaluator { return true; } + /** + * Returns true if the variant in vc was a singleton in the original input evaluation + * set, regardless of variant context subsetting that has occurred. + * @param eval + * @return true if eval was originally a singleton site + */ + protected static final boolean variantWasSingleton(final VariantContext eval) { + return eval.getAttributeAsBoolean(VariantEvalWalker.IS_SINGLETON_KEY, false); + } + + /** + * Convenience function that formats the novelty rate as a %.2f string + * + * @param known number of variants from all that are known + * @param all number of all variants + * @return a String novelty rate, or NA if all == 0 + */ + protected static final String formattedNoveltyRate(final int known, final int all) { + return formattedPercent(all - known, all); + } + + /** + * Convenience function that formats the novelty rate as a %.2f string + * + * @param x number of objects part of total that meet some criteria + * @param total count of all objects, including x + * @return a String percent rate, or NA if total == 0 + */ + protected static final String formattedPercent(final int x, final int total) { + return total == 0 ? "NA" : String.format("%.2f", x / (1.0*total)); + } + + /** + * Convenience function that formats a ratio as a %.2f string + * + * @param num number of observations in the numerator + * @param denom number of observations in the denumerator + * @return a String formatted ratio, or NA if all == 0 + */ + protected static final String formattedRatio(final int num, final int denom) { + return denom == 0 ? 
"NA" : String.format("%.2f", num / (1.0 * denom)); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java index ce9e45c9b..8417faf5f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java @@ -54,7 +54,7 @@ public class VariantQualityScore extends VariantEvaluator { @DataPoint(description = "average variant quality for each allele count") AlleleCountStats alleleCountStats = null; - static class TiTvStats implements TableType { + static class TiTvStats extends TableType { final static int NUM_BINS = 20; final HashMap> qualByIsTransition = new HashMap>(); // A hashMap holds all the qualities until we are able to bin them appropriately final long transitionByQuality[] = new long[NUM_BINS]; @@ -73,10 +73,6 @@ public class VariantQualityScore extends VariantEvaluator { return columnKeys; } - public String getName() { - return "TiTvStats"; - } - public String getCell(int x, int y) { return String.valueOf(titvByQuality[y]); } @@ -143,7 +139,7 @@ public class VariantQualityScore extends VariantEvaluator { } } - class AlleleCountStats implements TableType { + class AlleleCountStats extends TableType { final HashMap> qualityListMap = new HashMap>(); final HashMap qualityMap = new HashMap(); @@ -163,10 +159,6 @@ public class VariantQualityScore extends VariantEvaluator { return new String[]{"alleleCount","avgQual"}; } - public String getName() { - return "AlleleCountStats"; - } - public String getCell(int x, int y) { int iii = 0; for( final Integer key : qualityListMap.keySet() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java index aa3eff756..31f9a4f78 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java @@ -255,9 +255,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { private final String noveltyRate(Type type) { final int all = allVariantCounts.all(type); final int known = knownVariantCounts.all(type); - final int novel = all - known; - final double rate = (novel / (1.0 * all)); - return all == 0 ? "NA" : String.format("%.2f", rate); + return formattedNoveltyRate(known, all); } public void finalizeEvaluation() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java index 7ffc3e2c8..6ab7d1af3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/TableType.java @@ -9,9 +9,11 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util; * * an interface for turning arbritary objects into tables */ -public interface TableType { - public Object[] getRowKeys(); - public Object[] getColumnKeys(); - public Object getCell(int x, int y); - public String getName(); +public abstract class TableType { + public abstract Object[] getRowKeys(); + public abstract Object[] getColumnKeys(); + public abstract Object getCell(int x, int y); + public String getName() { return getClass().getSimpleName(); } + public String getRowName() { return "row"; } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 
44af9f574..f9e740576 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -310,7 +310,7 @@ public class VariantEvalUtils { final int newAlleleCount = vcsub.getHetCount() + 2 * vcsub.getHomVarCount(); if (originalAlleleCount == newAlleleCount && newAlleleCount == 1) { - builder.attribute("ISSINGLETON", true); + builder.attribute(VariantEvalWalker.IS_SINGLETON_KEY, true); } VariantContextUtils.calculateChromosomeCounts(builder, true);