From 41076361449a9e155b916847fb28c26fc82add5c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 23 Nov 2011 13:02:07 -0500 Subject: [PATCH] VariantEval updates -- Performance optimizations -- Tables now are cleanly formatted (floats are %.2f printed) -- VariantSummary is a standard report now -- Removed CompEvalGenotypes (it didn't do anything) -- Deleted unused classes in GenotypeConcordance -- Updates integration tests as appropriate --- .../varianteval/VariantEvalWalker.java | 19 +- .../evaluators/CompEvalGenotypes.java | 35 --- .../varianteval/evaluators/CompOverlap.java | 4 +- .../varianteval/evaluators/CountVariants.java | 12 +- .../evaluators/G1KPhaseITable.java | 159 ------------- .../evaluators/GenotypeConcordance.java | 109 --------- .../evaluators/TiTvVariantEvaluator.java | 6 +- .../evaluators/ValidationReport.java | 8 +- .../evaluators/VariantSummary.java | 223 ++++++++++++++++++ .../walkers/varianteval/util/DataPoint.java | 1 + .../varianteval/util/VariantEvalUtils.java | 19 +- .../VariantEvalIntegrationTest.java | 44 ++-- 12 files changed, 281 insertions(+), 358 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 558d9c6bf..10d4651b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -312,16 +312,17 @@ public class VariantEvalWalker extends RodWalker implements Tr String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases()); // --------- track --------- sample - VariantContexts - - HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); - HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); + HashMap, HashMap>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled, mergeEvals); + HashMap, HashMap>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false, false); // for each eval track for ( final RodBinding evalRod : evals ) { - final HashMap> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : new HashMap>(0); + final Map> emptyEvalMap = Collections.emptyMap(); + final Map> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : emptyEvalMap; // for each sample stratifier for ( final String sampleName : sampleNamesForStratification ) { - Set evalSetBySample = evalSet.get(sampleName); + Collection evalSetBySample = evalSet.get(sampleName); if ( evalSetBySample == null ) { evalSetBySample = new HashSet(1); evalSetBySample.add(null); @@ -337,8 +338,8 @@ public class VariantEvalWalker extends RodWalker implements Tr // for each comp track for ( final RodBinding compRod : comps ) { // no sample stratification for comps - final HashMap> compSetHash = compVCs.get(compRod); - final Set compSet = (compSetHash == null || compSetHash.size() == 0) ? new HashSet(0) : compVCs.get(compRod).values().iterator().next(); + final HashMap> compSetHash = compVCs.get(compRod); + final Collection compSet = (compSetHash == null || compSetHash.size() == 0) ? Collections.emptyList() : compVCs.get(compRod).values().iterator().next(); // find the comp final VariantContext comp = findMatchingComp(eval, compSet); @@ -382,7 +383,7 @@ public class VariantEvalWalker extends RodWalker implements Tr return null; } - private VariantContext findMatchingComp(final VariantContext eval, final Set comps) { + private VariantContext findMatchingComp(final VariantContext eval, final Collection comps) { // if no comps, return null if ( comps == null || comps.isEmpty() ) return null; @@ -447,11 +448,11 @@ public class VariantEvalWalker extends RodWalker implements Tr TableType t = (TableType) field.get(ve); String subTableName = ve.getClass().getSimpleName() + "." + field.getName(); - String subTableDesc = datamap.get(field).description(); + final DataPoint dataPointAnn = datamap.get(field); GATKReportTable table; if (!report.hasTable(subTableName)) { - report.addTable(subTableName, subTableDesc); + report.addTable(subTableName, dataPointAnn.description()); table = report.getTable(subTableName); table.addPrimaryKey("entry", false); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java deleted file mode 100755 index 925bff9c0..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompEvalGenotypes.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.variantcontext.Genotype; - -class NewCompEvalGenotypes { - private GenomeLoc loc; - private Genotype compGt; - private Genotype evalGt; - - public NewCompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) { - this.loc = loc; - this.compGt = compGt; - this.evalGt = evalGt; - } - - public GenomeLoc getLocus() { - return loc; - } - - public Genotype getCompGenotpye() { - return compGt; - } - public Genotype getEvalGenotype() { - return evalGt; - } - - public void setCompGenotype(Genotype compGt) { - this.compGt = compGt; - } - - public void setEvalGenotype(Genotype evalGt) { - this.evalGt = evalGt; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java index b3695921a..89d137ea9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CompOverlap.java @@ -28,13 +28,13 @@ public class CompOverlap extends VariantEvaluator implements StandardEval { @DataPoint(description = "number of eval sites at comp sites") long nVariantsAtComp = 0; - @DataPoint(description = "percentage of eval sites at comp sites") + @DataPoint(description = "percentage of eval sites at comp sites", format = "%.2f" ) double compRate = 0.0; @DataPoint(description = "number of concordant sites") long nConcordant = 0; - @DataPoint(description = "the concordance rate") + @DataPoint(description = "the concordance rate", format = "%.2f") double concordantRate = 0.0; public int getComparisonOrder() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index d8413573a..c740eb78c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -62,17 +62,17 @@ public class CountVariants extends VariantEvaluator implements StandardEval { public long nHomDerived = 0; // calculations that get set in the finalizeEvaluation method - @DataPoint(description = "heterozygosity per locus rate") + @DataPoint(description = "heterozygosity per locus rate", format = "%.2e") public double heterozygosity = 0; - @DataPoint(description = "heterozygosity per base pair") + @DataPoint(description = "heterozygosity per base pair", format = "%.2f") public double heterozygosityPerBp = 0; - @DataPoint(description = "heterozygosity to homozygosity ratio") + @DataPoint(description = "heterozygosity to homozygosity ratio", format = "%.2f") public double hetHomRatio = 0; - @DataPoint(description = "indel rate (insertion count + deletion count)") + @DataPoint(description = "indel rate (insertion count + deletion count)", format = "%.2e") public double indelRate = 0; - @DataPoint(description = "indel rate per base pair") + @DataPoint(description = "indel rate per base pair", format = "%.2f") public double indelRatePerBp = 0; - @DataPoint(description = "deletion to insertion ratio") + @DataPoint(description = "deletion to insertion ratio", format = "%.2f") public double deletionInsertionRatio = 0; private double perLocusRate(long n) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java deleted file mode 100644 index ff8f6307c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/G1KPhaseITable.java +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.EnumMap; -import java.util.HashMap; -import java.util.Map; - -@Analysis(description = "Build 1000 Genome Phase I paper summary of variants table") -public class G1KPhaseITable extends VariantEvaluator { - // basic counts on various rates found - @DataPoint(description = "Number of samples") - public long nSamples = 0; - - @DataPoint(description = "Number of processed loci") - public long nProcessedLoci = 0; - - @DataPoint(description = "Number of SNPs") - public long nSNPs = 0; - @DataPoint(description = "SNP Novelty Rate") - public String SNPNoveltyRate = "NA"; - @DataPoint(description = "Mean number of SNPs per individual") - public long nSNPsPerSample = 0; - - @DataPoint(description = "Number of Indels") - public long nIndels = 0; - @DataPoint(description = "Indel Novelty Rate") - public String IndelNoveltyRate = "NA"; - @DataPoint(description = "Mean number of Indels per individual") - public long nIndelsPerSample = 0; - - @DataPoint(description = "Number of SVs") - public long nSVs = 0; - @DataPoint(description = "SV Novelty Rate") - public String SVNoveltyRate = "NA"; - @DataPoint(description = "Mean number of SVs per individual") - public long nSVsPerSample = 0; - - Map allVariantCounts, knownVariantCounts; - Map> countsPerSample; - - private final Map makeCounts() { - Map counts = new EnumMap(VariantContext.Type.class); - counts.put(VariantContext.Type.SNP, 0); - counts.put(VariantContext.Type.INDEL, 0); - counts.put(VariantContext.Type.SYMBOLIC, 0); - return counts; - } - - public void initialize(VariantEvalWalker walker) { - countsPerSample = new HashMap>(); - nSamples = walker.getSampleNamesForEvaluation().size(); - - for ( String sample : walker.getSampleNamesForEvaluation() ) { - countsPerSample.put(sample, makeCounts()); - } - - allVariantCounts = makeCounts(); - knownVariantCounts = makeCounts(); - } - - @Override public boolean enabled() { return true; } - - public int getComparisonOrder() { - return 2; // we only need to see each eval track - } - - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( eval == null || eval.isMonomorphicInSamples() ) return null; - - switch (eval.getType()) { - case SNP: - case INDEL: - case SYMBOLIC: - allVariantCounts.put(eval.getType(), allVariantCounts.get(eval.getType()) + 1); - if ( comp != null ) - knownVariantCounts.put(eval.getType(), knownVariantCounts.get(eval.getType()) + 1); - break; - default: - throw new UserException.BadInput("Unexpected variant context type: " + eval); - } - - // count variants per sample - for (final Genotype g : eval.getGenotypes()) { - if ( ! g.isNoCall() && ! g.isHomRef() ) { - int count = countsPerSample.get(g.getSampleName()).get(eval.getType()); - countsPerSample.get(g.getSampleName()).put(eval.getType(), count + 1); - } - } - - return null; // we don't capture any interesting sites - } - - private final int perSampleMean(VariantContext.Type type) { - long sum = 0; - for ( Map count : countsPerSample.values() ) { - sum += count.get(type); - } - return (int)(Math.round(sum / (1.0 * countsPerSample.size()))); - } - - private final String noveltyRate(VariantContext.Type type) { - int all = allVariantCounts.get(type); - int known = knownVariantCounts.get(type); - int novel = all - known; - double rate = (novel / (1.0 * all)); - return all == 0 ? "NA" : String.format("%.2f", rate); - } - - public void finalizeEvaluation() { - nSNPs = allVariantCounts.get(VariantContext.Type.SNP); - nIndels = allVariantCounts.get(VariantContext.Type.INDEL); - nSVs = allVariantCounts.get(VariantContext.Type.SYMBOLIC); - - nSNPsPerSample = perSampleMean(VariantContext.Type.SNP); - nIndelsPerSample = perSampleMean(VariantContext.Type.INDEL); - nSVsPerSample = perSampleMean(VariantContext.Type.SYMBOLIC); - - SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP); - IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL); - SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java index 70b37f500..4f5aeed61 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypeConcordance.java @@ -445,39 +445,6 @@ class SampleStats implements TableType { } } -/** - * Sample stats, but for AC - */ -class ACStats extends SampleStats { - private String[] rowKeys; - - public ACStats(VariantContext evalvc, VariantContext compvc, int nGenotypeTypes) { - super(nGenotypeTypes); - rowKeys = new String[1+2*evalvc.getGenotypes().size()+1+2*compvc.getGenotypes().size()]; - for ( int i = 0; i <= 2*evalvc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here... - concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]); - rowKeys[i] = String.format("evalAC%d",i); - - } - - for ( int i = 0; i <= 2*compvc.getGenotypes().size(); i++ ) { - concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); - rowKeys[1+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); - } - } - - public String getName() { - return "Allele Count Statistics"; - } - - public Object[] getRowKeys() { - if ( rowKeys == null ) { - throw new StingException("RowKeys is null!"); - } - return rowKeys; - } -} - /** * a table of sample names to genotype concordance summary statistics */ @@ -637,79 +604,3 @@ class SampleSummaryStats implements TableType { } } -/** - * SampleSummaryStats .. but for allele counts - */ -class ACSummaryStats extends SampleSummaryStats { - private String[] rowKeys; - - public ACSummaryStats (final VariantContext evalvc, final VariantContext compvc) { - concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - rowKeys = new String[3+2*evalvc.getGenotypes().size() + 2*compvc.getGenotypes().size()]; - rowKeys[0] = ALL_SAMPLES_KEY; - for( int i = 0; i <= 2*evalvc.getGenotypes().size() ; i ++ ) { - concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]); - rowKeys[i+1] = String.format("evalAC%d",i); - } - for( int i = 0; i <= 2*compvc.getGenotypes().size() ; i ++ ) { - concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); - rowKeys[2+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); - } - - } - - public String getName() { - return "Allele Count Summary Statistics"; - } - - public Object[] getRowKeys() { - if ( rowKeys == null) { - throw new StingException("rowKeys is null!!"); - } - return rowKeys; - } -} - -class CompACNames implements Comparator{ - - final Logger myLogger; - private boolean info = true; - - public CompACNames(Logger l) { - myLogger = l; - } - - public boolean equals(Object o) { - return ( o.getClass() == CompACNames.class ); - } - - public int compare(Object o1, Object o2) { - if ( info ) { - myLogger.info("Sorting AC names"); - info = false; - } - //System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2)); - return getRank(o1) - getRank(o2); - } - - public int getRank(Object o) { - if ( o.getClass() != String.class ) { - return Integer.MIN_VALUE/4; - } else { - String s = (String) o; - if ( s.startsWith("eval") ) { - return Integer.MIN_VALUE/4 + 1 + parseAC(s); - } else if ( s.startsWith("comp") ) { - return 1+ parseAC(s); - } else { - return Integer.MIN_VALUE/4; - } - } - } - - public int parseAC(String s) { - String[] g = s.split("AC"); - return Integer.parseInt(g[1]); - } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index 17d7171b8..9de850d82 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -16,19 +16,19 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv long nTi = 0; @DataPoint(description = "number of transversion loci") long nTv = 0; - @DataPoint(description = "the transition to transversion ratio") + @DataPoint(description = "the transition to transversion ratio", format = "%.2f") double tiTvRatio = 0.0; @DataPoint(description = "number of comp transition sites") long nTiInComp = 0; @DataPoint(description = "number of comp transversion sites") long nTvInComp = 0; - @DataPoint(description = "the transition to transversion ratio for comp sites") + @DataPoint(description = "the transition to transversion ratio for comp sites", format = "%.2f") double TiTvRatioStandard = 0.0; @DataPoint(description = "number of derived transition loci") long nTiDerived = 0; @DataPoint(description = "number of derived transversion loci") long nTvDerived = 0; - @DataPoint(description = "the derived transition to transversion ratio") + @DataPoint(description = "the derived transition to transversion ratio", format = "%.2f") double tiTvDerivedRatio = 0.0; public boolean enabled() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 1a0591e9d..86d3467fb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -30,10 +30,10 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { @DataPoint(description = "FN") int FN = 0; @DataPoint(description = "TN") int TN = 0; - @DataPoint(description = "Sensitivity") double sensitivity = 0; - @DataPoint(description = "Specificity") double specificity = 0; - @DataPoint(description = "PPV") double PPV = 0; - @DataPoint(description = "FDR") double FDR = 0; + @DataPoint(description = "Sensitivity", format = "%.2f") double sensitivity = 0; + @DataPoint(description = "Specificity", format = "%.2f") double specificity = 0; + @DataPoint(description = "PPV", format = "%.2f") double PPV = 0; + @DataPoint(description = "FDR", format = "%.2f") double FDR = 0; @DataPoint(description = "CompMonoEvalNoCall") int CompMonoEvalNoCall = 0; @DataPoint(description = "CompMonoEvalFiltered") int CompMonoEvalFiltered = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java new file mode 100644 index 000000000..ba7164400 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantSummary.java @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; +import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.Collection; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.Map; + +@Analysis(description = "1000 Genomes Phase I summary of variants table") +public class VariantSummary extends VariantEvaluator implements StandardEval { + // basic counts on various rates found + @DataPoint(description = "Number of samples") + public long nSamples = 0; + + @DataPoint(description = "Number of processed loci") + public long nProcessedLoci = 0; + + @DataPoint(description = "Number of SNPs") + public long nSNPs = 0; + @DataPoint(description = "Overall TiTv ratio", format = "%.2f") + public double TiTvRatio = 0; + @DataPoint(description = "SNP Novelty Rate") + public String SNPNoveltyRate = "NA"; + @DataPoint(description = "Mean number of SNPs per individual") + public long nSNPsPerSample = 0; + @DataPoint(description = "Mean TiTv ratio per individual", format = "%.2f") + public double TiTvRatioPerSample = 0; + @DataPoint(description = "Mean depth of coverage per sample at SNPs", format = "%.1f") + public double SNPDPPerSample = 0; + + @DataPoint(description = "Number of Indels") + public long nIndels = 0; + @DataPoint(description = "Indel Novelty Rate") + public String IndelNoveltyRate = "NA"; + @DataPoint(description = "Mean number of Indels per individual") + public long nIndelsPerSample = 0; + @DataPoint(description = "Mean depth of coverage per sample at Indels", format = "%.1f") + public double IndelDPPerSample = 0; + + @DataPoint(description = "Number of SVs") + public long nSVs = 0; + @DataPoint(description = "SV Novelty Rate") + public String SVNoveltyRate = "NA"; + @DataPoint(description = "Mean number of SVs per individual") + public long nSVsPerSample = 0; + + TypeSampleMap allVariantCounts, knownVariantCounts; + TypeSampleMap countsPerSample; + TypeSampleMap transitionsPerSample, transversionsPerSample; + TypeSampleMap depthPerSample; + + private final static String ALL = "ALL"; + + private class TypeSampleMap extends EnumMap> { + public TypeSampleMap(final Collection samples) { + super(VariantContext.Type.class); + for ( VariantContext.Type type : VariantContext.Type.values() ) { + Map bySample = new HashMap(samples.size()); + for ( final String sample : samples ) { + bySample.put(sample, 0); + } + bySample.put(ALL, 0); + this.put(type, bySample); + } + } + + public final void inc(final VariantContext.Type type, final String sample) { + final int count = this.get(type).get(sample); + get(type).put(sample, count + 1); + } + + public final int all(VariantContext.Type type) { + return get(type).get(ALL); + } + + public final int meanValue(VariantContext.Type type) { + long sum = 0; + int n = 0; + for ( final Map.Entry pair : get(type).entrySet() ) { + if ( pair.getKey() != ALL) { + n++; + sum += pair.getValue(); + } + } + return (int)(Math.round(sum / (1.0 * n))); + } + + public final double ratioValue(VariantContext.Type type, TypeSampleMap denoms, boolean allP) { + double sum = 0; + int n = 0; + for ( final String sample : get(type).keySet() ) { + if ( (allP && sample == ALL) || (!allP && sample != ALL) ) { + final long num = get(type).get(sample); + final long denom = denoms.get(type).get(sample); + sum += ratio(num, denom); + n++; + } + } + return Math.round(sum / (1.0 * n)); + } + } + + + public void initialize(VariantEvalWalker walker) { + nSamples = walker.getSampleNamesForEvaluation().size(); + countsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + transitionsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + transversionsPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + allVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + knownVariantCounts = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + depthPerSample = new TypeSampleMap(walker.getSampleNamesForEvaluation()); + } + + @Override public boolean enabled() { return true; } + + public int getComparisonOrder() { + return 2; // we only need to see each eval track + } + + public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); + } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( eval == null || eval.isMonomorphicInSamples() ) return null; + + TypeSampleMap titvTable = null; + + switch (eval.getType()) { + case SNP: + titvTable = VariantContextUtils.isTransition(eval) ? transitionsPerSample : transversionsPerSample; + titvTable.inc(eval.getType(), ALL); + case INDEL: + case SYMBOLIC: + allVariantCounts.inc(eval.getType(), ALL); + if ( comp != null ) + knownVariantCounts.inc(eval.getType(), ALL); + if ( eval.hasAttribute(VCFConstants.DEPTH_KEY) ) + depthPerSample.inc(eval.getType(), ALL); + break; + default: + throw new UserException.BadInput("Unexpected variant context type: " + eval); + } + + // per sample metrics + for (final Genotype g : eval.getGenotypes()) { + if ( ! g.isNoCall() && ! g.isHomRef() ) { + countsPerSample.inc(eval.getType(), g.getSampleName()); + + // update transition / transversion ratio + if ( titvTable != null ) titvTable.inc(eval.getType(), g.getSampleName()); + + if ( g.hasAttribute(VCFConstants.DEPTH_KEY) ) + depthPerSample.inc(eval.getType(), g.getSampleName()); + } + } + + return null; // we don't capture any interesting sites + } + + private final String noveltyRate(VariantContext.Type type) { + final int all = allVariantCounts.all(type); + final int known = knownVariantCounts.all(type); + final int novel = all - known; + final double rate = (novel / (1.0 * all)); + return all == 0 ? "NA" : String.format("%.2f", rate); + } + + public void finalizeEvaluation() { + nSNPs = allVariantCounts.all(VariantContext.Type.SNP); + nIndels = allVariantCounts.all(VariantContext.Type.INDEL); + nSVs = allVariantCounts.all(VariantContext.Type.SYMBOLIC); + + TiTvRatio = transitionsPerSample.ratioValue(VariantContext.Type.SNP, transversionsPerSample, true); + TiTvRatioPerSample = transitionsPerSample.ratioValue(VariantContext.Type.SNP, transversionsPerSample, false); + + nSNPsPerSample = countsPerSample.meanValue(VariantContext.Type.SNP); + nIndelsPerSample = countsPerSample.meanValue(VariantContext.Type.INDEL); + nSVsPerSample = countsPerSample.meanValue(VariantContext.Type.SYMBOLIC); + + SNPNoveltyRate = noveltyRate(VariantContext.Type.SNP); + IndelNoveltyRate = noveltyRate(VariantContext.Type.INDEL); + SVNoveltyRate = noveltyRate(VariantContext.Type.SYMBOLIC); + + SNPDPPerSample = depthPerSample.meanValue(VariantContext.Type.SNP); + IndelDPPerSample = depthPerSample.meanValue(VariantContext.Type.INDEL); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java index 396843252..90a6b97e0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/DataPoint.java @@ -6,4 +6,5 @@ import java.lang.annotation.RetentionPolicy; @Retention(RetentionPolicy.RUNTIME) public @interface DataPoint { String description() default ""; // the description, optional + String format() default ""; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 2c57d475c..cb44ca522 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -246,7 +246,7 @@ public class VariantEvalUtils { field.setAccessible(true); if (!(field.get(vei) instanceof TableType)) { - table.addColumn(field.getName(), 0.0); + table.addColumn(field.getName(), 0.0, datamap.get(field).format()); } } } catch (InstantiationException e) { @@ -297,6 +297,7 @@ public class VariantEvalUtils { * Additional variant contexts per sample are automatically generated and added to the map unless the sample name * matches the ALL_SAMPLE_NAME constant. * + * * @param tracker the metadata tracker * @param ref the reference context * @param tracks the list of tracks to process @@ -308,7 +309,7 @@ public class VariantEvalUtils { * * @return the mapping of track to VC list that should be populated */ - public HashMap, HashMap>> + public HashMap, HashMap>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List> tracks, @@ -319,11 +320,11 @@ public class VariantEvalUtils { if ( tracker == null ) return null; - HashMap, HashMap>> bindings = new HashMap, HashMap>>(); + HashMap, HashMap>> bindings = new HashMap, HashMap>>(); RodBinding firstTrack = tracks.isEmpty() ? null : tracks.get(0); for ( RodBinding track : tracks ) { - HashMap> mapping = new HashMap>(); + HashMap> mapping = new HashMap>(); for ( VariantContext vc : tracker.getValues(track, ref.getLocus()) ) { @@ -352,9 +353,9 @@ public class VariantEvalUtils { if ( mergeTracks && bindings.containsKey(firstTrack) ) { // go through each binding of sample -> value and add all of the bindings from this entry - HashMap> firstMapping = bindings.get(firstTrack); - for ( Map.Entry> elt : mapping.entrySet() ) { - Set firstMappingSet = firstMapping.get(elt.getKey()); + HashMap> firstMapping = bindings.get(firstTrack); + for ( Map.Entry> elt : mapping.entrySet() ) { + Collection firstMappingSet = firstMapping.get(elt.getKey()); if ( firstMappingSet != null ) { firstMappingSet.addAll(elt.getValue()); } else { @@ -369,9 +370,9 @@ public class VariantEvalUtils { return bindings; } - private void addMapping(HashMap> mappings, String sample, VariantContext vc) { + private void addMapping(HashMap> mappings, String sample, VariantContext vc) { if ( !mappings.containsKey(sample) ) - mappings.put(sample, new LinkedHashSet()); + mappings.put(sample, new ArrayList(1)); mappings.get(sample).add(vc); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 102d4715e..403ecce78 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -30,7 +30,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("a36414421621b377d6146d58d2fcecd0") + Arrays.asList("abe943d1aac120d7e75b9b9e5dac2399") ); executeTest("testFunctionClassWithSnpeff", spec); } @@ -50,7 +50,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("6a71b17c19f5914c277a99f45f5d9c39") + Arrays.asList("5fd9624c7a35ffb79d0feb1e233fc757") ); executeTest("testStratifySamplesAndExcludeMonomorphicSites", spec); } @@ -70,7 +70,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("fb926edfd3d811e18b33798a43ef4379") + Arrays.asList("4a8765cd02d36e63f6d0f0c10a6c674b") ); executeTest("testFundamentalsCountVariantsSNPsandIndels", spec); } @@ -91,7 +91,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("26b7d57e3a204ac80a28cb29485b59b7") + Arrays.asList("4106ab8f742ad1c3138c29220151503c") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -113,7 +113,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("1df8184062f330bea9da8bacacc5a09d") + Arrays.asList("6cee3a8d68307a118944f2df5401ac89") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } @@ -134,7 +134,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("927f26414509db9e7c0a2c067d57c949") + Arrays.asList("af5dd27354d5dfd0d2fe03149af09b55") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithCpG", spec); } @@ -155,7 +155,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("e6fddefd95122cabc5a0f0b95bce6d34") + Arrays.asList("062a231e203671e19aa9c6507710d762") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithFunctionalClass", spec); } @@ -176,7 +176,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("df10486dae73a9cf8c647964f51ba3e0") + Arrays.asList("75abdd2b17c0a5e04814b6969a3d4d7e") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithDegeneracy", spec); } @@ -197,7 +197,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("524adb0b7ff70e227b8803a88f36713e") + Arrays.asList("bdbb5f8230a4a193058750c5e506c733") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithSample", spec); } @@ -220,7 +220,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("ef6449789dfc032602458b7c5538a1bc") + Arrays.asList("f076120da22930294840fcc396f5f141") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithJexlExpression", spec); } @@ -245,7 +245,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("13b90e94fa82d72bb04a0a5addb27c3f") + Arrays.asList("69201f4a2a7a44b38805a4aeeb8830b6") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithMultipleJexlExpressions", spec); } @@ -264,7 +264,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("8458b9d7803d75aae551fac7dbd152d6") + Arrays.asList("c3bd3cb6cfb21a8c2b4d5f69104bf6c2") ); executeTest("testFundamentalsCountVariantsNoCompRod", spec); } @@ -277,7 +277,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" + " --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf"; WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s", - 1, Arrays.asList("b954dee127ec4205ed7d33c91aa3e045")); + 1, Arrays.asList("861f94e3237d62bd5bc00757319241f7")); executeTestParallel("testSelect1", spec); } @@ -294,7 +294,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("ae0027197547731a9a5c1eec5fbe0221")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("955c33365e017679047fabec0f14d5e0")); executeTestParallel("testCompVsEvalAC",spec); } @@ -312,7 +312,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompOverlap() { String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + validationDataLocation + "VariantEval/pacbio.hg19.intervals --comp:comphapmap " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf --eval " + validationDataLocation + "VariantEval/pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("009ecc8376a20dce81ff5299ef6bfecb")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("fb7d989e44bd74c5376cb5732f9f3f64")); executeTestParallel("testCompOverlap",spec); } @@ -324,7 +324,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --dbsnp " + b37dbSNP132 + " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("835b44fc3004cc975c968c9f92ed25d6")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("da5bcb305c5ef207ce175821efdbdefd")); executeTestParallel("testEvalTrackWithoutGenotypes",spec); } @@ -336,7 +336,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { " --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" + " --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" + " -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("f0e003f1293343c3210ae95e8936b19a")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("fde839ece1442388f21a2f0b936756a8")); executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec); } @@ -353,13 +353,13 @@ public class VariantEvalIntegrationTest extends WalkerTest { " -noST -noEV -ST Novelty -EV CompOverlap" + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("0b81d97f843ec4a1a4222d1f9949bfca")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("1efae6b3b88c752b771e0c8fae24464e")); executeTestParallel("testMultipleCompTracks",spec); } @Test - public void testPerSampleAndSubsettedSampleHaveSameResults() { - String md5 = "7425ca5c439afd7bb33ed5cfea02c2b3"; + public void testPerSampleAndSubsettedSampleHaveSameResults1() { + String md5 = "bc9bcabc3105e2515d9a2d41506d2de1"; WalkerTestSpec spec = new WalkerTestSpec( buildCommandLine( @@ -414,7 +414,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("924b6123edb9da540d0abc66f6f33e16") + Arrays.asList("e53546243250634fc03e83b4e61ec55f") ); executeTest("testAlleleCountStrat", spec); } @@ -435,7 +435,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("9794e2dba205c6929dc89899fdf0bf6b") + Arrays.asList("c8086f0525bc13e666afeb670c2e13ae") ); executeTest("testIntervalStrat", spec); }