From b0432ee1e2b10f6ad258e43b1d708f455317b207 Mon Sep 17 00:00:00 2001 From: kiran Date: Mon, 31 Jan 2011 17:03:41 +0000 Subject: [PATCH] First part of a two-stage commit. Removing old VariantEval to make room for VariantEval 3.0 in core. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5137 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/varianteval/CompOverlap.java | 107 -- .../varianteval/CountFunctionalClasses.java | 77 -- .../walkers/varianteval/CountVariants.java | 150 --- .../varianteval/GenotypeConcordance.java | 789 --------------- .../varianteval/GenotypePhasingEvaluator.java | 419 -------- .../varianteval/IndelLengthHistogram.java | 114 --- .../walkers/varianteval/IndelMetricsByAC.java | 214 ---- .../walkers/varianteval/IndelStatistics.java | 508 ---------- .../MendelianViolationEvaluator.java | 182 ---- .../walkers/varianteval/PrintMissingComp.java | 67 -- .../varianteval/SimpleMetricsByAC.java | 175 ---- .../walkers/varianteval/StandardEval.java | 28 - .../varianteval/ThetaVariantEvaluator.java | 136 --- .../varianteval/TiTvVariantEvaluator.java | 68 -- .../varianteval/VariantEvalWalker.java | 918 ------------------ .../walkers/varianteval/VariantEvaluator.java | 101 -- .../varianteval/VariantQualityScore.java | 258 ----- 17 files changed, 4311 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CompOverlap.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CountFunctionalClasses.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CountVariants.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelLengthHistogram.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelMetricsByAC.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/MendelianViolationEvaluator.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/PrintMissingComp.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/SimpleMetricsByAC.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/StandardEval.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/ThetaVariantEvaluator.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/TiTvVariantEvaluator.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvaluator.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantQualityScore.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CompOverlap.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CompOverlap.java deleted file mode 100755 index 6edb5c06d..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CompOverlap.java +++ /dev/null @@ -1,107 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.Allele; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; - -/** - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - *

- * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -@Analysis(name = "Comp Overlap", description = "the overlap between eval and comp sites") -public class CompOverlap extends VariantEvaluator implements StandardEval { - - @DataPoint(name = "eval sites", description = "number of eval SNP sites") - long nEvalSNPs = 0; - - @DataPoint(name = "comp sites", description = "number of comp SNP sites") - long nCompSNPs = 0; - - @DataPoint(name = "evals not at comp", description = "number of eval sites outside of comp sites") - long novelSites = 0; - - @DataPoint(name = "evals at comp", description = "number of eval sites at comp sites") - long nSNPsAtComp = 0; - - @DataPoint(name = "% evals at comp", description = "percentage of eval sites at comp sites") - double compRate = 0.0; - - @DataPoint(name = "concordant", description = "number of concordant sites") - long nConcordant = 0; - - @DataPoint(name = "% concordant", description = "the concordance rate") - double concordantRate = 0.0; - - private boolean expectingIndels = false; - - public CompOverlap(VariantEvalWalker parent) { - super(parent); - expectingIndels = parent.dels; - } - - public String getName() { - return "compOverlap"; - } - - public int getComparisonOrder() { - return 2; // we need to see each eval track and each comp track - } - - public long nNovelSites() { return nEvalSNPs - nSNPsAtComp; } - public double compRate() { return rate(nSNPsAtComp, nEvalSNPs); } - public double concordanceRate() { return rate(nConcordant, nSNPsAtComp); } - - public void finalizeEvaluation() { - compRate = 100 * compRate(); - concordantRate = 100 * concordanceRate(); - novelSites = nNovelSites(); - } - - public boolean enabled() { - return true; - } - - /** - * Returns true if every allele in eval is also in comp - * - * @param eval eval context - * @param comp db context - * @return true if eval and db are discordant - */ - public boolean discordantP(VariantContext eval, VariantContext comp) { - for (Allele a : eval.getAlleles()) { - if (!comp.hasAllele(a, true)) - return true; - } - - return false; - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - boolean compIsGood = expectingIndels ? comp != null && comp.isNotFiltered() && comp.isIndel() : comp != null && comp.isNotFiltered() && comp.isSNP() ; - boolean evalIsGood = expectingIndels ? eval != null && eval.isIndel() : eval != null && eval.isSNP() ; - - if ( compIsGood ) nCompSNPs++; // count the number of comp events - if (evalIsGood) nEvalSNPs++; // count the number of eval events - - if (compIsGood && evalIsGood) { - nSNPsAtComp++; - - if (!discordantP(eval, comp)) // count whether we're concordant or not with the comp value - nConcordant++; - } - - return null; // we don't capture any interesting sites - } - - -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CountFunctionalClasses.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CountFunctionalClasses.java deleted file mode 100755 index 4cd502a00..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CountFunctionalClasses.java +++ /dev/null @@ -1,77 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; - -@Analysis(name = "Count Functional Classes", description = "Counts instances of different functional variant classes (provided the variants are annotated with that information)") -public class CountFunctionalClasses extends VariantEvaluator { - // the following fields are in output order: - @DataPoint(description = "miRNA") - long nMiRNA= 0; - - @DataPoint(description = "3'-UTR") - long nUTR3 = 0; - - @DataPoint(description = "Intron") - long nIntron = 0; - - @DataPoint(description = "Splice-site") - long nSpliceSite= 0; - - @DataPoint(description = "Read-through") - long nReadThrough = 0; - - @DataPoint(description = "Nonsense") - long nNonsense = 0; - - @DataPoint(description = "Missense") - long nMissense = 0; - - @DataPoint(description = "Synonymous") - long nSynonymous = 0; - - @DataPoint(description = "5'-UTR") - long nUTR5= 0; - - @DataPoint(description = "Promoter") - long nPromoter = 0; - - public CountFunctionalClasses(VariantEvalWalker parent) { - super(parent); - } - - public String getName() { - return "functionalclasses"; - } - - public boolean enabled() { - return false; - } - - public int getComparisonOrder() { - return 1; - } - - public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - String type = vc1.getAttributeAsString("type"); - - if (type != null) { - if (type.equalsIgnoreCase("miRNA")) { nMiRNA++; } - else if (type.equalsIgnoreCase("3'-UTR")) { nUTR3++; } - else if (type.equalsIgnoreCase("Intron")) { nIntron++; } - else if (type.equalsIgnoreCase("Splice_site")) { nSpliceSite++; } - else if (type.equalsIgnoreCase("Read-through")) { nReadThrough++; } - else if (type.equalsIgnoreCase("Nonsense")) { nNonsense++; } - else if (type.equalsIgnoreCase("Missense")) { nMissense++; } - else if (type.equalsIgnoreCase("Synonymous")) { nSynonymous++; } - else if (type.equalsIgnoreCase("5'-UTR")) { nUTR5++; } - else if (type.equalsIgnoreCase("Promoter")) { nPromoter++; } - } - - return null; - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CountVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CountVariants.java deleted file mode 100755 index f040b2e62..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/CountVariants.java +++ /dev/null @@ -1,150 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - - -@Analysis(name = "Count Variants", description = "Counts different classes of variants in the sample") -public class CountVariants extends VariantEvaluator implements StandardEval { - - // the following fields are in output order: - - // basic counts on various rates found - @DataPoint(description = "Number of processed loci") - long nProcessedLoci = 0; - @DataPoint(description = "Number of called loci") - long nCalledLoci = 0; - @DataPoint(description = "Number of reference loci") - long nRefLoci = 0; - @DataPoint(description = "Number of variant loci") - long nVariantLoci = 0; - - // the following two calculations get set in the finalizeEvaluation - @DataPoint(description = "Variants per loci rate") - double variantRate = 0; - @DataPoint(description = "Number of variants per base") - double variantRatePerBp = 0; - - - @DataPoint(description = "Number of snp loci") - long nSNPs = 0; - @DataPoint(description = "Number of insertions") - long nInsertions = 0; - @DataPoint(description = "Number of deletions") - long nDeletions = 0; - @DataPoint(description = "Number of complex loci") - long nComplex = 0; - - @DataPoint(description = "Number of no calls loci") - long nNoCalls = 0; - @DataPoint(description = "Number of het loci") - long nHets = 0; - @DataPoint(description = "Number of hom ref loci") - long nHomRef = 0; - @DataPoint(description = "Number of hom var loci") - long nHomVar = 0; - - // calculations that get set in the finalizeEvaluation method - @DataPoint(description = "heterozygosity per locus rate") - double heterozygosity = 0; - @DataPoint(description = "heterozygosity per base pair") - double heterozygosityPerBp = 0; - @DataPoint(description = "heterozygosity to homozygosity ratio") - double hetHomRatio = 0; - @DataPoint(description = "indel rate (insertion count + deletion count)") - double indelRate = 0; - @DataPoint(description = "indel rate per base pair") - double indelRatePerBp = 0; - @DataPoint(description = "deletion to insertion ratio") - double deletionInsertionRatio = 0; - - public CountVariants(VariantEvalWalker parent) { - super(parent); - } - - private double perLocusRate(long n) { - return rate(n, nProcessedLoci); - } - - private long perLocusRInverseRate(long n) { - return inverseRate(n, nProcessedLoci); - } - - public String getName() { - return "counter"; - } - - public boolean enabled() { - return true; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - nProcessedLoci += context.getSkippedBases() + (ref == null ? 0 : 1); - } - - public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - //nProcessedLoci++; - nCalledLoci++; - - if (vc1.isVariant()) nVariantLoci++; - switch (vc1.getType()) { - case NO_VARIATION: - nRefLoci++; - break; - case SNP: - nSNPs++; - break; - case INDEL: - if (vc1.isInsertion()) nInsertions++; - else nDeletions++; - break; - case MIXED: - nComplex++; - break; - default: - throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType()); - } - - for (Genotype g : vc1.getGenotypes().values()) { - switch (g.getType()) { - case NO_CALL: - nNoCalls++; - break; - case HOM_REF: - nHomRef++; - break; - case HET: - nHets++; - break; - case HOM_VAR: - nHomVar++; - break; - default: - throw new ReviewedStingException("BUG: Unexpected genotype type: " + g); - } - } - - return null; // we don't capture any interesting sites - } - - public void finalizeEvaluation() { - variantRate = perLocusRate(nVariantLoci); - variantRatePerBp = perLocusRInverseRate(nVariantLoci); - heterozygosity = perLocusRate(nHets); - heterozygosityPerBp = perLocusRInverseRate(nHets); - hetHomRatio = ratio(nHets, nHomVar); - indelRate = perLocusRate(nDeletions + nInsertions); - indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions); - deletionInsertionRatio = ratio(nDeletions, nInsertions); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java deleted file mode 100755 index c61cd74ec..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypeConcordance.java +++ /dev/null @@ -1,789 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.apache.commons.lang.ArrayUtils; -import org.broad.tribble.util.variantcontext.Allele; -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFConstants; -import org.broadinstitute.sting.gatk.contexts.*; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.*; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -@Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks") -public class GenotypeConcordance extends VariantEvaluator implements StandardEval { - private static final boolean PRINT_INTERESTING_SITES = true; - - protected final static Logger logger = Logger.getLogger(GenotypeConcordance.class); - - // a mapping from allele count to stats - @DataPoint(description = "the frequency statistics for each allele") - FrequencyStats alleleFreqStats = new FrequencyStats(); - - // a mapping from sample to stats - @DataPoint(name="samples", description = "the concordance statistics for each sample") - SampleStats sampleStats = null; - - // a mapping from sample to stats summary - @DataPoint(name="summary", description = "the concordance statistics summary for each sample") - SampleSummaryStats sampleSummaryStats = null; - - // two histograms of variant quality scores, for true positive and false positive calls - @DataPoint(description = "the variant quality score histograms for true positive and false positive calls") - QualityScoreHistograms qualityScoreHistograms = null; - - @DataPoint(description = "the concordance statistics summary by allele count") - ACSummaryStats alleleCountSummary = null; - - @DataPoint(description = "the concordance statistics by allele count") - ACStats alleleCountStats = null; - - private static final int MAX_MISSED_VALIDATION_DATA = 100; - - private boolean discordantInteresting = false; - - private VariantEvalWalker.EvaluationContext group = null; - - static class FrequencyStats implements TableType { - class Stats { - public Stats(int found, int missed) { nFound = found; nMissed = missed; } - public long nFound = 0; - public long nMissed = 0; - } - public HashMap foundMissedByAC = new HashMap(); - - public Object[] getRowKeys() { - String rows[] = new String[foundMissedByAC.size()]; - int index = 0; - for (int i : foundMissedByAC.keySet()) rows[index++] = "Allele Count " + i; - return rows; - } - - public Object[] getColumnKeys() { - return new String[]{"number_found", "number_missing"}; - } - - public String getName() { - return "FrequencyStats"; - } - - public String getCell(int x, int y) { - if (x >= foundMissedByAC.size()) throw new IllegalStateException(x + " is greater than the max index of " + (foundMissedByAC.size()-1)); - if (y == 0) return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nFound); - else return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nMissed); - } - - public void incrementFoundCount(int alleleFreq) { - if (!foundMissedByAC.containsKey(alleleFreq)) - foundMissedByAC.put(alleleFreq,new Stats(1,0)); - else - foundMissedByAC.get(alleleFreq).nFound++; - } - - public void incrementMissedCount(int alleleFreq) { - if (!foundMissedByAC.containsKey(alleleFreq)) - foundMissedByAC.put(alleleFreq,new Stats(0,1)); - else - foundMissedByAC.get(alleleFreq).nMissed++; - } - } - - static class QualityScoreHistograms implements TableType { - final static int NUM_BINS = 20; - final HashMap truePositiveQualityScoreMap = new HashMap(); // A HashMap holds all the quality scores until we are able to bin them appropriately - final HashMap falsePositiveQualityScoreMap = new HashMap(); - final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out - final int falsePositiveHist[] = new int[NUM_BINS]; - final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"}; - - public Object[] getRowKeys() { - return rowKeys; - } - - public Object[] getColumnKeys() { - final String columnKeys[] = new String[NUM_BINS]; - for( int iii = 0; iii < NUM_BINS; iii++ ) { - columnKeys[iii] = "histBin" + iii; - } - return columnKeys; - } - - public String getName() { - return "QualityScoreHistogram"; - } - - public String getCell(int x, int y) { - if( x == 0 ) { - return String.valueOf(truePositiveHist[y]); - } else if ( x == 1 ) { - return String.valueOf(falsePositiveHist[y]); - } else { - throw new ReviewedStingException( "Unknown row in " + getName() + ", row = " + x ); - } - } - - public String toString() { - String returnString = ""; - // output both histogram arrays - returnString += "TP: "; - for( int iii = 0; iii < NUM_BINS; iii++ ) { - returnString += truePositiveHist[iii] + " "; - } - returnString += "\nFP: "; - for( int iii = 0; iii < NUM_BINS; iii++ ) { - returnString += falsePositiveHist[iii] + " "; - } - return returnString; - } - - public void incrValue( final double qual, final boolean isTruePositiveCall ) { - HashMap qualScoreMap; - if( isTruePositiveCall ) { - qualScoreMap = truePositiveQualityScoreMap; - } else { - qualScoreMap = falsePositiveQualityScoreMap; - } - final Integer qualKey = Math.round((float) qual); - if( qualScoreMap.containsKey(qualKey) ) { - qualScoreMap.put(qualKey, qualScoreMap.get(qualKey) + 1); - } else { - qualScoreMap.put(qualKey, 1); - } - } - - public void organizeHistogramTables() { - for( int iii = 0; iii < NUM_BINS; iii++ ) { - truePositiveHist[iii] = 0; - falsePositiveHist[iii] = 0; - } - - int maxQual = 0; - - // Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram - for( final Integer qual : truePositiveQualityScoreMap.keySet()) { - if( qual > maxQual ) { - maxQual = qual; - } - } - for( final Integer qual : falsePositiveQualityScoreMap.keySet()) { - if( qual > maxQual ) { - maxQual = qual; - } - } - - final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); //BUGBUG: should be normalized max to min, not max to 0 - - for( final Integer qual : truePositiveQualityScoreMap.keySet()) { - final int index = (int)Math.floor( ((double)qual) / binSize ); - if(index >= 0) { //BUGBUG: problem when maxQual is zero? - truePositiveHist[ index ] += truePositiveQualityScoreMap.get(qual); - } - } - for( final Integer qual : falsePositiveQualityScoreMap.keySet()) { - final int index = (int)Math.floor( ((double)qual) / binSize ); - if(index >= 0) { - falsePositiveHist[ index ] += falsePositiveQualityScoreMap.get(qual); - } - } - } - } - - // keep a list of the validation data we saw before the first eval data - private HashSet missedValidationData = new HashSet(); - - - public GenotypeConcordance(VariantEvalWalker parent) { - super(parent); - discordantInteresting = parent.DISCORDANT_INTERESTING; - } - - public String getName() { - return "genotypeConcordance"; - } - - public int getComparisonOrder() { - return 2; // we need to see each eval track and each comp track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName() + ": "; - } - - private boolean warnedAboutValidationData = false; - - public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { - this.group = group; - - String interesting = null; - - // sanity check that we at least have either eval or validation data - if (eval == null && !isValidVC(validation)) { - return interesting; - } - - if( qualityScoreHistograms == null ) { - qualityScoreHistograms = new QualityScoreHistograms(); - } - - if ( alleleCountStats == null && eval != null && validation != null ) { - alleleCountStats = new ACStats(eval,validation,Genotype.Type.values().length); - alleleCountSummary = new ACSummaryStats(eval, validation); - } - - if (sampleStats == null) { - if (eval != null) { - // initialize the concordance table - sampleStats = new SampleStats(eval,Genotype.Type.values().length); - sampleSummaryStats = new SampleSummaryStats(eval); - for (final VariantContext vc : missedValidationData) { - determineStats(null, vc); - } - missedValidationData = null; - } else { - // todo -- Eric, this results in a memory problem when eval is WEx data but you are using CG calls genome-wide - // todo -- perhaps you need should extend the evaluators with an initialize - // todo -- method that gets the header (or samples) for the first eval sites? - if (missedValidationData.size() > MAX_MISSED_VALIDATION_DATA) { - if (!warnedAboutValidationData) { - //logger.warn("Too many genotype sites missed before eval site appeared; ignoring"); - warnedAboutValidationData = true; - } - } else { - missedValidationData.add(validation); - } - return interesting; - } - } - - interesting = determineStats(eval, validation); - - return interesting; // we don't capture any interesting sites - } - - private String determineStats(final VariantContext eval, final VariantContext validation) { - - String interesting = null; - - final boolean validationIsValidVC = isValidVC(validation); - final String evalAC = ( vcHasGoodAC(eval) ) ? String.format("evalAC%d",getAC(eval)) : null ; - final String validationAC = ( vcHasGoodAC(validation) ) ? String.format("compAC%d",getAC(validation)) : null; - - // determine concordance for eval data - if (eval != null) { - for (final String sample : eval.getGenotypes().keySet()) { - final Genotype.Type called = eval.getGenotype(sample).getType(); - final Genotype.Type truth; - - if (!validationIsValidVC || !validation.hasGenotype(sample)) { - truth = Genotype.Type.NO_CALL; - } else { - truth = validation.getGenotype(sample).getType(); - // interesting = "ConcordanceStatus=FP"; - if (discordantInteresting && truth.ordinal() != called.ordinal()) - { - interesting = "ConcordanceStatus=" + truth + "/" + called; - } - } - - sampleStats.incrValue(sample, truth, called); - if ( evalAC != null && validationAC != null) { - alleleCountStats.incrValue(evalAC,truth,called); - alleleCountStats.incrValue(validationAC,truth,called); - } - } - } - // otherwise, mark no-calls for all samples - else { - final Genotype.Type called = Genotype.Type.NO_CALL; - - for (final String sample : validation.getGenotypes().keySet()) { - final Genotype.Type truth = validation.getGenotype(sample).getType(); - sampleStats.incrValue(sample, truth, called); - if ( evalAC != null ) { - alleleCountStats.incrValue(evalAC,truth,called); - } - // print out interesting sites - if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) { - if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) { - super.getVEWalker().gcLog.printf("%s FN %s%n", group, validation); - } - if ( (called == Genotype.Type.HOM_VAR || called == Genotype.Type.HET) && truth == Genotype.Type.HOM_REF ) { - super.getVEWalker().gcLog.printf("%s FP %s%n", group, validation); - } - } - } - } - - // determine allele count concordance () // this is really a FN rate estimate -- CH - if (validationIsValidVC && validation.isPolymorphic()) { - int trueAlleleCount = 0; - for (final Allele a : validation.getAlternateAlleles()) { - trueAlleleCount += validation.getChromosomeCount(a); - } - if (eval != null) { - alleleFreqStats.incrementFoundCount(trueAlleleCount); - } else { - alleleFreqStats.incrementMissedCount(trueAlleleCount); - } - } - - // TP & FP quality score histograms - if( eval != null && eval.isPolymorphic() && validationIsValidVC ) { - if( eval.getGenotypes().keySet().size() == 1 ) { // single sample calls - for( final String sample : eval.getGenotypes().keySet() ) { // only one sample - if( validation.hasGenotype(sample) ) { - final Genotype truth = validation.getGenotype(sample); - qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), !truth.isHomRef() ); - } - } - } else { // multi sample calls - qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), validation.isPolymorphic() ); - } - - } - - return interesting; - } - - private static boolean isValidVC(final VariantContext vc) { - return (vc != null && !vc.isFiltered()); - } - - public void finalizeEvaluation() { - if( qualityScoreHistograms != null ) { - qualityScoreHistograms.organizeHistogramTables(); - } - - if( sampleSummaryStats != null && sampleStats != null ) { - sampleSummaryStats.generateSampleSummaryStats( sampleStats ); - } - - if ( alleleCountSummary != null && alleleCountStats != null ) { - alleleCountSummary.generateSampleSummaryStats( alleleCountStats ); - } - } - - private boolean vcHasGoodAC(VariantContext vc) { - return ( vc != null && vc.getAlternateAlleles().size() == 1 && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ); - - } - - private int getAC(VariantContext vc) { - if ( List.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass()) ) { - return ((List) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)).get(0); - } else if ( Integer.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass())) { - return (Integer) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); - } else if ( String.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass()) ) { - // two ways of parsing - String ac = (String) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); - if ( ac.startsWith("[") ) { - return Integer.parseInt(ac.replaceAll("\\[","").replaceAll("\\]","")); - } else { - try { - return Integer.parseInt(ac); - } catch ( NumberFormatException e ) { - throw new UserException(String.format("The format of the AC field is improperly formatted: AC=%s",ac)); - } - } - } else { - throw new UserException(String.format("The format of the AC field does not appear to be of integer-list or String format, class was %s",vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass())); - } - } -} - -/** - * a table of sample names to genotype concordance figures - */ -class SampleStats implements TableType { - private final int nGenotypeTypes; - - // sample to concordance stats object - public final HashMap concordanceStats = new HashMap(); - - /** - * - * @return one row per sample - */ - public Object[] getRowKeys() { - return concordanceStats.keySet().toArray(new String[concordanceStats.size()]); - } - - /** - * increment the specified value - * @param sample the sample name - * @param truth the truth type - * @param called the called type - */ - public void incrValue(String sample, Genotype.Type truth, Genotype.Type called) { - if ( concordanceStats.containsKey(sample) ) - concordanceStats.get(sample)[truth.ordinal()][called.ordinal()]++; - else if ( called != Genotype.Type.NO_CALL ) - throw new UserException.CommandLineException("Sample " + sample + " has not been seen in a previous eval; this analysis module assumes that all samples are present in each variant context"); - } - - /** - * get the column keys - * @return a list of objects, in this case strings, that are the column names - */ - public Object[] getColumnKeys() { - return new String[]{"total_true_ref","%_ref/ref","n_ref/no-call", - "n_ref/ref","n_ref/het","n_ref/hom", - "total_true_het","%_het/het","n_het/no-call", - "n_het/ref","n_het/het","n_het/hom", - "total_true_hom","%_hom/hom","n_hom/no-call", - "n_hom/ref","n_hom/het","n_hom/hom"}; - } - - - public SampleStats(VariantContext vc, int nGenotypeTypes) { - this.nGenotypeTypes = nGenotypeTypes; - for (String sample : vc.getGenotypes().keySet()) - concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]); - } - - public SampleStats(int genotypeTypes) { - nGenotypeTypes = genotypeTypes; - } - - public Object getCell(int x, int y) { - // we have three rows of 6 right now for output (rows: ref, het, hom) - Genotype.Type type = Genotype.Type.values()[(y/6)+1]; // get the row type - // save some repeat work, get the total every time - long total = 0; - Object[] rowKeys = getRowKeys(); - for (int called = 0; called < nGenotypeTypes; called++) { - total += concordanceStats.get(rowKeys[x])[type.ordinal()][called]; - } - - // now get the cell they're interested in - switch (y % 6) { - case (0): // get the total_true for this type - return total; - case (1): - return total == 0 ? 0.0 : (100.0 * (double) concordanceStats.get(rowKeys[x])[type.ordinal()][type.ordinal()] / (double) total); - default: - return concordanceStats.get(rowKeys[x])[type.ordinal()][(y % 6) - 2]; - } - } - - public String getName() { - return "Sample Statistics"; - } -} - -/** - * Sample stats, but for AC - */ -class ACStats extends SampleStats { - private String[] rowKeys; - - public ACStats(VariantContext evalvc, VariantContext compvc, int nGenotypeTypes) { - super(nGenotypeTypes); - rowKeys = new String[1+2*evalvc.getGenotypes().size()+1+2*compvc.getGenotypes().size()]; - for ( int i = 0; i <= 2*evalvc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here... - concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]); - rowKeys[i] = String.format("evalAC%d",i); - - } - - for ( int i = 0; i <= 2*compvc.getGenotypes().size(); i++ ) { - concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); - rowKeys[1+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); - } - } - - public String getName() { - return "Allele Count Statistics"; - } - - public Object[] getRowKeys() { - if ( rowKeys == null ) { - throw new StingException("RowKeys is null!"); - } - return rowKeys; - } -} - -/** - * a table of sample names to genotype concordance summary statistics - */ -class SampleSummaryStats implements TableType { - protected final static String ALL_SAMPLES_KEY = "allSamples"; - protected final static String[] COLUMN_KEYS = new String[]{ - "percent_comp_ref_called_var", - "percent_comp_het_called_het", - "percent_comp_het_called_var", - "percent_comp_hom_called_hom", - "percent_comp_hom_called_var", - "percent_non-reference_sensitivity", - "percent_overall_genotype_concordance", - "percent_non-reference_discrepancy_rate"}; - - // sample to concordance stats object - protected final HashMap concordanceSummary = new HashMap(); - - /** - * - * @return one row per sample - */ - public Object[] getRowKeys() { - return concordanceSummary.keySet().toArray(new String[concordanceSummary.size()]); - } - - /** - * get the column keys - * @return a list of objects, in this case strings, that are the column names - */ - public Object[] getColumnKeys() { - return COLUMN_KEYS; - } - - public SampleSummaryStats(final VariantContext vc) { - concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - for( final String sample : vc.getGenotypes().keySet() ) { - concordanceSummary.put(sample, new double[COLUMN_KEYS.length]); - } - } - - public SampleSummaryStats() { - - } - - public Object getCell(int x, int y) { - final Object[] rowKeys = getRowKeys(); - return String.format("%.2f",concordanceSummary.get(rowKeys[x])[y]); - } - - /** - * Helper routine that sums up all columns / rows found in stats specified by all pairs in d1 x d2 - * - * @param stats - * @param d1 - * @param d2 - * @return - */ - private long sumStatsAllPairs( final long[][] stats, EnumSet d1, EnumSet d2 ) { - long sum = 0L; - - for(final Genotype.Type e1 : d1 ) { - for(final Genotype.Type e2 : d2 ) { - sum += stats[e1.ordinal()][e2.ordinal()]; - } - } - - return sum; - } - - private long sumStatsDiag( final long[][] stats, EnumSet d1) { - long sum = 0L; - - for(final Genotype.Type e1 : d1 ) { - sum += stats[e1.ordinal()][e1.ordinal()]; - } - - return sum; - } - - private double ratio(long numer, long denom) { - return denom != 0L ? 100.0 * ( ((double)numer) / ((double)denom) ) : 0.0; - } - - final long[] allSamplesNumerators = new long[COLUMN_KEYS.length]; - final long[] allSamplesDenominators = new long[COLUMN_KEYS.length]; - - private void updateSummaries(int i, double[] summary, long numer, long denom ) { - allSamplesNumerators[i] += numer; - allSamplesDenominators[i] += denom; - summary[i] = ratio(numer, denom); - } - - - /** - * Calculate the five summary stats per sample - * @param sampleStats The Map which holds concordance values per sample - */ - public void generateSampleSummaryStats( final SampleStats sampleStats ) { - EnumSet allVariantGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET); - EnumSet allCalledGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET, Genotype.Type.HOM_REF); - EnumSet allGenotypes = EnumSet.allOf(Genotype.Type.class); - - for( final String sample : concordanceSummary.keySet() ) { - if ( sample.equals(ALL_SAMPLES_KEY) ) continue; - - final long[][] stats = sampleStats.concordanceStats.get(sample); - final double[] summary = concordanceSummary.get(sample); - if( stats == null ) { throw new ReviewedStingException( "SampleStats and SampleSummaryStats contain different samples! sample = " + sample ); } - - long numer, denom; - - // Summary 0: % ref called as var - numer = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_REF), allVariantGenotypes); - denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_REF), allGenotypes); - updateSummaries(0, summary, numer, denom); - - // Summary 1: % het called as het - numer = stats[Genotype.Type.HET.ordinal()][Genotype.Type.HET.ordinal()]; - denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HET), allGenotypes); - updateSummaries(1, summary, numer, denom); - - // Summary 2: % het called as var - numer = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HET), allVariantGenotypes); - denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HET), allGenotypes); - updateSummaries(2, summary, numer, denom); - - // Summary 3: % homVar called as homVar - numer = stats[Genotype.Type.HOM_VAR.ordinal()][Genotype.Type.HOM_VAR.ordinal()]; - denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_VAR), allGenotypes); - updateSummaries(3, summary, numer, denom); - - // Summary 4: % homVars called as var - numer = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_VAR), allVariantGenotypes); - denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_VAR), allGenotypes); - updateSummaries(4, summary, numer, denom); - - // Summary 5: % non-ref called as non-ref - // MAD: this is known as the non-reference sensitivity (# non-ref according to comp found in eval / # non-ref in comp) - numer = sumStatsAllPairs(stats, allVariantGenotypes, allVariantGenotypes); - denom = sumStatsAllPairs(stats, allVariantGenotypes, allGenotypes); - updateSummaries(5, summary, numer, denom); - - // Summary 6: overall genotype concordance of sites called in eval track - // MAD: this is the tradition genotype concordance - numer = sumStatsDiag(stats, allCalledGenotypes); - denom = sumStatsAllPairs(stats, allCalledGenotypes, allCalledGenotypes); - updateSummaries(6, summary, numer, denom); - - // Summary 7: overall genotype concordance of sites called non-ref in eval track - long homrefConcords = stats[Genotype.Type.HOM_REF.ordinal()][Genotype.Type.HOM_REF.ordinal()]; - long diag = sumStatsDiag(stats, allVariantGenotypes); - long allNoHomRef = sumStatsAllPairs(stats, allCalledGenotypes, allCalledGenotypes) - homrefConcords; - numer = allNoHomRef - diag; - denom = allNoHomRef; - updateSummaries(7, summary, numer, denom); - } - - // update the final summary stats - final double[] allSamplesSummary = concordanceSummary.get(ALL_SAMPLES_KEY); - for ( int i = 0; i < allSamplesSummary.length; i++) { - allSamplesSummary[i] = ratio(allSamplesNumerators[i], allSamplesDenominators[i]); - } - - } - - public String getName() { - return "Sample Summary Statistics"; - } -} - -/** - * SampleSummaryStats .. but for allele counts - */ -class ACSummaryStats extends SampleSummaryStats { - private String[] rowKeys; - - public ACSummaryStats (final VariantContext evalvc, final VariantContext compvc) { - concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - rowKeys = new String[3+2*evalvc.getGenotypes().size() + 2*compvc.getGenotypes().size()]; - rowKeys[0] = ALL_SAMPLES_KEY; - for( int i = 0; i <= 2*evalvc.getGenotypes().size() ; i ++ ) { - concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]); - rowKeys[i+1] = String.format("evalAC%d",i); - } - for( int i = 0; i <= 2*compvc.getGenotypes().size() ; i ++ ) { - concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); - rowKeys[2+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); - } - - } - - public String getName() { - return "Allele Count Summary Statistics"; - } - - public Object[] getRowKeys() { - if ( rowKeys == null) { - throw new StingException("rowKeys is null!!"); - } - return rowKeys; - } -} - -class CompACNames implements Comparator{ - - final Logger myLogger; - private boolean info = true; - - public CompACNames(Logger l) { - myLogger = l; - } - - public boolean equals(Object o) { - return ( o.getClass() == CompACNames.class ); - } - - public int compare(Object o1, Object o2) { - if ( info ) { - myLogger.info("Sorting AC names"); - info = false; - } - //System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2)); - return getRank(o1) - getRank(o2); - } - - public int getRank(Object o) { - if ( o.getClass() != String.class ) { - return Integer.MIN_VALUE/4; - } else { - String s = (String) o; - if ( s.startsWith("eval") ) { - return Integer.MIN_VALUE/4 + 1 + parseAC(s); - } else if ( s.startsWith("comp") ) { - return 1+ parseAC(s); - } else { - return Integer.MIN_VALUE/4; - } - } - } - - public int parseAC(String s) { - String[] g = s.split("AC"); - return Integer.parseInt(g[1]); - } -} - diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java deleted file mode 100644 index 5587a0180..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/GenotypePhasingEvaluator.java +++ /dev/null @@ -1,419 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFConstants; -import org.broadinstitute.sting.gatk.contexts.*; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.phasing.*; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.*; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -@Analysis(name = "Genotype Phasing Evaluation", description = "Evaluates the phasing of genotypes in different tracks") -public class GenotypePhasingEvaluator extends VariantEvaluator { - protected final static Logger logger = Logger.getLogger(GenotypePhasingEvaluator.class); - - // a mapping from sample to stats - @DataPoint(name = "samples", description = "the phasing statistics for each sample") - SamplePhasingStatistics samplePhasingStatistics = null; - - SamplePreviousGenotypes samplePrevGenotypes = null; - - public GenotypePhasingEvaluator(VariantEvalWalker parent) { - super(parent); - this.samplePhasingStatistics = new SamplePhasingStatistics(getVEWalker().minPhaseQuality); - this.samplePrevGenotypes = new SamplePreviousGenotypes(); - } - - public String getName() { - return "GenotypePhasingEvaluator"; - } - - public int getComparisonOrder() { - return 2; // we only need to see pairs of (comp, eval) - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName() + ":
"; - } - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { - Reasons interesting = new Reasons(); - if (ref == null) - return interesting.toString(); - GenomeLoc curLocus = ref.getLocus(); - - logger.debug("update2() locus: " + curLocus); - logger.debug("comp = " + comp + " eval = " + eval); - - Set allSamples = new HashSet(); - - Map compSampGenotypes = null; - if (isRelevantToPhasing(comp)) { - allSamples.addAll(comp.getSampleNames()); - compSampGenotypes = comp.getGenotypes(); - } - - Map evalSampGenotypes = null; - if (isRelevantToPhasing(eval)) { - allSamples.addAll(eval.getSampleNames()); - evalSampGenotypes = eval.getGenotypes(); - } - - for (String samp : allSamples) { - logger.debug("sample = " + samp); - - Genotype compSampGt = null; - if (compSampGenotypes != null) - compSampGt = compSampGenotypes.get(samp); - - Genotype evalSampGt = null; - if (evalSampGenotypes != null) - evalSampGt = evalSampGenotypes.get(samp); - - if (compSampGt == null || evalSampGt == null) { // Since either comp or eval (or both) are missing the site, the best we can do is hope to preserve phase [if the non-missing one preserves phase] - // Having an unphased site breaks the phasing for the sample [does NOT permit "transitive phasing"] - hence, must reset phasing knowledge for both comp and eval [put a null CompEvalGenotypes]: - if (isNonNullButUnphased(compSampGt) || isNonNullButUnphased(evalSampGt)) - samplePrevGenotypes.put(samp, null); - } - else { // Both comp and eval have a non-null Genotype at this site: - AllelePair compAllelePair = new AllelePair(compSampGt); - AllelePair evalAllelePair = new AllelePair(evalSampGt); - - boolean breakPhasing = false; - if (compSampGt.isHet() != evalSampGt.isHet() || compSampGt.isHom() != evalSampGt.isHom()) - breakPhasing = true; // since they are not both het or both hom - else { // both are het, or both are hom: - boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compAllelePair, evalAllelePair) && bottomMatchesBottom(compAllelePair, evalAllelePair)); - boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compAllelePair, evalAllelePair) && bottomMatchesTop(compAllelePair, evalAllelePair)); - if (!topMatchesTopAndBottomMatchesBottom && !topMatchesBottomAndBottomMatchesTop) - breakPhasing = true; // since the 2 VCFs have different diploid genotypes for this sample - } - - if (breakPhasing) { - samplePrevGenotypes.put(samp, null); // nothing to do for this site, AND must remove any history for the future - } - else if (compSampGt.isHet() && evalSampGt.isHet()) { - /* comp and eval have the HET same Genotype at this site: - [Note that if both are hom, then nothing is done here, but the het history IS preserved]. - */ - CompEvalGenotypes prevCompAndEval = samplePrevGenotypes.get(samp); - if (prevCompAndEval != null && !prevCompAndEval.getLocus().onSameContig(curLocus)) // exclude curLocus if it is "phased" relative to a different chromosome - prevCompAndEval = null; - - // Replace the previous hets with the current hets: - samplePrevGenotypes.put(samp, curLocus, compSampGt, evalSampGt); - - if (prevCompAndEval != null) { - GenomeLoc prevLocus = prevCompAndEval.getLocus(); - logger.debug("Potentially phaseable het locus: " + curLocus + " [relative to previous het locus: " + prevLocus + "]"); - PhaseStats ps = samplePhasingStatistics.ensureSampleStats(samp); - - boolean compSampIsPhased = genotypesArePhasedAboveThreshold(compSampGt); - boolean evalSampIsPhased = genotypesArePhasedAboveThreshold(evalSampGt); - if (compSampIsPhased || evalSampIsPhased) { - if (!evalSampIsPhased) { - ps.onlyCompPhased++; - interesting.addReason("ONLY_COMP", samp, group, prevLocus, ""); - } - else if (!compSampIsPhased) { - ps.onlyEvalPhased++; - interesting.addReason("ONLY_EVAL", samp, group, prevLocus, ""); - } - else { // both comp and eval are phased: - AllelePair prevCompAllelePair = new AllelePair(prevCompAndEval.getCompGenotpye()); - AllelePair prevEvalAllelePair = new AllelePair(prevCompAndEval.getEvalGenotype()); - - // Sufficient to check only the top of comp, since we ensured that comp and eval have the same diploid genotypes for this sample: - boolean topsMatch = (topMatchesTop(prevCompAllelePair, prevEvalAllelePair) && topMatchesTop(compAllelePair, evalAllelePair)); - boolean topMatchesBottom = (topMatchesBottom(prevCompAllelePair, prevEvalAllelePair) && topMatchesBottom(compAllelePair, evalAllelePair)); - - if (topsMatch || topMatchesBottom) { - ps.phasesAgree++; - - Double compPQ = getPQ(compSampGt); - Double evalPQ = getPQ(evalSampGt); - if (compPQ != null && evalPQ != null && MathUtils.compareDoubles(compPQ, evalPQ) != 0) - interesting.addReason("PQ_CHANGE", samp, group, prevLocus, compPQ + " -> " + evalPQ); - } - else { - ps.phasesDisagree++; - logger.debug("SWITCHED locus: " + curLocus); - interesting.addReason("SWITCH", samp, group, prevLocus, toString(prevCompAllelePair, compAllelePair) + " -> " + toString(prevEvalAllelePair, evalAllelePair)); - } - } - } - else { - ps.neitherPhased++; - } - } - } - } - } - logger.debug("\n" + samplePhasingStatistics + "\n"); - - return interesting.toString(); - } - - public static boolean isRelevantToPhasing(VariantContext vc) { - return (vc != null && !vc.isFiltered()); - } - - public boolean isNonNullButUnphased(Genotype gt) { - return (gt != null && !genotypesArePhasedAboveThreshold(gt)); - } - - public boolean genotypesArePhasedAboveThreshold(Genotype gt) { - if (gt.isHom()) // Can always consider a hom site to be phased to its predecessor, since its successor will only be phased to it if it's hom or "truly" phased - return true; - - if (!gt.isPhased()) - return false; - - Double pq = getPQ(gt); - return (pq == null || pq >= getVEWalker().minPhaseQuality); - } - - public static Double getPQ(Genotype gt) { - return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); - } - - public boolean topMatchesTop(AllelePair b1, AllelePair b2) { - return b1.getTopAllele().equals(b2.getTopAllele()); - } - - public boolean topMatchesBottom(AllelePair b1, AllelePair b2) { - return b1.getTopAllele().equals(b2.getBottomAllele()); - } - - public boolean bottomMatchesTop(AllelePair b1, AllelePair b2) { - return topMatchesBottom(b2, b1); - } - - public boolean bottomMatchesBottom(AllelePair b1, AllelePair b2) { - return b1.getBottomAllele().equals(b2.getBottomAllele()); - } - - public String toString(AllelePair prev, AllelePair cur) { - return prev.getTopAllele().getBaseString() + "+" + cur.getTopAllele().getBaseString() + "|" + prev.getBottomAllele().getBaseString() + "+" + cur.getBottomAllele().getBaseString(); - } - - public void finalizeEvaluation() { - } - - private static class Reasons { - private StringBuilder sb; - - public Reasons() { - sb = new StringBuilder(); - } - - public void addReason(String category, String sample, VariantEvalWalker.EvaluationContext evalGroup, GenomeLoc prevLoc, String reason) { - sb.append(category + "(" + sample + ", previous: " + prevLoc + " [" + evalGroup.compTrackName + ", " + evalGroup.evalTrackName + "]): " + reason + ";"); - } - - public String toString() { - if (sb.length() == 0) - return null; - - return "reasons=" + sb.toString(); - } - } -} - - - -class CompEvalGenotypes { - private GenomeLoc loc; - private Genotype compGt; - private Genotype evalGt; - - public CompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) { - this.loc = loc; - this.compGt = compGt; - this.evalGt = evalGt; - } - - public GenomeLoc getLocus() { - return loc; - } - - public Genotype getCompGenotpye() { - return compGt; - } - public Genotype getEvalGenotype() { - return evalGt; - } - - public void setCompGenotype(Genotype compGt) { - this.compGt = compGt; - } - - public void setEvalGenotype(Genotype evalGt) { - this.evalGt = evalGt; - } -} - -class SamplePreviousGenotypes { - private HashMap sampleGenotypes = null; - - public SamplePreviousGenotypes() { - this.sampleGenotypes = new HashMap(); - } - - public CompEvalGenotypes get(String sample) { - return sampleGenotypes.get(sample); - } - - public void put(String sample, CompEvalGenotypes compEvalGts) { - sampleGenotypes.put(sample, compEvalGts); - } - - public void put(String sample, GenomeLoc locus, Genotype compGt, Genotype evalGt) { - sampleGenotypes.put(sample, new CompEvalGenotypes(locus, compGt, evalGt)); - } -} - -class PhaseStats { - public int neitherPhased; - public int onlyCompPhased; - public int onlyEvalPhased; - public int phasesAgree; - public int phasesDisagree; - - public PhaseStats() { - this.neitherPhased = 0; - this.onlyCompPhased = 0; - this.onlyEvalPhased = 0; - this.phasesAgree = 0; - this.phasesDisagree = 0; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("Neither phased: " + neitherPhased + "\tOnly Comp: " + onlyCompPhased + "\tOnly Eval: " + onlyEvalPhased + "\tSame phase: " + phasesAgree + "\tOpposite phase: " + phasesDisagree); - return sb.toString(); - } - - public static String[] getFieldNamesArray() { - return new String[]{"total", "neither", "only_comp", "only_eval", "both", "match", "switch", "switch_rate"}; - } - - public Object getField(int index) { - switch (index) { - case (0): - return (neitherPhased + onlyCompPhased + onlyEvalPhased + phasesAgree + phasesDisagree); - case (1): - return neitherPhased; - case (2): - return onlyCompPhased; - case (3): - return onlyEvalPhased; - case (4): - return (phasesAgree + phasesDisagree); - case (5): - return phasesAgree; - case (6): - return phasesDisagree; - case (7): - return ((phasesDisagree == 0) ? 0 : ((double) phasesDisagree) / (phasesAgree + phasesDisagree)); - default: - return -1; - } - } -} - -/** - * a table of sample names to genotype phasing statistics - */ -class SamplePhasingStatistics implements TableType { - private HashMap sampleStats = null; - private double minPhaseQuality; - - public SamplePhasingStatistics(double minPhaseQuality) { - this.sampleStats = new HashMap(); - this.minPhaseQuality = minPhaseQuality; - } - - public PhaseStats ensureSampleStats(String samp) { - PhaseStats ps = sampleStats.get(samp); - if (ps == null) { - ps = new PhaseStats(); - sampleStats.put(samp, ps); - } - return ps; - } - - /** - * @return one row per sample - */ - public String[] getRowKeys() { - return sampleStats.keySet().toArray(new String[sampleStats.size()]); - } - - /** - * get the column keys - * - * @return a list of objects, in this case strings, that are the column names - */ - public String[] getColumnKeys() { - return PhaseStats.getFieldNamesArray(); - } - - public Object getCell(int x, int y) { - String[] rowKeys = getRowKeys(); - PhaseStats ps = sampleStats.get(rowKeys[x]); - return ps.getField(y); - } - - public String getName() { - return "Sample Phasing Statistics (for PQ >= " + minPhaseQuality + ")"; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Map.Entry sampPhaseStatsEnt : sampleStats.entrySet()) { - String sample = sampPhaseStatsEnt.getKey(); - PhaseStats ps = sampPhaseStatsEnt.getValue(); - - sb.append(sample + "\t" + ps); - } - return sb.toString(); - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelLengthHistogram.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelLengthHistogram.java deleted file mode 100644 index 9cad48f95..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelLengthHistogram.java +++ /dev/null @@ -1,114 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl - * - * @Author chartl - * @Date May 26, 2010 - */ -@Analysis(name = "Indel length histograms", description = "Shows the distrbution of insertion/deletion event lengths (negative for deletion, positive for insertion)") -public class IndelLengthHistogram extends VariantEvaluator { - private static final int SIZE_LIMIT = 50; - @DataPoint(name="indelLengthHistogram",description="Histogram of indel lengths") - IndelHistogram indelHistogram = new IndelHistogram(SIZE_LIMIT); - - /* - * Indel length histogram table object - */ - - static class IndelHistogram implements TableType { - private Integer[] colKeys; - private int limit; - private String[] rowKeys = {"EventLength"}; - private Integer[] indelHistogram; - - public IndelHistogram(int limit) { - colKeys = initColKeys(limit); - indelHistogram = initHistogram(limit); - this.limit = limit; - } - - public Object[] getColumnKeys() { - return colKeys; - } - - public Object[] getRowKeys() { - return rowKeys; - } - - public Object getCell(int row, int col) { - return indelHistogram[col]; - } - - private Integer[] initColKeys(int size) { - Integer[] cK = new Integer[size*2+1]; - int index = 0; - for ( int i = -size; i <= size; i ++ ) { - cK[index] = i; - index++; - } - - return cK; - } - - private Integer[] initHistogram(int size) { - Integer[] hist = new Integer[size*2+1]; - for ( int i = 0; i < 2*size+1; i ++ ) { - hist[i] = 0; - } - - return hist; - } - - public String getName() { return "indelHistTable"; } - - public void update(int eLength) { - indelHistogram[len2index(eLength)]++; - } - - private int len2index(int len) { - if ( len > limit || len < -limit ) { - throw new ReviewedStingException("Indel length exceeds limit of "+limit+" please increase indel limit size"); - } - return len + limit; - } - } - - public IndelLengthHistogram(VariantEvalWalker parent) { super(parent); } - - public boolean enabled() { return false; } - - public String getName() { return "IndelLengthHistogram"; } - - public int getComparisonOrder() { return 1; } // need only the evals - - public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - //System.out.println("Update1 called"); - if ( ! vc1.isBiallelic() && vc1.isIndel() ) { - veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); - return vc1.toString(); // biallelic sites are output - } - - if ( vc1.isIndel() ) { - //System.out.println("Is indel"); - if ( vc1.isInsertion() ) { - indelHistogram.update(vc1.getAlternateAllele(0).length()); - } else if ( vc1.isDeletion() ) { - indelHistogram.update(-vc1.getReference().length()); - } else { - throw new ReviewedStingException("Indel type that is not insertion or deletion."); - } - } - - return null; - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelMetricsByAC.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelMetricsByAC.java deleted file mode 100755 index 84cefa544..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelMetricsByAC.java +++ /dev/null @@ -1,214 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.ArrayList; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author delangel - * @since Apr 11, 2010 - */ - -@Analysis(name = "Indel Metrics by allele count", description = "Shows various stats binned by allele count") -public class IndelMetricsByAC extends VariantEvaluator { - // a mapping from quality score histogram bin to Ti/Tv ratio - @DataPoint(name="Indel Metrics by AC", description = "Indel Metrics by allele count") - IndelMetricsByAc metrics = null; - - //@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count") - //AlleleCountStats alleleCountStats = null; - private static final int INDEL_SIZE_LIMIT = 100; - private static final int NUM_SCALAR_COLUMNS = 6; - static int len2Index(int ind) { - return ind+INDEL_SIZE_LIMIT+NUM_SCALAR_COLUMNS; - } - - static int index2len(int ind) { - return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS; - } - - protected final static String[] METRIC_COLUMNS; - static { - METRIC_COLUMNS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1]; - METRIC_COLUMNS[0] = "AC"; - METRIC_COLUMNS[1] = "nIns"; - METRIC_COLUMNS[2] = "nDels"; - METRIC_COLUMNS[3] = "n"; - METRIC_COLUMNS[4] = "nComplex"; - METRIC_COLUMNS[5] = "nLong"; - - for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++) - METRIC_COLUMNS[k] = "indel_size_len"+Integer.valueOf(index2len(k)); - } - - class IndelMetricsAtAC { - public int ac = -1, nIns =0, nDel = 0, nComplex = 0, nLong; - public int sizeCount[] = new int[2*INDEL_SIZE_LIMIT+1]; - - public IndelMetricsAtAC(int ac) { this.ac = ac; } - - public void update(VariantContext eval) { - int eventLength = 0; - if ( eval.isInsertion() ) { - eventLength = eval.getAlternateAllele(0).length(); - nIns++; - } else if ( eval.isDeletion() ) { - eventLength = -eval.getReference().length(); - nDel++; - } - else { - nComplex++; - } - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) - sizeCount[len2Index(eventLength)]++; - else - nLong++; - - - - } - - // corresponding to METRIC_COLUMNS - public String getColumn(int i) { - if (i >= NUM_SCALAR_COLUMNS && i <=NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT) - return String.valueOf(sizeCount[i-NUM_SCALAR_COLUMNS]); - - switch (i) { - case 0: return String.valueOf(ac); - case 1: return String.valueOf(nIns); - case 2: return String.valueOf(nDel); - case 3: return String.valueOf(nIns + nDel); - case 4: return String.valueOf(nComplex); - case 5: return String.valueOf(nLong); - - default: - throw new ReviewedStingException("Unexpected column " + i); - } - } - } - - class IndelMetricsByAc implements TableType { - ArrayList metrics = new ArrayList(); - Object[] rows = null; - - public IndelMetricsByAc( int nchromosomes ) { - rows = new Object[nchromosomes+1]; - metrics = new ArrayList(nchromosomes+1); - for ( int i = 0; i < nchromosomes + 1; i++ ) { - metrics.add(new IndelMetricsAtAC(i)); - rows[i] = "ac" + i; - } - } - - public Object[] getRowKeys() { - return rows; - } - - public Object[] getColumnKeys() { - return METRIC_COLUMNS; - } - - public String getName() { - return "IndelMetricsByAc"; - } - - // - public String getCell(int ac, int y) { - return metrics.get(ac).getColumn(y); - } - - public String toString() { - String returnString = ""; - return returnString; - } - - public void incrValue( VariantContext eval ) { - int ac = -1; - - if ( eval.hasGenotypes() ) - ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); - else if ( eval.hasAttribute("AC") ) { - ac = Integer.valueOf(eval.getAttributeAsString("AC")); - } - - if ( ac != -1 ) - metrics.get(ac).update(eval); - } - } - - public IndelMetricsByAC(VariantEvalWalker parent) { - super(parent); - // don't do anything - } - - public String getName() { - return "IndelMetricsByAC"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - final String interesting = null; - - if (eval != null ) { - if ( metrics == null ) { - int nSamples = this.getVEWalker().getNSamplesForEval(eval); - if ( nSamples != -1 ) - metrics = new IndelMetricsByAc(2 * nSamples); - } - - if ( eval.isIndel() && eval.isBiallelic() && - metrics != null ) { - metrics.incrValue(eval); - } - } - - return interesting; // This module doesn't capture any interesting sites, so return null - } - - //public void finalizeEvaluation() { - // - //} -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java deleted file mode 100755 index ba25d6489..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/IndelStatistics.java +++ /dev/null @@ -1,508 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; - -import java.util.Arrays; -import java.util.HashMap; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -@Analysis(name = "IndelStatistics", description = "Shows various indel metrics and statistics") -public class IndelStatistics extends VariantEvaluator { - @DataPoint(name="IndelStatistics", description = "Indel Statistics") - IndelStats indelStats = null; - - @DataPoint(name="IndelClasses", description = "Indel Classification") - IndelClasses indelClasses = null; - - - private static final int INDEL_SIZE_LIMIT = 100; - private static final int NUM_SCALAR_COLUMNS = 10; - - static int len2Index(int ind) { - return ind+INDEL_SIZE_LIMIT+NUM_SCALAR_COLUMNS; - } - - static int index2len(int ind) { - return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS; - } - - static class IndelStats implements TableType { - protected final static String ALL_SAMPLES_KEY = "allSamples"; - protected final static String[] COLUMN_KEYS; - static { - COLUMN_KEYS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1]; - COLUMN_KEYS[0] = "heterozygosity"; - COLUMN_KEYS[1] = "number_of_insertions"; - COLUMN_KEYS[2] = "number_of_deletions"; - COLUMN_KEYS[3] = "number_het_insertions"; - COLUMN_KEYS[4] = "number_homozygous_insertions"; - COLUMN_KEYS[5] = "number_het_deletions"; - COLUMN_KEYS[6] = "number_homozygous_deletions"; - COLUMN_KEYS[7] = "number of homozygous reference sites"; - COLUMN_KEYS[8] = "number of complex events"; - COLUMN_KEYS[9] = "number of long indels"; - - for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++) - COLUMN_KEYS[k] = "indel_size_len"+Integer.valueOf(index2len(k)); - } - - // map of sample to statistics - protected final HashMap indelSummary = new HashMap(); - - public IndelStats(final VariantContext vc) { - indelSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); - for( final String sample : vc.getGenotypes().keySet() ) { - indelSummary.put(sample, new double[COLUMN_KEYS.length]); - } - } - - /** - * - * @return one row per sample - */ - public Object[] getRowKeys() { - return indelSummary.keySet().toArray(new String[indelSummary.size()]); - } - public Object getCell(int x, int y) { - final Object[] rowKeys = getRowKeys(); - return String.format("%4.2f",indelSummary.get(rowKeys[x])[y]); - } - - /** - * get the column keys - * @return a list of objects, in this case strings, that are the column names - */ - public Object[] getColumnKeys() { - return COLUMN_KEYS; - } - - public String getName() { - return "IndelStats"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public String toString() { - return getName(); - } - - /* - * increment the specified value - */ - public void incrValue(VariantContext vc) { - int eventLength = 0; - boolean isInsertion = false, isDeletion = false; - - if ( vc.isInsertion() ) { - eventLength = vc.getAlternateAllele(0).length(); - indelSummary.get(ALL_SAMPLES_KEY)[1]++; - isInsertion = true; - } else if ( vc.isDeletion() ) { - indelSummary.get(ALL_SAMPLES_KEY)[2]++; - eventLength = -vc.getReference().length(); - isDeletion = true; - } - else { - indelSummary.get(ALL_SAMPLES_KEY)[8]++; - } - - // make sure event doesn't overstep array boundaries - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) - indelSummary.get(ALL_SAMPLES_KEY)[len2Index(eventLength)]++; - else - indelSummary.get(ALL_SAMPLES_KEY)[9]++; - - - for( final String sample : vc.getGenotypes().keySet() ) { - if ( indelSummary.containsKey(sample) ) { - Genotype g = vc.getGenotype(sample); - boolean isVariant = (g.isCalled() && !g.isHomRef()); - if (isVariant) { - // update ins/del count - if (isInsertion) { - indelSummary.get(sample)[1]++; - } - else if (isDeletion) - indelSummary.get(sample)[2]++; - else - indelSummary.get(sample)[8]++; - - // update histogram - if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) - indelSummary.get(sample)[len2Index(eventLength)]++; - else - indelSummary.get(sample)[9]++; - - if (g.isHet()) - if (isInsertion) - indelSummary.get(sample)[3]++; - else - indelSummary.get(sample)[5]++; - else - if (isInsertion) - indelSummary.get(sample)[4]++; - else - indelSummary.get(sample)[6]++; - - - - } - else - indelSummary.get(sample)[7]++; - } - } - - - } - } - - static class IndelClasses implements TableType { - protected final static String ALL_SAMPLES_KEY = "allSamples"; - protected final static String[] COLUMN_KEYS; - - - - static { - COLUMN_KEYS= new String[41]; - COLUMN_KEYS[0] = "Novel_A"; - COLUMN_KEYS[1] = "Novel_C"; - COLUMN_KEYS[2] = "Novel_G"; - COLUMN_KEYS[3] = "Novel_T"; - COLUMN_KEYS[4] = "NOVEL_1"; - COLUMN_KEYS[5] = "NOVEL_2"; - COLUMN_KEYS[6] = "NOVEL_3"; - COLUMN_KEYS[7] = "NOVEL_4"; - COLUMN_KEYS[8] = "NOVEL_5"; - COLUMN_KEYS[9] = "NOVEL_6"; - COLUMN_KEYS[10] = "NOVEL_7"; - COLUMN_KEYS[11] = "NOVEL_8"; - COLUMN_KEYS[12] = "NOVEL_9"; - COLUMN_KEYS[13] = "NOVEL_10orMore"; - COLUMN_KEYS[14] = "RepeatExpansion_A"; - COLUMN_KEYS[15] = "RepeatExpansion_C"; - COLUMN_KEYS[16] = "RepeatExpansion_G"; - COLUMN_KEYS[17] = "RepeatExpansion_T"; - COLUMN_KEYS[18] = "RepeatExpansion_AC"; - COLUMN_KEYS[19] = "RepeatExpansion_AG"; - COLUMN_KEYS[20] = "RepeatExpansion_AT"; - COLUMN_KEYS[21] = "RepeatExpansion_CA"; - COLUMN_KEYS[22] = "RepeatExpansion_CG"; - COLUMN_KEYS[23] = "RepeatExpansion_CT"; - COLUMN_KEYS[24] = "RepeatExpansion_GA"; - COLUMN_KEYS[25] = "RepeatExpansion_GC"; - COLUMN_KEYS[26] = "RepeatExpansion_GT"; - COLUMN_KEYS[27] = "RepeatExpansion_TA"; - COLUMN_KEYS[28] = "RepeatExpansion_TC"; - COLUMN_KEYS[29] = "RepeatExpansion_TG"; - COLUMN_KEYS[30] = "RepeatExpansion_1"; - COLUMN_KEYS[31] = "RepeatExpansion_2"; - COLUMN_KEYS[32] = "RepeatExpansion_3"; - COLUMN_KEYS[33] = "RepeatExpansion_4"; - COLUMN_KEYS[34] = "RepeatExpansion_5"; - COLUMN_KEYS[35] = "RepeatExpansion_6"; - COLUMN_KEYS[36] = "RepeatExpansion_7"; - COLUMN_KEYS[37] = "RepeatExpansion_8"; - COLUMN_KEYS[38] = "RepeatExpansion_9"; - COLUMN_KEYS[39] = "RepeatExpansion_10orMore"; - COLUMN_KEYS[40] = "Other"; - - } - - private static final int START_IND_NOVEL = 4; - private static final int STOP_IND_NOVEL = 13; - private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14; - private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29; - private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30; - private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39; - private static final int IND_FOR_OTHER_EVENT = 40; - private static final int START_IND_NOVEL_PER_BASE = 0; - private static final int STOP_IND_NOVEL_PER_BASE = 3; - - - // map of sample to statistics - protected final HashMap indelClassSummary = new HashMap(); - - public IndelClasses(final VariantContext vc) { - indelClassSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]); - for( final String sample : vc.getGenotypes().keySet() ) { - indelClassSummary.put(sample, new int[COLUMN_KEYS.length]); - } - } - - /** - * - * @return one row per sample - */ - public Object[] getRowKeys() { - return indelClassSummary.keySet().toArray(new String[indelClassSummary.size()]); - } - public Object getCell(int x, int y) { - final Object[] rowKeys = getRowKeys(); - return String.format("%d",indelClassSummary.get(rowKeys[x])[y]); - } - - /** - * get the column keys - * @return a list of objects, in this case strings, that are the column names - */ - public Object[] getColumnKeys() { - return COLUMN_KEYS; - } - - public String getName() { - return "IndelClasses"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public String toString() { - return getName(); - } - - private void incrementSampleStat(VariantContext vc, int index) { - indelClassSummary.get(ALL_SAMPLES_KEY)[index]++; - for( final String sample : vc.getGenotypes().keySet() ) { - if ( indelClassSummary.containsKey(sample) ) { - Genotype g = vc.getGenotype(sample); - boolean isVariant = (g.isCalled() && !g.isHomRef()); - if (isVariant) - // update count - indelClassSummary.get(sample)[index]++; - - } - } - - } - /* - * increment the specified value - */ - private String findMinimalEvent(String eventString) { - - // for each length up to given string length, see if event string is a repetition of units of size N - boolean foundSubstring = false; - String minEvent = eventString; - for (int k=1; k < eventString.length(); k++) { - if (eventString.length() % k > 0) - continue; - String str = eventString.substring(0,k); - // now see if event string is a repetition of str - int numReps = eventString.length() / k; - String r = ""; - for (int j=0; j < numReps; j++) - r = r.concat(str); - - if (r.matches(eventString)) { - foundSubstring = true; - minEvent = str; - break; - } - - } - return minEvent; - } - public void incrValue(VariantContext vc, ReferenceContext ref) { - int eventLength = 0; - boolean isInsertion = false, isDeletion = false; - String indelAlleleString; - - if ( vc.isInsertion() ) { - isInsertion = true; - indelAlleleString = vc.getAlternateAllele(0).getDisplayString(); - } else if ( vc.isDeletion() ) { - isDeletion = true; - indelAlleleString = vc.getReference().getDisplayString(); - } - else { - incrementSampleStat(vc, IND_FOR_OTHER_EVENT); - - return; - } - - byte[] refBases = ref.getBases(); - - indelAlleleString = findMinimalEvent(indelAlleleString); - eventLength = indelAlleleString.length(); - - // See first if indel is a repetition of bases before current - int indStart = refBases.length/2-eventLength+1; - - boolean done = false; - int numRepetitions = 0; - while (!done) { - if (indStart < 0) - done = true; - else { - String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); - if (refPiece.matches(indelAlleleString)) - { - numRepetitions++; - indStart = indStart - eventLength; - } - else - done = true; - - } - } - - // now do it forward - done = false; - indStart = refBases.length/2+1; - while (!done) { - if (indStart + eventLength >= refBases.length) - break; - else { - String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); - if (refPiece.matches(indelAlleleString)) - { - numRepetitions++; - indStart = indStart + eventLength; - } - else - done = true; - - } - } - - if (numRepetitions == 0) { - //unrepeated sequence from surroundings - int ind = START_IND_NOVEL + (eventLength-1); - if (ind > STOP_IND_NOVEL) - ind = STOP_IND_NOVEL; - incrementSampleStat(vc, ind); - - if (eventLength == 1) { - // log single base indels additionally by base - String keyStr = "Novel_" + indelAlleleString; - int k; - for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) { - if (keyStr.matches(COLUMN_KEYS[k])) - break; - } - // log now event - incrementSampleStat(vc, k); - } - } - else { - int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1); - if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS) - ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS; - incrementSampleStat(vc, ind); - - if (eventLength<=2) { - // for single or dinucleotide indels, we further log the base in which they occurred - String keyStr = "RepeatExpansion_" + indelAlleleString; - int k; - for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) { - if (keyStr.matches(COLUMN_KEYS[k])) - break; - } - // log now event - incrementSampleStat(vc, k); - } - - } - //g+ - /* - System.out.format("RefBefore: %s\n",new String(refBefore)); - System.out.format("RefAfter: %s\n",new String(refAfter)); - System.out.format("Indel Allele: %s\n", indelAlleleString); - System.out.format("Num Repetitions: %d\n", numRepetitions); - */ - } - - } - - public IndelStatistics(VariantEvalWalker parent) { - super(parent); - // don't do anything - } - - public String getName() { - return "IndelStatistics"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { - return null; - } - - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if (eval != null ) { - if ( indelStats == null ) { - int nSamples = this.getVEWalker().getNSamplesForEval(eval); - if ( nSamples != -1 ) - indelStats = new IndelStats(eval); - } - if ( indelClasses == null ) { - indelClasses = new IndelClasses(eval); - } - - if ( eval.isIndel() && eval.isBiallelic() ) { - if (indelStats != null ) - indelStats.incrValue(eval); - - if (indelClasses != null) - indelClasses.incrValue(eval, ref); - } - } - - return null; // This module doesn't capture any interesting sites, so return null - } - public String update0(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - return null; - } - public void finalizeEvaluation() { - // - int k=0; - } - -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/MendelianViolationEvaluator.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/MendelianViolationEvaluator.java deleted file mode 100755 index 36995a070..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/MendelianViolationEvaluator.java +++ /dev/null @@ -1,182 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.Allele; -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.List; -import java.util.Arrays; -import java.util.regex.Pattern; -import java.util.regex.Matcher; - -/** - * Mendelian violation detection and counting - *

- * a violation looks like: - * Suppose dad = A/B and mom = C/D - * The child can be [A or B] / [C or D]. - * If the child doesn't match this, the site is a violation - *

- * Some examples: - *

- * mom = A/A, dad = C/C - * child can be A/C only - *

- * mom = A/C, dad = C/C - * child can be A/C or C/C - *

- * mom = A/C, dad = A/C - * child can be A/A, A/C, C/C - *

- * The easiest way to do this calculation is to: - *

- * Get alleles for mom => A/B - * Get alleles for dad => C/D - * Make allowed genotypes for child: A/C, A/D, B/C, B/D - * Check that the child is one of these. - */ -@Analysis(name = "Mendelian Violation Evaluator", description = "Mendelian Violation Evaluator") -public class MendelianViolationEvaluator extends VariantEvaluator { - - @DataPoint(name = "variants", description = "Number of mendelian variants found") - long nVariants; - - @DataPoint(name = "violations", description = "Number of mendelian violations found") - long nViolations; - - @DataPoint(name="KHR->PHV",description = "number of child hom ref calls where the parent was hom variant") - long KidHomRef_ParentHomVar; - @DataPoint(name="KHET->PHR",description = "number of child het calls where the parent was hom ref") - long KidHet_ParentsHomRef; - @DataPoint(name="KHET->PHV",description = "number of child het calls where the parent was hom variant") - long KidHet_ParentsHomVar; - @DataPoint(name="KHV->PHR",description = "number of child hom variant calls where the parent was hom ref") - long KidHomVar_ParentHomRef; - - TrioStructure trio; - - private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); - - public static class TrioStructure { - public String mom, dad, child; - } - - public static TrioStructure parseTrioDescription(String family) { - Matcher m = FAMILY_PATTERN.matcher(family); - if (m.matches()) { - TrioStructure trio = new TrioStructure(); - //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE); - trio.mom = m.group(1); - trio.dad = m.group(2); - trio.child = m.group(3); - return trio; - } else { - throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); - } - } - - public MendelianViolationEvaluator(VariantEvalWalker parent) { - super(parent); - - if (enabled()) { - trio = parseTrioDescription(parent.FAMILY_STRUCTURE); - parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s", - parent.FAMILY_STRUCTURE, trio.mom, trio.dad, trio.child)); - } - } - - public boolean enabled() { - return getVEWalker().FAMILY_STRUCTURE != null; - } - - private double getQThreshold() { - return getVEWalker().MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10; // we aren't 10x scaled in the GATK a la phred - } - - public String getName() { - return "mendelian_violations"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc.isBiallelic() && vc.hasGenotypes()) { // todo -- currently limited to biallelic loci - Genotype momG = vc.getGenotype(trio.mom); - Genotype dadG = vc.getGenotype(trio.dad); - Genotype childG = vc.getGenotype(trio.child); - - if (includeGenotype(momG) && includeGenotype(dadG) && includeGenotype(childG)) { - nVariants++; - - if (momG == null || dadG == null || childG == null) - throw new IllegalArgumentException(String.format("VariantContext didn't contain genotypes for expected trio members: mom=%s dad=%s child=%s", trio.mom, trio.dad, trio.child)); - - // all genotypes are good, so let's see if child is a violation - - if (isViolation(vc, momG, dadG, childG)) { - nViolations++; - - String label; - if (childG.isHomRef() && (momG.isHomVar() || dadG.isHomVar())) { - label = "KidHomRef_ParentHomVar"; - KidHomRef_ParentHomVar++; - } else if (childG.isHet() && (momG.isHomRef() && dadG.isHomRef())) { - label = "KidHet_ParentsHomRef"; - KidHet_ParentsHomRef++; - } else if (childG.isHet() && (momG.isHomVar() && dadG.isHomVar())) { - label = "KidHet_ParentsHomVar"; - KidHet_ParentsHomVar++; - } else if (childG.isHomVar() && (momG.isHomRef() || dadG.isHomRef())) { - label = "KidHomVar_ParentHomRef"; - KidHomVar_ParentHomRef++; - } else { - throw new ReviewedStingException("BUG: unexpected child genotype class " + childG); - } - - return "MendelViolation=" + label; - } - } - } - - return null; // we don't capture any intersting sites - } - - private boolean includeGenotype(Genotype g) { - return g.getNegLog10PError() > getQThreshold() && g.isCalled(); - } - - public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG) { - return isViolation(vc, momG.getAlleles(), dadG.getAlleles(), childG.getAlleles()); - } - - public static boolean isViolation(VariantContext vc, TrioStructure trio ) { - return isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child) ); - } - - public static boolean isViolation(VariantContext vc, List momA, List dadA, List childA) { - //VariantContext momVC = vc.subContextFromGenotypes(momG); - //VariantContext dadVC = vc.subContextFromGenotypes(dadG); - int i = 0; - Genotype childG = new Genotype("kidG", childA); - for (Allele momAllele : momA) { - for (Allele dadAllele : dadA) { - if (momAllele.isCalled() && dadAllele.isCalled()) { - Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele)); - if (childG.sameGenotype(possibleChild, false)) { - return false; - } - } - } - } - - return true; - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/PrintMissingComp.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/PrintMissingComp.java deleted file mode 100644 index 7d186c7a0..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/PrintMissingComp.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; - -@Analysis(name = "PrintMissingComp", description = "the overlap between eval and comp sites") -public class PrintMissingComp extends VariantEvaluator { - @DataPoint(name = "evals not at comp", description = "number of eval sites outside of comp sites") - long nMissing = 0; - - public PrintMissingComp(VariantEvalWalker parent) { - super(parent); - } - - public String getName() { - return "PrintMissingComp"; - } - - public int getComparisonOrder() { - return 2; // we need to see each eval track and each comp track - } - - public boolean enabled() { - return true; - } - - - public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - boolean compIsGood = comp != null && comp.isNotFiltered() && comp.isSNP(); - boolean evalIsGood = eval != null && eval.isSNP(); - - if ( compIsGood & ! evalIsGood ) { - nMissing++; - return "MissingFrom" + comp.getSource(); - } else { - return null; - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/SimpleMetricsByAC.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/SimpleMetricsByAC.java deleted file mode 100755 index 6f279635b..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/SimpleMetricsByAC.java +++ /dev/null @@ -1,175 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.ArrayList; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author depristo - * @since Apr 11, 2010 - */ - -@Analysis(name = "Quality Metrics by allele count", description = "Shows various stats binned by allele count") -public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval { - // a mapping from quality score histogram bin to Ti/Tv ratio - @DataPoint(name="TiTv by AC", description = "TiTv by allele count") - MetricsByAc metrics = null; - - //@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count") - //AlleleCountStats alleleCountStats = null; - - private final static Object[] METRIC_COLUMNS = {"AC", "nTi", "nTv", "n", "Ti/Tv"}; - - class MetricsAtAC { - public int ac = -1, nTi = 0, nTv = 0; - - public MetricsAtAC(int ac) { this.ac = ac; } - - public void update(VariantContext eval) { - if ( VariantContextUtils.isTransition(eval) ) - nTi++; - else - nTv++; - } - - // corresponding to METRIC_COLUMNS - public String getColumn(int i) { - switch (i) { - case 0: return String.valueOf(ac); - case 1: return String.valueOf(nTi); - case 2: return String.valueOf(nTv); - case 3: return String.valueOf(nTi + nTv); - case 4: return String.valueOf(ratio(nTi, nTv)); - default: - throw new ReviewedStingException("Unexpected column " + i); - } - } - } - - class MetricsByAc implements TableType { - ArrayList metrics = new ArrayList(); - Object[] rows = null; - - public MetricsByAc( int nchromosomes ) { - rows = new Object[nchromosomes+1]; - metrics = new ArrayList(nchromosomes+1); - for ( int i = 0; i < nchromosomes + 1; i++ ) { - metrics.add(new MetricsAtAC(i)); - rows[i] = "ac" + i; - } - } - - public Object[] getRowKeys() { - return rows; - } - - public Object[] getColumnKeys() { - return METRIC_COLUMNS; - } - - public String getName() { - return "MetricsByAc"; - } - - // - public String getCell(int ac, int y) { - return metrics.get(ac).getColumn(y); - } - - public String toString() { - String returnString = ""; - return returnString; - } - - public void incrValue( VariantContext eval ) { - int ac = -1; - - if ( eval.hasGenotypes() ) - ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); - else if ( eval.hasAttribute("AC") ) { - ac = Integer.valueOf(eval.getAttributeAsString("AC")); - } - - if ( ac != -1 ) - metrics.get(ac).update(eval); - } - } - - public SimpleMetricsByAC(VariantEvalWalker parent) { - super(parent); - // don't do anything - } - - public String getName() { - return "SimpleMetricsByAC"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - final String interesting = null; - - if (eval != null ) { - if ( metrics == null ) { - int nSamples = this.getVEWalker().getNSamplesForEval(eval); - if ( nSamples != -1 ) - metrics = new MetricsByAc(2 * nSamples); - } - - if ( eval.isSNP() && - eval.isBiallelic() && - metrics != null ) { - metrics.incrValue(eval); - } - } - - return interesting; // This module doesn't capture any interesting sites, so return null - } - - //public void finalizeEvaluation() { - // - //} -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/StandardEval.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/StandardEval.java deleted file mode 100755 index 351d65db8..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/StandardEval.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.varianteval; - -public interface StandardEval {} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/ThetaVariantEvaluator.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/ThetaVariantEvaluator.java deleted file mode 100644 index 3f6a776de..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/ThetaVariantEvaluator.java +++ /dev/null @@ -1,136 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.Allele; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.ConcurrentHashMap; - -@Analysis(name = "Theta Variant Evaluator", description = "Computes different estimates of theta based on variant sites and genotypes") -public class ThetaVariantEvaluator extends VariantEvaluator { - - @DataPoint(name = "avg_heterozygosity", description = "Average heterozygosity at variant sites; note that missing genotypes are ignored when computing this value") - double avgHet = 0.0; - @DataPoint(name = "avg_pairwise_diffs", description = "Average pairwise differences at aligned sequences; averaged over both number of sequeneces and number of variant sites; note that missing genotypes are ignored when computing this value") - double avgAvgDiffs = 0.0; - @DataPoint(name = "sum_heterozygosity", description = "Sum of heterozygosity over all variant sites; divide this by total target to get estimate of per base theta") - double totalHet = 0.0; - @DataPoint(name = "sum_pairwise_diffs", description = "Sum of pairwise diffs over all variant sites; divide this by total target to get estimate of per base theta") - double totalAvgDiffs = 0.0; - @DataPoint(name = "theta_region_num_sites", description = "Theta for entire region estimated based on number of segregating sites; divide ths by total target to get estimate of per base theta") - double thetaRegionNumSites = 0.0; - - //helper variables - double numSites = 0; - - public ThetaVariantEvaluator(VariantEvalWalker parent) { - super(parent); - } - - public boolean enabled() { - return true; - } - - public String getName() { - return "theta"; - } - - public int getComparisonOrder() { - return 1; - } - - public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if (vc == null || !vc.isSNP() || !vc.hasGenotypes()) { - return null; //no interesting sites - } - - if (vc.hasGenotypes()) { - - //this maps allele to a count - ConcurrentMap alleleCounts = new ConcurrentHashMap(); - - int numHetsHere = 0; - float numGenosHere = 0; - int numIndsHere = 0; - - for (Genotype genotype : vc.getGenotypes().values()) { - numIndsHere++; - if (!genotype.isNoCall()) { - //increment stats for heterozygosity - if (genotype.isHet()) { - numHetsHere++; - } - - numGenosHere++; - //increment stats for pairwise mismatches - - for (Allele allele : genotype.getAlleles()) { - if (allele.isNonNull() && allele.isCalled()) { - String alleleString = allele.toString(); - alleleCounts.putIfAbsent(alleleString, 0); - alleleCounts.put(alleleString, alleleCounts.get(alleleString) + 1); - } - } - } - } - if (numGenosHere > 0) { - //only if have one called genotype at least - this.numSites++; - - this.totalHet += numHetsHere / numGenosHere; - - //compute based on num sites - float harmonicFactor = 0; - for (int i = 1; i <= numIndsHere; i++) { - harmonicFactor += 1.0 / i; - } - this.thetaRegionNumSites += 1.0 / harmonicFactor; - - //now compute pairwise mismatches - float numPairwise = 0; - float numDiffs = 0; - for (String allele1 : alleleCounts.keySet()) { - int allele1Count = alleleCounts.get(allele1); - - for (String allele2 : alleleCounts.keySet()) { - if (allele1.compareTo(allele2) < 0) { - continue; - } - if (allele1 .compareTo(allele2) == 0) { - numPairwise += allele1Count * (allele1Count - 1) * .5; - - } - else { - int allele2Count = alleleCounts.get(allele2); - numPairwise += allele1Count * allele2Count; - numDiffs += allele1Count * allele2Count; - } - } - } - - if (numPairwise > 0) { - this.totalAvgDiffs += numDiffs / numPairwise; - } - } - } - - return null; - } - - @Override - public void finalizeEvaluation() { - - if (this.numSites > 0) { - - this.avgHet = this.totalHet / this.numSites; - this.avgAvgDiffs = this.totalAvgDiffs / this.numSites; - - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/TiTvVariantEvaluator.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/TiTvVariantEvaluator.java deleted file mode 100755 index 307f253c7..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/TiTvVariantEvaluator.java +++ /dev/null @@ -1,68 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; - -@Analysis(name = "Ti/Tv Variant Evaluator", description = "Ti/Tv Variant Evaluator") -public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEval { - - @DataPoint(name = "ti_count", description = "number of transition loci") - long nTi = 0; - @DataPoint(name = "tv_count", description = "number of transversion loci") - long nTv = 0; - @DataPoint(name = "ti/tv_ratio", description = "the transition to transversion ratio") - double tiTvRatio = 0.0; - @DataPoint(name = "ti_count_comp", description = "number of comp transition sites") - long nTiInComp = 0; - @DataPoint(name = "tv_count_comp", description = "number of comp transversion sites") - long nTvInComp = 0; - @DataPoint(name = "ti/tv_ratio_standard", description = "the transition to transversion ratio for comp sites") - double TiTvRatioStandard = 0.0; - - public TiTvVariantEvaluator(VariantEvalWalker parent) { - super(parent); - } - - public boolean enabled() { - return true; - } - - public String getName() { - return "titv"; - } - - public int getComparisonOrder() { - return 2; // we only need to see each eval track - } - - public void updateTiTv(VariantContext vc, boolean updateStandard) { - if (vc != null && vc.isSNP() && vc.isBiallelic()) { - if (VariantContextUtils.isTransition(vc)) { - if (updateStandard) nTiInComp++; - else nTi++; - } else { - if (updateStandard) nTvInComp++; - else nTv++; - } - } - } - - public String update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc1 != null) updateTiTv(vc1, false); - if (vc2 != null) updateTiTv(vc2, true); - - return null; // we don't capture any intersting sites - } - - @Override - public void finalizeEvaluation() { - // the ti/tv ratio needs to be set (it's not calculated per-variant). - this.tiTvRatio = rate(nTi,nTv); - this.TiTvRatioStandard = rate(nTiInComp, nTvInComp); - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java deleted file mode 100755 index f2c14c99b..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ /dev/null @@ -1,918 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.apache.commons.jexl2.*; -import org.apache.commons.jexl2.introspection.*; -import org.apache.commons.logging.LogFactory; -import org.apache.log4j.Logger; -import org.broad.tribble.util.variantcontext.MutableVariantContext; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFConstants; -import org.broad.tribble.vcf.VCFWriter; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFHeaderLine; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.variantrecalibration.Tranche; -import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.report.ReportMarshaller; -import org.broadinstitute.sting.utils.report.VE2ReportFactory; -import org.broadinstitute.sting.utils.report.templates.ReportFormat; -import org.broadinstitute.sting.utils.report.utils.Node; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.OutputStreamWriter; -import java.io.PrintStream; -import java.lang.reflect.Constructor; -import java.lang.reflect.Field; -import java.util.*; - -// todo -- evalations should support comment lines -// todo -- add Mendelian variable explanations (nDeNovo and nMissingTransmissions) - -// todo -- site frequency spectrum eval (freq. of variants in eval as a function of their AC and AN numbers) -// todo -- clustered SNP counter -// todo -- HWEs -// todo -- indel metrics [count of sizes in/del should be in CountVariants] - -// todo -- port over SNP density walker: -// todo -- see walker for WG calc but will need to make it work with intervals correctly - -// Todo -- should really include argument parsing @annotations from subclass in this walker. Very -// todo -- useful general capability. Right now you need to add arguments to VariantEval2 to handle new -// todo -- evaluation arguments (which is better than passing a string!) - -// todo -- these really should be implemented as default select expression -// todo Extend VariantEval, our general-purpose tool for SNP evaluation, to differentiate Ti/Tv at CpG islands and also -// todo classify (and count) variants into coding, non-coding, synonomous/non-symonomous, 2/4 fold degenerate sites, etc. -// todo Assume that the incoming VCF has the annotations (you don't need to do this) but VE2 should split up results by -// todo these catogies automatically (using the default selects) - -// todo -- this is really more a documentation issue. Really would be nice to have a pre-defined argument packet that -// todo -- can be provided to the system -// todo -- We agreed to report two standard values for variant evaluation from here out. One, we will continue to report -// todo -- the dbSNP 129 rate. Additionally, we will start to report the % of variants found that have already been seen in -// todo -- 1000 Genomes. This should be implemented as another standard comp_1kg binding, pointing to only variants -// todo -- discovered and released by 1KG. Might need to make this data set ourselves and keep it in GATK/data like -// todo -- dbsnp rod -// -// todo -- implement as select statment, but it's hard for multi-sample calls. -// todo -- Provide separate dbsnp rates for het only calls and any call where there is at least one hom-var genotype, -// todo -- since hets are much more likely to be errors -// -// todo -- Add Heng's hom run metrics -- single sample haplotype block lengths - - -/** - * General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ts/Tv ratios, and a lot more) - */ -@Reference(window=@Window(start=-50,stop=50)) -public class VariantEvalWalker extends RodWalker implements TreeReducible { - @Output - protected PrintStream out; - - // -------------------------------------------------------------------------------------------------------------- - // - // walker arguments - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(shortName="select", doc="One or more stratifications to use when evaluating the data", required=false) - protected ArrayList SELECT_EXPS = new ArrayList(); - - @Argument(shortName="selectName", doc="Names to use for the list of stratifications (must be a 1-to-1 mapping)", required=false) - protected ArrayList SELECT_NAMES = new ArrayList(); - - @Hidden - @Argument(shortName="summary", doc="One or more JEXL staments to log after evaluating the data", required=false) - protected ArrayList SUMMARY_EXPS = new ArrayList(); - - @Hidden - @Argument(shortName="validate", doc="One or more JEXL validations to use after evaluating the data", required=false) - protected ArrayList VALIDATE_EXPS = new ArrayList(); - - @Argument(shortName="known", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false) - protected String[] KNOWN_NAMES = {DbSNPHelper.STANDARD_DBSNP_TRACK_NAME}; - - @Argument(shortName="sample", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false) - protected String[] SAMPLES = {}; - private List SAMPLES_LIST = null; - - // - // Arguments for choosing which modules to run - // - @Argument(fullName="evalModule", shortName="E", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noStandard is specified)", required=false) - protected String[] modulesToUse = {}; - - @Argument(fullName="doNotUseAllStandardModules", shortName="noStandard", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)") - protected Boolean NO_STANDARD = false; - - @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit") - protected Boolean LIST = false; - - // - // Arguments for Mendelian Violation calculations - // - @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) - protected String FAMILY_STRUCTURE; - - @Argument(shortName="MVQ", fullName="MendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) - protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; - - @Output(shortName="outputVCF", fullName="InterestingSitesVCF", doc="If provided, interesting sites emitted to this vcf and the INFO field annotated as to why they are interesting", required=false) - protected VCFWriter writer = null; - - @Argument(shortName="gcLog", fullName="GenotypeCocordanceLog", doc="If provided, sites with genotype concordance problems (e.g., FP and FNs) will be emitted ot this file", required=false) - protected PrintStream gcLog = null; - - private static double NO_MIN_QUAL_SCORE = -1.0; - @Argument(shortName = "Q", fullName="minPhredConfidenceScore", doc="Minimum confidence score to consider an evaluation SNP a variant", required=false) - public double minQualScore = NO_MIN_QUAL_SCORE; - @Argument(shortName = "Qcomp", fullName="minPhredConfidenceScoreForComp", doc="Minimum confidence score to consider a comp SNP a variant", required=false) - public double minCompQualScore = NO_MIN_QUAL_SCORE; - @Argument(shortName = "dels", fullName="indelCalls", doc="evaluate indels rather than SNPs", required = false) - public boolean dels = false; - - // Right now we will only be looking at SNPS - // todo -- enable INDEL variant contexts, there's no reason not to but the integration tests need to be updated - EnumSet ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.SNP, VariantContext.Type.NO_VARIATION); - - @Argument(shortName="rsID", fullName="rsID", doc="If provided, list of rsID and build number for capping known snps by their build date", required=false) - protected String rsIDFile = null; - - @Argument(shortName="maxRsIDBuild", fullName="maxRsIDBuild", doc="If provided, only variants with rsIDs <= maxRsIDBuild will be included in the set of known snps", required=false) - protected int maxRsIDBuild = Integer.MAX_VALUE; - - @Argument(shortName="reportType", fullName="reportType", doc="If provided, set the template type", required=false) - protected VE2ReportFactory.VE2TemplateType reportType = VE2ReportFactory.defaultReportFormat; - - @Output(shortName="reportLocation", fullName="reportLocation", doc="If provided, set the base file for reports (Required for output formats with more than one file per analysis)", required=false) - protected File outputLocation = null; - - @Argument(shortName="nSamples", fullName="nSamples", doc="If provided, analyses that need the number of samples in an eval track that has no genotype information will receive this number as the number of samples", required=false) - protected int nSamples = -1; - - Set rsIDsToExclude = null; - - @Argument(shortName="aatk", fullName="aminoAcidTransitionKey", doc="required for the amino acid transition table; this is the key in the info field for the VCF for the transition", required = false) - public String aminoAcidTransitionKey = null; - - @Argument(shortName="aats", fullName="aminoAcidTransitionSplit", doc="required for the amino acid transition table, this is the key on which to split the info field value to get the reference and alternate amino acids", required=false) - public String aminoAcidTransitionSplit = null; - - @Argument(shortName="aatUseCodons", fullName="aminoAcidsRepresentedByCodons", doc="for the amino acid table, specifiy that the transitions are represented as codon changes, and not directly amino acid names", required = false) - public boolean aatUseCodons = false; - - @Argument(shortName="disI", fullName="discordantInteresting", doc="If passed, write discordant sites as interesting", required=false) - protected boolean DISCORDANT_INTERESTING = false; - - @Argument(fullName="tranchesFile", shortName="tf", doc="The input tranches file describing where to cut the data", required=false) - private String TRANCHE_FILENAME = null; - - // For GenotypePhasingEvaluator: - @Argument(fullName = "minPhaseQuality", shortName = "minPQ", doc = "The minimum phasing quality (PQ) score required to consider phasing; [default:0]", required = false) - protected Double minPhaseQuality = 0.0; // accept any positive value of PQ - - @Argument(shortName="min", fullName="minimalComparisons", doc="If passed, filters and raw site values won't be computed", required=false) - protected boolean MINIMAL = false; - - - // -------------------------------------------------------------------------------------------------------------- - // - // private walker data - // - // -------------------------------------------------------------------------------------------------------------- - - /** private class holding all of the information about a single evaluation group (e.g., for eval ROD) */ - public class EvaluationContext implements Comparable { - // useful for typing - public String evalTrackName, compTrackName, novelty, filtered; - public boolean enableInterestingSiteCaptures = false; - VariantContextUtils.JexlVCMatchExp selectExp; - Set evaluations; - - public boolean isIgnoringFilters() { return filtered.equals(RAW_SET_NAME); } - public boolean requiresFiltered() { return filtered.equals(FILTERED_SET_NAME); } - public boolean requiresNotFiltered() { return filtered.equals(RETAINED_SET_NAME); } - public boolean isIgnoringNovelty() { return novelty.equals(ALL_SET_NAME); } - public boolean requiresNovel() { return novelty.equals(NOVEL_SET_NAME); } - public boolean requiresKnown() { return novelty.equals(KNOWN_SET_NAME); } - - public boolean isSelected() { return selectExp == null; } - - public String getDisplayName() { - return getName(CONTEXT_SEPARATOR); - } - - public String getJexlName() { - return getName("."); - } - - private String getName(String separator) { - return Utils.join(separator, Arrays.asList(evalTrackName, compTrackName, selectExp == null ? "all" : selectExp.name, filtered, novelty)); - } - - public String toString() { return getDisplayName(); } - - public int compareTo(EvaluationContext other) { - return this.getDisplayName().compareTo(other.getDisplayName()); - } - - public EvaluationContext( String evalName, String compName, String novelty, String filtered, VariantContextUtils.JexlVCMatchExp selectExp ) { - this.evalTrackName = evalName; - this.compTrackName = compName; - this.novelty = novelty; - this.filtered = filtered; - this.selectExp = selectExp; - this.enableInterestingSiteCaptures = selectExp == null; - this.evaluations = instantiateEvalationsSet(); - } - } - - private List contexts = null; - - // lists of all comp and eval ROD track names - private Set compNames = new HashSet(); - private Set evalNames = new HashSet(); - - private List variantEvaluationNames = new ArrayList(); - - private static String RAW_SET_NAME = "raw"; - private static String RETAINED_SET_NAME = "called"; - private static String FILTERED_SET_NAME = "filtered"; - private static String ALL_SET_NAME = "all"; - private static String KNOWN_SET_NAME = "known"; - private static String NOVEL_SET_NAME = "novel"; - - private static String NO_COMP_NAME = "N/A"; - - private final static String CONTEXT_SEPARATOR = "XXX"; - //private final static String CONTEXT_SEPARATOR = "\\."; - private final static String CONTEXT_HEADER = Utils.join(CONTEXT_SEPARATOR, Arrays.asList("eval", "comp", "select", "filter", "novelty")); - private final static int N_CONTEXT_NAME_PARTS = CONTEXT_HEADER.split(CONTEXT_SEPARATOR).length; - private static int[] nameSizes = new int[N_CONTEXT_NAME_PARTS]; - static { - int i = 0; - for ( String elt : CONTEXT_HEADER.split(CONTEXT_SEPARATOR) ) - nameSizes[i++] = elt.length(); - } - - // Dynamically determined variantEvaluation classes - private Set> evaluationClasses = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // initialize - // - // -------------------------------------------------------------------------------------------------------------- - - public boolean printInterestingSites() { return writer != null; } - - public void initialize() { - if ( dels ) { - ALLOW_VARIANT_CONTEXT_TYPES = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.NO_VARIATION); - } - ReportFormat.AcceptableOutputType type = (outputLocation == null) ? ReportFormat.AcceptableOutputType.STREAM : ReportFormat.AcceptableOutputType.FILE; - if (!VE2ReportFactory.isCompatibleWithOutputType(type,reportType)) - throw new UserException.CommandLineException("The report format requested is not compatible with your output location. You specified a " + type + " output type which isn't an option for " + reportType); - if ( LIST ) - listModulesAndExit(); - - SAMPLES_LIST = SampleUtils.getSamplesFromCommandLineInput(Arrays.asList(SAMPLES)); - - determineEvalations(); - - if ( TRANCHE_FILENAME != null ) { - // we are going to build a few select names automatically from the tranches file - for ( Tranche t : Tranche.readTraches(new File(TRANCHE_FILENAME)) ) { - logger.info("Adding select for all variant above the pCut of : " + t); - SELECT_EXPS.add(String.format(VariantRecalibrator.VQS_LOD_KEY + " >= %.2f", t.minVQSLod)); - SELECT_NAMES.add(String.format("FDR-%.2f", t.fdr)); - } - } - - if ( SELECT_NAMES.size() > 0 ) { - logger.info("Selects: " + SELECT_NAMES); - logger.info("Selects: " + SELECT_EXPS); - } - List selectExps = VariantContextUtils.initializeMatchExps(SELECT_NAMES, SELECT_EXPS); - - for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) { - if ( d.getName().startsWith("eval") ) { - evalNames.add(d.getName()); - } else if ( d.getName().startsWith("comp") ) { - compNames.add(d.getName()); - } else if ( d.getName().startsWith(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) || d.getName().startsWith("hapmap") ) { - compNames.add(d.getName()); - } else { - logger.info("Not evaluating ROD binding " + d.getName()); - } - } - - // if no comp rod was provided, we still want to be able to do evaluations, so use a default comp name - if ( compNames.size() == 0 ) - compNames.add(NO_COMP_NAME); - - contexts = initializeEvaluationContexts(evalNames, compNames, selectExps); - determineContextNamePartSizes(); - - if ( rsIDFile != null ) { - if ( maxRsIDBuild == Integer.MAX_VALUE ) - throw new IllegalArgumentException("rsIDFile " + rsIDFile + " was given but associated max RSID build parameter wasn't available"); - rsIDsToExclude = getrsIDsToExclude(new File(rsIDFile), maxRsIDBuild); - } - - if ( writer != null ) { - Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), evalNames); - final VCFHeader vcfHeader = new VCFHeader(new HashSet(), samples); - writer.writeHeader(vcfHeader); - } - } - - private void listModulesAndExit() { - List> veClasses = new PluginManager( VariantEvaluator.class ).getPlugins(); - out.println("\nAvailable eval modules:"); - out.println("(Standard modules are starred)"); - for (Class veClass : veClasses) - out.println("\t" + veClass.getSimpleName() + (StandardEval.class.isAssignableFrom(veClass) ? "*" : "")); - out.println(); - System.exit(0); - } - - private static Set getrsIDsToExclude(File rsIDFile, int maxRsIDBuild) { - List toExclude = new LinkedList(); - - int n = 1; - try { - for ( String line : new XReadLines(rsIDFile) ) { - String parts[] = line.split(" "); - if ( parts.length != 2 ) - throw new UserException.MalformedFile(rsIDFile, "Invalid rsID / build pair at " + n + " line = " + line ); - //System.out.printf("line %s %s %s%n", line, parts[0], parts[1]); - if ( Integer.valueOf(parts[1]) > maxRsIDBuild ) { - //System.out.printf("Excluding %s%n", line); - toExclude.add("rs"+parts[0]); - } - n++; - - if ( n % 1000000 == 0 ) - logger.info(String.format("Read %d rsIDs from rsID -> build file", n)); - } - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(rsIDFile, e); - } - - logger.info(String.format("Excluding %d of %d (%.2f%%) rsIDs found from builds > %d", - toExclude.size(), n, ((100.0 * toExclude.size())/n), maxRsIDBuild)); - - return new HashSet(toExclude); - } - - private boolean excludeComp(VariantContext vc) { - String id = vc != null && vc.hasID() ? vc.getID() : null; - boolean ex = rsIDsToExclude != null && id != null && rsIDsToExclude.contains(id); - //System.out.printf("Testing id %s ex=%b against %s%n", id, ex, vc); - return ex; - } - - private void determineEvalations() { - // create a map for all eval modules for easy lookup - HashMap> classMap = new HashMap>(); - for ( Class c : new PluginManager( VariantEvaluator.class ).getPlugins() ) - classMap.put(c.getSimpleName(), c); - - evaluationClasses = new HashSet>(); - - // by default, use standard eval modules - if ( !NO_STANDARD ) { - for ( Class myClass : new PluginManager( StandardEval.class ).getPlugins() ) { - if ( classMap.containsKey(myClass.getSimpleName()) ) - evaluationClasses.add(classMap.get(myClass.getSimpleName())); - } - } - - // get the specific classes provided - for ( String module : modulesToUse ) { - if ( !classMap.containsKey(module) ) - throw new UserException.CommandLineException("Module " + module + " could not be found; please check that you have specified the class name correctly"); - evaluationClasses.add(classMap.get(module)); - } - - for ( VariantEvaluator e : instantiateEvalationsSet() ) { - // for collecting purposes - variantEvaluationNames.add(e.getName()); - logger.debug("Including VariantEvaluator " + e.getName() + " of class " + e.getClass()); - } - - Collections.sort(variantEvaluationNames); - } - - private List append(List selectExps, T elt) { - List l = new ArrayList(selectExps); - l.add(elt); - return l; - } - - private List initializeEvaluationContexts(Set evalNames, Set compNames, List selectExps) { - List contexts = new ArrayList(); - - // todo -- add another for loop for each sample (be smart about the selection here - - // honor specifications of just one or a few samples), and put an "all" in here so - // that we don't lose multi-sample evaluations - - List filterTypes = MINIMAL ? Arrays.asList(RETAINED_SET_NAME) : Arrays.asList(RAW_SET_NAME, RETAINED_SET_NAME, FILTERED_SET_NAME); - - - selectExps = append(selectExps, null); - for ( String evalName : evalNames ) { - for ( String compName : compNames ) { - for ( VariantContextUtils.JexlVCMatchExp e : selectExps ) { - for ( String filteredName : filterTypes ) { - for ( String novelty : Arrays.asList(ALL_SET_NAME, KNOWN_SET_NAME, NOVEL_SET_NAME) ) { - EvaluationContext context = new EvaluationContext(evalName, compName, novelty, filteredName, e); - contexts.add(context); - } - } - } - } - } - - Collections.sort(contexts); - return contexts; - } - - private Set instantiateEvalationsSet() { - Set evals = new HashSet(); - Object[] args = new Object[]{this}; - Class[] argTypes = new Class[]{VariantEvalWalker.class}; - - for ( Class c : evaluationClasses ) { - try { - Constructor constructor = c.getConstructor(argTypes); - VariantEvaluator eval = constructor.newInstance(args); - evals.add(eval); - } catch (Exception e) { - throw new DynamicClassResolutionException(c, e); - } - } - - return evals; - } - - - - private boolean captureInterestingSitesOfEvalSet(EvaluationContext group) { - //System.out.printf("checking %s%n", name); - return group.requiresNotFiltered() && group.isIgnoringNovelty(); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // map - // - // -------------------------------------------------------------------------------------------------------------- - - // todo -- call a single function to build a map from track name -> variant context / null for all - // -- eval + comp names. Use this data structure to get data throughout rest of the loops here - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - //System.out.printf("map at %s with %d skipped%n", context.getLocation(), context.getSkippedBases()); - - Map vcs = getVariantContexts(ref, tracker, context); - //System.out.println("vcs has size "+vcs.size()); - //Collection comps = getCompVariantContexts(tracker, context); - - // to enable walking over pairs where eval or comps have no elements - for ( EvaluationContext group : contexts ) { - VariantContext vc = vcs.get(group.evalTrackName); - - //logger.debug(String.format("Updating %s with variant", vc)); - Set evaluations = group.evaluations; - boolean evalWantsVC = applyVCtoEvaluation(vc, vcs, group); - VariantContext interestingVC = vc; - List interestingReasons = new ArrayList(); - - for ( VariantEvaluator evaluation : evaluations ) { - synchronized ( evaluation ) { - if ( evaluation.enabled() ) { - // we always call update0 in case the evaluation tracks things like number of bases covered - evaluation.update0(tracker, ref, context); - - // the other updateN methods don't see a null context - if ( tracker == null ) - continue; - - // now call the single or paired update function - switch ( evaluation.getComparisonOrder() ) { - case 1: - if ( evalWantsVC && vc != null ) { - String interesting = evaluation.update1(vc, tracker, ref, context, group); - if ( interesting != null ) interestingReasons.add(interesting); - } - break; - case 2: - VariantContext comp = vcs.get(group.compTrackName); - if ( comp != null && - minCompQualScore != NO_MIN_QUAL_SCORE && - comp.hasNegLog10PError() && - comp.getNegLog10PError() < (minCompQualScore / 10.0) ) - comp = null; - - String interesting = evaluation.update2( evalWantsVC ? vc : null, comp, tracker, ref, context, group ); - - /** TODO - -- for Eric: Fix me (current implementation causes GenotypeConcordance - to treat sites that don't match JEXL as no-calls) - - String interesting = null; - if (evalWantsVC) - { - interesting = evaluation.update2( evalWantsVC ? vc : null, comp, tracker, ref, context, group ); - } - **/ - - - if ( interesting != null ) { - interestingVC = interestingVC == null ? ( vc == null ? comp : vc ) : interestingVC; - interestingReasons.add(interesting); - } - break; - default: - throw new ReviewedStingException("BUG: Unexpected evaluation order " + evaluation); - } - } - } - } - - if ( tracker != null && group.enableInterestingSiteCaptures && captureInterestingSitesOfEvalSet(group) ) - writeInterestingSite(interestingReasons, interestingVC, ref.getBase()); - } - - return 0; - } - - private void writeInterestingSite(List interestingReasons, VariantContext vc, byte ref) { - if ( vc != null && writer != null && interestingReasons.size() > 0 ) { - // todo -- the vc == null check is because you can be interesting because you are a FN, and so VC == null - MutableVariantContext mvc = new MutableVariantContext(vc); - - for ( String why : interestingReasons ) { - String key, value; - String[] parts = why.split("="); - - switch ( parts.length ) { - case 1: - key = parts[0]; - value = "1"; - break; - case 2: - key = parts[0]; - value = parts[1]; - break; - default: - throw new IllegalStateException("BUG: saw a interesting site reason sting with multiple = signs " + why); - } - - mvc.putAttribute(key, value); - } - - - writer.add(mvc, ref); - //interestingReasons.clear(); - } - } - - private static Set seenJEXLExceptions = new HashSet(); - private boolean applyVCtoEvaluation(VariantContext vc, Map vcs, EvaluationContext group) { - if ( vc == null ) - return true; - - if ( minQualScore != NO_MIN_QUAL_SCORE && - vc.hasNegLog10PError() && - vc.getNegLog10PError() < (minQualScore / 10.0) ) { - //System.out.printf("exclude %s%n", vc); - return false; - } - - if ( group.requiresFiltered() && vc.isNotFiltered() ) - return false; - - if ( group.requiresNotFiltered() && vc.isFiltered() ) - return false; - - boolean vcKnown = vcIsKnown(vc, vcs, KNOWN_NAMES); - if ( group.requiresKnown() && ! vcKnown ) - return false; - else if ( group.requiresNovel() && vcKnown ) - return false; - - if ( group.selectExp != null ) { - try { - if ( ! VariantContextUtils.match(vc, group.selectExp) ) - return false; - } catch ( RuntimeException e ) { - if ( ! seenJEXLExceptions.contains(group.selectExp.name) ) { - seenJEXLExceptions.add(group.selectExp.name); - logger.warn("JEXL evaluation error for SELECT " + group.selectExp.name + ": " + e.getMessage() + - "; this may be an error or may simply result from some variants not having INFO fields keys " + - "referenced in the JEXL expressions. Variants generating exceptions will *NOT* be matched " + - "by the expression. Occurred with variant " + vc); - } - return false; - } - } - - // nothing invalidated our membership in this set - return true; - } - - private boolean vcIsKnown(VariantContext vc, Map vcs, String[] knownNames ) { - for ( String knownName : knownNames ) { - VariantContext known = vcs.get(knownName); - if ( known != null && known.isNotFiltered() && known.getType() == vc.getType() ) { - return true; - } - } - - return false; - } - -// can't handle this situation -// todo -- warning, this leads to some missing SNPs at complex loci, such as: -// todo -- 591 1 841619 841620 rs4970464 0 - A A -/C/T genomic mixed unknown 0 0 near-gene-3 exact 1 -// todo -- 591 1 841619 841620 rs62677860 0 + A A C/T genomic single unknown 0 0 near-gene-3 exact 1 -// -//logger.info(String.format("Ignore second+ events at locus %s in rod %s => rec is %s", context.getLocation(), rodList.getName(), rec)); - - private Map getVariantContexts(ReferenceContext ref, RefMetaDataTracker tracker, AlignmentContext context) { - // todo -- we need to deal with dbSNP where there can be multiple records at the same start site. A potential solution is to - // todo -- allow the variant evaluation to specify the type of variants it wants to see and only take the first such record at a site - Map bindings = new HashMap(); - if ( tracker != null ) { - //System.out.println("Tracker is not null"); - bindVariantContexts(ref, bindings, evalNames, tracker, context, false); - bindVariantContexts(ref, bindings, compNames, tracker, context, true); - } - return bindings; - } - - private void bindVariantContexts(ReferenceContext ref, Map map, Collection names, - RefMetaDataTracker tracker, AlignmentContext context, boolean allowExcludes ) { - for ( String name : names ) { - Collection contexts = tracker.getVariantContexts(ref, name, ALLOW_VARIANT_CONTEXT_TYPES, context.getLocation(), true, true); - if ( contexts.size() > 1 ) - throw new UserException.CommandLineException("Found multiple variant contexts found in " + name + " at " + context.getLocation() + "; VariantEval assumes one variant per position"); - - VariantContext vc = contexts.size() == 1 ? contexts.iterator().next() : null; - - if ( SAMPLES_LIST.size() > 0 && vc != null ) { - boolean hasGenotypes = vc.hasGenotypes(SAMPLES_LIST); - if ( hasGenotypes ) { - //if ( ! name.equals("eval") ) logger.info(String.format("subsetting VC %s", vc)); - vc = vc.subContextFromGenotypes(vc.getGenotypes(SAMPLES_LIST).values()); - HashMap newAts = new HashMap(vc.getAttributes()); - VariantContextUtils.calculateChromosomeCounts(vc,newAts,true); - vc = VariantContext.modifyAttributes(vc,newAts); - logger.debug(String.format("VC %s subset to %s AC%n",vc.getSource(),vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY))); - //if ( ! name.equals("eval") ) logger.info(String.format(" => VC %s", vc)); - } else if ( !hasGenotypes && !name.equals("dbsnp") ) { - throw new UserException(String.format("Genotypes for the variant context %s do not contain all the provided samples %s",vc.getSource(), getMissingSamples(SAMPLES_LIST,vc))); - } - } - - map.put(name, allowExcludes && excludeComp(vc) ? null : vc); - } - } - - private static String getMissingSamples(Collection soughtSamples, VariantContext vc) { - StringBuffer buf = new StringBuffer(); - buf.append("Missing samples are:"); - for ( String s : soughtSamples ) { - if ( ! vc.getGenotypes().keySet().contains(s) ) { - buf.append(String.format("%n%s",s)); - } - } - - return buf.toString(); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // reduce - // - // -------------------------------------------------------------------------------------------------------------- - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer point, Integer sum) { - return point + sum; - } - - public Integer treeReduce(Integer point, Integer sum) { - return point + sum; - } - - public VariantEvaluator getEvalByName(String name, Set s) { - for ( VariantEvaluator e : s ) - if ( e.getName().equals(name) ) - return e; - return null; - } - - private void determineContextNamePartSizes() { - for ( EvaluationContext group : contexts ) { - String[] parts = group.getDisplayName().split(CONTEXT_SEPARATOR); - if ( parts.length != N_CONTEXT_NAME_PARTS ) { - throw new ReviewedStingException("Unexpected number of eval name parts " + group.getDisplayName() + " length = " + parts.length + ", expected " + N_CONTEXT_NAME_PARTS); - } else { - for ( int i = 0; i < parts.length; i++ ) - nameSizes[i] = Math.max(nameSizes[i], parts[i].length()); - } - } - } - - public void onTraversalDone(Integer result) { - writeReport(); - validateContext(); - } - - /** - * Writes the report out to disk. - */ - private void writeReport() { - // our report mashaller - ReportMarshaller marshaller; - - // create the report marshaller early, so that we can fail-fast if something is wrong with the output sources - if (outputLocation == null) - marshaller = VE2ReportFactory.createMarhsaller(new OutputStreamWriter(out), reportType, createExtraOutputTags()); - else - marshaller = VE2ReportFactory.createMarhsaller(outputLocation, reportType, createExtraOutputTags()); - for ( String evalName : variantEvaluationNames ) { - for ( EvaluationContext group : contexts ) { - VariantEvaluator eval = getEvalByName(evalName, group.evaluations); - // finalize the evaluation - eval.finalizeEvaluation(); - - if ( eval.enabled() ) - marshaller.write(createPrependNodeList(group),eval); - } - } - marshaller.close(); - } - - /** - * Validates the JEXL expressions and throws an exception if they do not all return true. - */ - private void validateContext() { - if (SUMMARY_EXPS.size() + VALIDATE_EXPS.size() == 0) - return; - - JexlContext jc = new MapContext(); - for (EvaluationContext context : contexts) - for (VariantEvaluator eval: context.evaluations) - jc.set(context.getJexlName() + "." + eval.getName(), eval); - - Uberspect uberspect = new UberspectImpl(LogFactory.getLog(JexlEngine.class)) { - /** Gets the field, even if the field was non-public. */ - @Override - public Field getField(Object obj, String name, JexlInfo info) { - Field result = super.getField(obj, name, info); - if (result == null && obj != null) { - Class clazz = obj instanceof Class ? (Class)obj : obj.getClass(); - try { - // TODO: Default UberspectImpl uses an internal field cache by class type - result = clazz.getDeclaredField(name); - result.setAccessible(true); - } catch (NoSuchFieldException nsfe) { - /* ignore */ - } - } - return result; - } - }; - - JexlEngine jexl = new JexlEngine(uberspect, null, null, null); - - for (String expression: SUMMARY_EXPS) { - Object jexlResult = jexl.createExpression(expression).evaluate(jc); - logger.info("Summary: " + expression + " = " + jexlResult); - } - - List failedExpressions = new ArrayList(); - for (String expression: VALIDATE_EXPS) { - // ex: evalYRI.compYRI.all.called.novel.titv.tiTvRatio > 1.0 - Object jexlResult = jexl.createExpression(expression).evaluate(jc); - boolean pass = Boolean.TRUE.equals(jexlResult); - if (!pass) { - logger.error("FAIL: " + expression); - failedExpressions.add(expression); - } else if (logger.isDebugEnabled()) { - logger.debug("PASS: " + expression); - } - } - - int failed = failedExpressions.size(); - int total = VALIDATE_EXPS.size(); - - logger.info(String.format("Validations: Total %s, Passed %s, Failed %s", total, (total-failed), failed)); - if (failed > 0) { - StringBuilder message = new StringBuilder("The validation expressions below did not return true. Please check the report output for more info."); - for (String expression: failedExpressions) - message.append(String.format("%n ")).append(expression); - throw new UserException(message.toString()); - } - } - - /** - * create some additional output lines about the analysis - * @return a list of nodes to attach to the report as tags - */ - private List createExtraOutputTags() { - List list = new ArrayList(); - list.add(new Node("reference file",getToolkit().getArguments().referenceFile.getName(),"The reference sequence file")); - for (String binding : getToolkit().getArguments().RODBindings) - list.add(new Node("ROD binding",binding,"The reference sequence file")); - return list; - } - - - /** - * given the evaluation name, and the context, create the list of pre-pended nodes for the output system. - * Currently it expects the the following list: jexl_expression, evaluation_name, comparison_name, filter_name, - * novelty_name - * @param group the evaluation context - * @return a list of Nodes to prepend the analysis module output with - */ - private List createPrependNodeList(EvaluationContext group) { - // add the branching nodes: jexl expression, comparison track, eval track etc - Node jexlNode = new Node("jexl_expression",(group.selectExp != null) ? group.selectExp.name : "none","The jexl filtering expression"); - Node compNode = new Node("comparison_name",group.compTrackName,"The comparison track name"); - Node evalNode = new Node("evaluation_name",group.evalTrackName,"The evaluation name"); - Node filterNode = new Node("filter_name",group.filtered,"The filter name"); - Node noveltyNode = new Node("novelty_name",group.novelty,"The novelty name"); - // the ordering is important below, this is the order the columns will appear in any output format - return Arrays.asList(evalNode,compNode,jexlNode,filterNode,noveltyNode); - } - - // - // utility functions - // - - /** - * Takes an eval generated VariantContext and attempts to return the number of samples in the - * VC. If there are genotypes, it returns that value first. Otherwise it returns nSamples, if - * provided. Returns -1 if no sample counts can be obtained. - * - * @param eval - * @return - */ - public int getNSamplesForEval( VariantContext eval ) { - return eval.hasGenotypes() ? eval.getNSamples() : nSamples; - } - - public Logger getLogger() { return logger; } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvaluator.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvaluator.java deleted file mode 100755 index 1d799af3a..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvaluator.java +++ /dev/null @@ -1,101 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.apache.log4j.Logger; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; - -/** - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - *

- * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -public abstract class VariantEvaluator { -// protected boolean accumulateInterestingSites = false, printInterestingSites = false; -// protected String interestingSitePrefix = null; -// protected boolean processedASite = false; -// protected List interestingSites = new ArrayList(); - -// do we want to keep track of things that are interesting -// public void accumulateInterestingSites(boolean enable) { accumulateInterestingSites = enable; } -// public void printInterestingSites(String prefix) { printInterestingSites = true; interestingSitePrefix = prefix; } -// public boolean isAccumulatingInterestingSites() { return accumulateInterestingSites; } -// public List getInterestingSites() { return interestingSites; } - -// protected void addInterestingSite(String why, VariantContext vc) { -// if ( accumulateInterestingSites ) -// interestingSites.add(vc); -// if ( printInterestingSites ) -// System.out.printf("%40s %s%n", interestingSitePrefix, why); -// } - - public abstract String getName(); - - protected VariantEvalWalker veWalker = null; - public VariantEvaluator(VariantEvalWalker parent) { - veWalker = parent; - // don't do anything - } - - protected VariantEvalWalker getVEWalker() { - return veWalker; - } - - protected Logger getLogger() { - return veWalker.getLogger(); - } - - public abstract boolean enabled(); - //public boolean processedAnySites() { return processedASite; } - //protected void markSiteAsProcessed() { processedASite = true; } - - // Should return the number of VariantContexts expected as inputs to update. Can be 1 or 2 - public abstract int getComparisonOrder(); - - // called at all sites, regardless of eval context itself; useful for counting processed bases - public void update0(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { } - - public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return null; - } - - public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { - return update1(vc1, tracker, ref, context); - } - - - public String update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return null; - } - - public String update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { - return update2(vc1, vc2, tracker, ref, context); - } - - - /** - * override this method for any finalization of calculations after the analysis is completed - */ - public void finalizeEvaluation() {} - - // - // useful common utility routines - // - protected double rate(long n, long d) { - return n / (1.0 * Math.max(d, 1)); - } - - protected long inverseRate(long n, long d) { - return n == 0 ? 0 : d / Math.max(n, 1); - } - - protected double ratio(long num, long denom) { - return ((double)num) / (Math.max(denom, 1)); - } - -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantQualityScore.java b/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantQualityScore.java deleted file mode 100755 index ccebaac76..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantQualityScore.java +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.varianteval; - -import org.broad.tribble.util.variantcontext.Allele; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.report.tags.Analysis; -import org.broadinstitute.sting.utils.report.tags.DataPoint; -import org.broadinstitute.sting.utils.report.utils.TableType; -import org.broadinstitute.sting.utils.collections.Pair; - -import java.util.ArrayList; -import java.util.HashMap; - -/** - * @author rpoplin - * @since Apr 6, 2010 - */ - -@Analysis(name = "Variant Quality Score", description = "Shows various stats of sets of variants binned by variant quality score") -public class VariantQualityScore extends VariantEvaluator { - - // a mapping from quality score histogram bin to Ti/Tv ratio - @DataPoint(name="TiTv by Quality", description = "the Ti/Tv ratio broken out by variant quality") - TiTvStats titvStats = null; - - @DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count") - AlleleCountStats alleleCountStats = null; - - static class TiTvStats implements TableType { - final static int NUM_BINS = 20; - final HashMap> qualByIsTransition = new HashMap>(); // A hashMap holds all the qualities until we are able to bin them appropriately - final long transitionByQuality[] = new long[NUM_BINS]; - final long transversionByQuality[] = new long[NUM_BINS]; - final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out - - public Object[] getRowKeys() { - return new String[]{"sample"}; - } - - public Object[] getColumnKeys() { - final String columnKeys[] = new String[NUM_BINS]; - for( int iii = 0; iii < NUM_BINS; iii++ ) { - columnKeys[iii] = "titvBin" + iii; - } - return columnKeys; - } - - public String getName() { - return "TiTvStats"; - } - - public String getCell(int x, int y) { - return String.valueOf(titvByQuality[y]); - } - - public String toString() { - StringBuffer returnString = new StringBuffer(); - // output the ti/tv array - returnString.append("titvByQuality: "); - for( int iii = 0; iii < NUM_BINS; iii++ ) { - returnString.append(titvByQuality[iii]); - returnString.append(" "); - } - return returnString.toString(); - } - - public void incrValue( final double qual, final boolean isTransition ) { - final Integer qualKey = Math.round((float) qual); - final long numTransition = (isTransition ? 1L : 0L); - final long numTransversion = (isTransition ? 0L : 1L); - if( qualByIsTransition.containsKey(qualKey) ) { - Pair transitionPair = qualByIsTransition.get(qualKey); - transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion); - qualByIsTransition.put(qualKey, transitionPair); - } else { - qualByIsTransition.put(qualKey, new Pair(numTransition,numTransversion)); - } - } - - public void organizeTiTvTables() { - for( int iii = 0; iii < NUM_BINS; iii++ ) { - transitionByQuality[iii] = 0L; - transversionByQuality[iii] = 0L; - titvByQuality[iii] = 0.0; - } - - int maxQual = 0; - - // Calculate the maximum quality score in order to normalize and histogram - for( final Integer qual : qualByIsTransition.keySet() ) { - if( qual > maxQual ) { - maxQual = qual; - } - } - - final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); - - for( final Integer qual : qualByIsTransition.keySet() ) { - final int index = (int)Math.floor( ((double) qual) / binSize ); - if( index >= 0 ) { // BUGBUG: why is there overflow here? - Pair transitionPair = qualByIsTransition.get(qual); - transitionByQuality[index] += transitionPair.getFirst(); - transversionByQuality[index] += transitionPair.getSecond(); - } - } - - for( int iii = 0; iii < NUM_BINS; iii++ ) { - if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio - titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]); - } else { - titvByQuality[iii] = 0.0; - } - } - - } - } - - class AlleleCountStats implements TableType { - final HashMap> qualityListMap = new HashMap>(); - final HashMap qualityMap = new HashMap(); - - public Object[] getRowKeys() { - final int NUM_BINS = qualityListMap.keySet().size(); - final String rowKeys[] = new String[NUM_BINS]; - int iii = 0; - for( final Integer key : qualityListMap.keySet() ) { - rowKeys[iii] = "AC" + key; - iii++; - } - return rowKeys; - - } - - public Object[] getColumnKeys() { - return new String[]{"alleleCount","avgQual"}; - } - - public String getName() { - return "AlleleCountStats"; - } - - public String getCell(int x, int y) { - int iii = 0; - for( final Integer key : qualityListMap.keySet() ) { - if(iii == x) { - if(y == 0) { return String.valueOf(key); } - else { return String.valueOf(qualityMap.get(key)); } - } - iii++; - } - return null; - } - - public String toString() { - String returnString = ""; - // output the quality map - returnString += "AlleleCountStats: "; - //for( int iii = 0; iii < NUM_BINS; iii++ ) { - // returnString += titvByQuality[iii] + " "; - //} - return returnString; - } - - public void incrValue( final double qual, final int alleleCount ) { - ArrayList list = qualityListMap.get(alleleCount); - if(list==null) { list = new ArrayList(); } - list.add(qual); - qualityListMap.put(alleleCount, list); - } - - public void organizeAlleleCountTables() { - for( final Integer key : qualityListMap.keySet() ) { - final ArrayList list = qualityListMap.get(key); - double meanQual = 0.0; - final double numQuals = (double)list.size(); - for( Double qual : list ) { - meanQual += qual / numQuals; - } - qualityMap.put(key, meanQual); - } - } - } - - public VariantQualityScore(VariantEvalWalker parent) { - super(parent); - } - - public String getName() { - return "VariantQualityScore"; - } - - public int getComparisonOrder() { - return 1; // we only need to see each eval track - } - - public boolean enabled() { - return true; - } - - public String toString() { - return getName(); - } - - public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - final String interesting = null; - - if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) - if( titvStats == null ) { titvStats = new TiTvStats(); } - titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); - - if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); } - int alternateAlleleCount = 0; - for (final Allele a : eval.getAlternateAlleles()) { - alternateAlleleCount += eval.getChromosomeCount(a); - } - alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount); - } - - return interesting; // This module doesn't capture any interesting sites, so return null - } - - public void finalizeEvaluation() { - if( titvStats != null ) { - titvStats.organizeTiTvTables(); - } - if( alleleCountStats != null ) { - alleleCountStats.organizeAlleleCountTables(); - } - } -} \ No newline at end of file