From 22e599ec76d952e21ff2181d2e8ce3f2e22a1af4 Mon Sep 17 00:00:00 2001 From: kiran Date: Wed, 26 Jan 2011 17:38:21 +0000 Subject: [PATCH] Fixed output report to properly handle evaluation modules with TableType objects. Promoted CpG to a standard stratification. Demoted Filter to a non-standard stratification. Now, if the filter stratification is not specified, VariantEval only evaluates PASSing sites. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5084 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/report/GATKReportTable.java | 2 +- .../newvarianteval/NewVariantEvalWalker.java | 135 ++- .../evaluators/CompEvalGenotypes.java | 4 +- .../evaluators/GenotypeConcordance.java | 793 ++++++++++++++++++ .../evaluators/GenotypePhasingEvaluator.java | 422 ++++++++++ .../evaluators/IndelLengthHistogram.java | 112 +++ .../evaluators/IndelMetricsByAC.java | 222 +++++ .../evaluators/IndelStatistics.java | 514 ++++++++++++ .../MendelianViolationEvaluator.java | 186 ++++ .../newvarianteval/evaluators/PhaseStats.java | 4 +- .../evaluators/PrintMissingComp.java | 67 ++ .../evaluators/SamplePreviousGenotypes.java | 4 +- .../evaluators/SimpleMetricsByAC.java | 178 ++++ .../evaluators/VariantEvaluator.java | 8 + .../evaluators/VariantQualityScore.java | 439 +++++----- .../newvarianteval/stratifications/CpG.java | 2 +- .../stratifications/Filter.java | 2 +- .../util/NewEvaluationContext.java | 9 +- 18 files changed, 2857 insertions(+), 246 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/GenotypeConcordance.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/GenotypePhasingEvaluator.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelLengthHistogram.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelMetricsByAC.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelStatistics.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/MendelianViolationEvaluator.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/PrintMissingComp.java create mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/SimpleMetricsByAC.java diff --git a/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 90bef9f35..c5203f254 100755 --- a/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -132,7 +132,7 @@ public class GATKReportTable { } /** - * Add a primary key column. This becomes the unique identifier for every column in the table, and will always be printed as the first column. + * Add a primary key column. This becomes the unique identifier for every column in the table. * * @param primaryKeyName the name of the primary key column */ diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/NewVariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/NewVariantEvalWalker.java index 912a5ee40..4cb5d3d72 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/NewVariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/NewVariantEvalWalker.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.report.utils.TableType; import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.io.PrintStream; @@ -79,6 +80,10 @@ public class NewVariantEvalWalker extends RodWalker implements @Argument(fullName="doNotUseAllStandardModules", shortName="noEV", doc="Do not use the standard modules by default (instead, only those that are specified with the -E option)") protected Boolean NO_STANDARD_MODULES = false; + // Other arguments + @Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false) + protected double MIN_PHASE_QUALITY = 10.0; + // Variables private Set jexlExpressions = new TreeSet(); private Set compNames = new TreeSet(); @@ -258,16 +263,16 @@ public class NewVariantEvalWalker extends RodWalker implements } else { HashMap necs = new HashMap(); - StateKey statekey = new StateKey(); + StateKey stateKey = new StateKey(); for ( VariantStratifier vs : ec.keySet() ) { String state = ec.get(vs); - statekey.put(vs.getClass().getSimpleName(), state); + stateKey.put(vs.getClass().getSimpleName(), state); } - ec.addEvaluationClassList(evaluationObjects); + ec.addEvaluationClassList(this, stateKey, evaluationObjects); - necs.put(statekey, ec); + necs.put(stateKey, ec); return necs; } @@ -301,11 +306,31 @@ public class NewVariantEvalWalker extends RodWalker implements table.addColumn(columnName, "unknown"); } - AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); - Map datamap = scanner.getData(); + try { + VariantEvaluator vei = ve.newInstance(); + vei.initialize(this); - for (Field field : datamap.keySet()) { - table.addColumn(field.getName(), 0.0); + AnalysisModuleScanner scanner = new AnalysisModuleScanner(vei); + Map datamap = scanner.getData(); + + for (Field field : datamap.keySet()) { + field.setAccessible(true); + + if (field.get(vei) instanceof TableType) { + TableType t = (TableType) field.get(vei); + + for ( Object o : t.getColumnKeys() ) { + String c = (String) o; + table.addColumn(c, 0.0); + } + } else { + table.addColumn(field.getName(), 0.0); + } + } + } catch (InstantiationException e) { + throw new StingException("InstantiationException: " + e); + } catch (IllegalAccessException e) { + throw new StingException("IllegalAccessException: " + e); } } @@ -385,6 +410,13 @@ public class NewVariantEvalWalker extends RodWalker implements return allowableTypes; } + /** + * Subset a VariantContext to a single sample + * + * @param vc the VariantContext object containing multiple samples + * @param sampleName the sample to pull out of the VariantContext + * @return a new VariantContext with just the requested sample + */ private VariantContext getSubsetOfVariantContext(VariantContext vc, String sampleName) { ArrayList sampleNames = new ArrayList(); sampleNames.add(sampleName); @@ -392,6 +424,13 @@ public class NewVariantEvalWalker extends RodWalker implements return getSubsetOfVariantContext(vc, sampleNames); } + /** + * Subset a VariantContext to a set of samples + * + * @param vc the VariantContext object containing multiple samples + * @param sampleNames the samples to pull out of the VariantContext + * @return a new VariantContext with just the requested samples + */ private VariantContext getSubsetOfVariantContext(VariantContext vc, Collection sampleNames) { VariantContext vcsub = vc.subContextFromGenotypes(vc.getGenotypes(sampleNames).values()); @@ -424,7 +463,7 @@ public class NewVariantEvalWalker extends RodWalker implements * @param allowableTypes a set of allowable variation types * @return a mapping of track names to a list of VariantContext objects */ - private HashMap> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set trackNames, Set sampleNames, EnumSet allowableTypes) { + private HashMap> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set trackNames, Set sampleNames, EnumSet allowableTypes, boolean byFilter) { HashMap> bindings = new HashMap>(); for ( String trackName : trackNames ) { @@ -445,14 +484,17 @@ public class NewVariantEvalWalker extends RodWalker implements sampleNamesMinusAll.add(sampleName); } - vcs.put(sampleName, vcsub); + if (byFilter || !vcsub.isFiltered()) { + vcs.put(sampleName, vcsub); + } } if ( trackName.contains("eval") ) { - //logger.info(sampleNamesMinusAll); vc = getSubsetOfVariantContext(vc, sampleNamesMinusAll); - //logger.info(vc); - vcs.put(ALL_SAMPLE_NAME, vc); + + if (byFilter || !vc.isFiltered()) { + vcs.put(ALL_SAMPLE_NAME, vc); + } } bindings.put(trackName, vcs); @@ -481,24 +523,25 @@ public class NewVariantEvalWalker extends RodWalker implements EnumSet allowableTypes = getAllowableVariationTypes(tracker, ref, evalNames); - HashMap> compBindings = bindVariantContexts(tracker, ref, compNames, allSamplesList, allowableTypes); - - HashMap> evalBindings; - boolean perSampleIsEnabled = false; + boolean byFilter = false; for (VariantStratifier vs : stratificationObjects) { if (vs.getClass().getSimpleName().equals("Sample")) { perSampleIsEnabled = true; - break; + } else if (vs.getClass().getSimpleName().equals("Filter")) { + byFilter = true; } } + HashMap> evalBindings; if (perSampleIsEnabled) { - evalBindings = bindVariantContexts(tracker, ref, evalNames, sampleNames, allowableTypes); + evalBindings = bindVariantContexts(tracker, ref, evalNames, sampleNames, allowableTypes, byFilter); } else { - evalBindings = bindVariantContexts(tracker, ref, evalNames, allSamplesList, allowableTypes); + evalBindings = bindVariantContexts(tracker, ref, evalNames, allSamplesList, allowableTypes, byFilter); } + HashMap> compBindings = bindVariantContexts(tracker, ref, compNames, allSamplesList, allowableTypes, byFilter); + vcs.putAll(compBindings); vcs.putAll(evalBindings); @@ -552,6 +595,22 @@ public class NewVariantEvalWalker extends RodWalker implements return stateKeys; } + /** + * Return the number of samples being used + * @return the number of samples + */ + public int getNumSamples() { + return sampleNames.size(); + } + + /** + * Return the minimum phasing quality to be used with the GenotypePhasingEvaluator module + * @return the minimum phasing quality + */ + public double getMinPhaseQuality() { + return MIN_PHASE_QUALITY; + } + /** * Collect relevant information from each variant in the supplied VCFs */ @@ -638,11 +697,40 @@ public class NewVariantEvalWalker extends RodWalker implements for ( VariantEvaluator ve : nec.getEvaluationClassList().values() ) { ve.finalizeEvaluation(); + GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve); Map datamap = scanner.getData(); - GATKReportTable table = report.getTable(ve.getClass().getSimpleName()); + // handle TableTypes + for (Field field : datamap.keySet()) { + try { + field.setAccessible(true); + + if (field.get(ve) instanceof TableType) { + TableType t = (TableType) field.get(ve); + + for (int row = 0; row < t.getRowKeys().length; row++) { + String r = (String) t.getRowKeys()[row]; + + for ( VariantStratifier vs : stratificationObjects ) { + String columnName = vs.getClass().getSimpleName(); + + table.set(stateKey.toString() + r, columnName, stateKey.get(vs.getClass().getSimpleName())); + } + + for (int col = 0; col < t.getColumnKeys().length; col++) { + String c = (String) t.getColumnKeys()[col]; + + String newStateKey = stateKey.toString() + r; + table.set(newStateKey, c, t.getCell(row, col)); + } + } + } + } catch (IllegalAccessException e) { + throw new StingException("IllegalAccessException: " + e); + } + } for ( VariantStratifier vs : stratificationObjects ) { String columnName = vs.getClass().getSimpleName(); @@ -653,7 +741,10 @@ public class NewVariantEvalWalker extends RodWalker implements for (Field field : datamap.keySet()) { try { field.setAccessible(true); - table.set(stateKey.toString(), field.getName(), field.get(ve)); + + if ( !(field.get(ve) instanceof TableType) ) { + table.set(stateKey.toString(), field.getName(), field.get(ve)); + } } catch (IllegalAccessException e) { throw new ReviewedStingException("Unable to access a data field in a VariantEval analysis module: " + e); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/CompEvalGenotypes.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/CompEvalGenotypes.java index 641878acd..e08b8555c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/CompEvalGenotypes.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/CompEvalGenotypes.java @@ -3,12 +3,12 @@ package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluato import org.broad.tribble.util.variantcontext.Genotype; import org.broadinstitute.sting.utils.GenomeLoc; -class CompEvalGenotypes { +class NewCompEvalGenotypes { private GenomeLoc loc; private Genotype compGt; private Genotype evalGt; - public CompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) { + public NewCompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) { this.loc = loc; this.compGt = compGt; this.evalGt = evalGt; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/GenotypeConcordance.java new file mode 100755 index 000000000..0a8c1aae5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/GenotypeConcordance.java @@ -0,0 +1,793 @@ +package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators; + +import org.apache.commons.lang.ArrayUtils; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFConstants; +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.varianteval.*; +import org.broadinstitute.sting.gatk.walkers.varianteval.StandardEval; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.Analysis; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.DataPoint; +import org.broadinstitute.sting.utils.report.utils.TableType; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.*; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +@Analysis(name = "Genotype Concordance", description = "Determine the genotype concordance between the genotypes in difference tracks") +public class GenotypeConcordance extends org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvaluator implements StandardEval { + private static final boolean PRINT_INTERESTING_SITES = true; + + protected final static Logger logger = Logger.getLogger(GenotypeConcordance.class); + + // a mapping from allele count to stats + @DataPoint(description = "the frequency statistics for each allele") + FrequencyStats alleleFreqStats = new FrequencyStats(); + + // a mapping from sample to stats + @DataPoint(description = "the concordance statistics for each sample") + SampleStats sampleStats = null; + + // a mapping from sample to stats summary + @DataPoint(description = "the concordance statistics summary for each sample") + SampleSummaryStats sampleSummaryStats = null; + + // two histograms of variant quality scores, for true positive and false positive calls + @DataPoint(description = "the variant quality score histograms for true positive and false positive calls") + QualityScoreHistograms qualityScoreHistograms = null; + + @DataPoint(description = "the concordance statistics summary by allele count") + ACSummaryStats alleleCountSummary = null; + + @DataPoint(description = "the concordance statistics by allele count") + ACStats alleleCountStats = null; + + private static final int MAX_MISSED_VALIDATION_DATA = 100; + + private boolean discordantInteresting = false; + + private VariantEvalWalker.EvaluationContext group = null; + + static class FrequencyStats implements TableType { + class Stats { + public Stats(int found, int missed) { nFound = found; nMissed = missed; } + public long nFound = 0; + public long nMissed = 0; + } + public HashMap foundMissedByAC = new HashMap(); + + public Object[] getRowKeys() { + String rows[] = new String[foundMissedByAC.size()]; + int index = 0; + for (int i : foundMissedByAC.keySet()) rows[index++] = "Allele Count " + i; + return rows; + } + + public Object[] getColumnKeys() { + return new String[]{"number_found", "number_missing"}; + } + + public String getName() { + return "FrequencyStats"; + } + + public String getCell(int x, int y) { + if (x >= foundMissedByAC.size()) throw new IllegalStateException(x + " is greater than the max index of " + (foundMissedByAC.size()-1)); + if (y == 0) return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nFound); + else return String.valueOf(foundMissedByAC.get(foundMissedByAC.keySet().toArray(new Integer[foundMissedByAC.size()])[x]).nMissed); + } + + public void incrementFoundCount(int alleleFreq) { + if (!foundMissedByAC.containsKey(alleleFreq)) + foundMissedByAC.put(alleleFreq,new Stats(1,0)); + else + foundMissedByAC.get(alleleFreq).nFound++; + } + + public void incrementMissedCount(int alleleFreq) { + if (!foundMissedByAC.containsKey(alleleFreq)) + foundMissedByAC.put(alleleFreq,new Stats(0,1)); + else + foundMissedByAC.get(alleleFreq).nMissed++; + } + } + + static class QualityScoreHistograms implements TableType { + final static int NUM_BINS = 20; + final HashMap truePositiveQualityScoreMap = new HashMap(); // A HashMap holds all the quality scores until we are able to bin them appropriately + final HashMap falsePositiveQualityScoreMap = new HashMap(); + final int truePositiveHist[] = new int[NUM_BINS]; // the final histograms that get reported out + final int falsePositiveHist[] = new int[NUM_BINS]; + final String[] rowKeys = new String[]{"true_positive_hist", "false_positive_hist"}; + + public Object[] getRowKeys() { + return rowKeys; + } + + public Object[] getColumnKeys() { + final String columnKeys[] = new String[NUM_BINS]; + for( int iii = 0; iii < NUM_BINS; iii++ ) { + columnKeys[iii] = "histBin" + iii; + } + return columnKeys; + } + + public String getName() { + return "QualityScoreHistogram"; + } + + public String getCell(int x, int y) { + if( x == 0 ) { + return String.valueOf(truePositiveHist[y]); + } else if ( x == 1 ) { + return String.valueOf(falsePositiveHist[y]); + } else { + throw new ReviewedStingException( "Unknown row in " + getName() + ", row = " + x ); + } + } + + public String toString() { + String returnString = ""; + // output both histogram arrays + returnString += "TP: "; + for( int iii = 0; iii < NUM_BINS; iii++ ) { + returnString += truePositiveHist[iii] + " "; + } + returnString += "\nFP: "; + for( int iii = 0; iii < NUM_BINS; iii++ ) { + returnString += falsePositiveHist[iii] + " "; + } + return returnString; + } + + public void incrValue( final double qual, final boolean isTruePositiveCall ) { + HashMap qualScoreMap; + if( isTruePositiveCall ) { + qualScoreMap = truePositiveQualityScoreMap; + } else { + qualScoreMap = falsePositiveQualityScoreMap; + } + final Integer qualKey = Math.round((float) qual); + if( qualScoreMap.containsKey(qualKey) ) { + qualScoreMap.put(qualKey, qualScoreMap.get(qualKey) + 1); + } else { + qualScoreMap.put(qualKey, 1); + } + } + + public void organizeHistogramTables() { + for( int iii = 0; iii < NUM_BINS; iii++ ) { + truePositiveHist[iii] = 0; + falsePositiveHist[iii] = 0; + } + + int maxQual = 0; + + // Calculate the maximum quality score for both TP and FP calls in order to normalize and histogram + for( final Integer qual : truePositiveQualityScoreMap.keySet()) { + if( qual > maxQual ) { + maxQual = qual; + } + } + for( final Integer qual : falsePositiveQualityScoreMap.keySet()) { + if( qual > maxQual ) { + maxQual = qual; + } + } + + final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); //BUGBUG: should be normalized max to min, not max to 0 + + for( final Integer qual : truePositiveQualityScoreMap.keySet()) { + final int index = (int)Math.floor( ((double)qual) / binSize ); + if(index >= 0) { //BUGBUG: problem when maxQual is zero? + truePositiveHist[ index ] += truePositiveQualityScoreMap.get(qual); + } + } + for( final Integer qual : falsePositiveQualityScoreMap.keySet()) { + final int index = (int)Math.floor( ((double)qual) / binSize ); + if(index >= 0) { + falsePositiveHist[ index ] += falsePositiveQualityScoreMap.get(qual); + } + } + } + } + + // keep a list of the validation data we saw before the first eval data + private HashSet missedValidationData = new HashSet(); + + + //public GenotypeConcordance(VariantEvalWalker parent) { + // super(parent); + // discordantInteresting = parent.DISCORDANT_INTERESTING; + //} + + public String getName() { + return "genotypeConcordance"; + } + + public int getComparisonOrder() { + return 2; // we need to see each eval track and each comp track + } + + public boolean enabled() { + return true; + } + + public String toString() { + return getName() + ": "; + } + + private boolean warnedAboutValidationData = false; + + public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { + this.group = group; + + String interesting = null; + + // sanity check that we at least have either eval or validation data + if (eval == null && !isValidVC(validation)) { + return interesting; + } + + if( qualityScoreHistograms == null ) { + qualityScoreHistograms = new QualityScoreHistograms(); + } + + if ( alleleCountStats == null && eval != null && validation != null ) { + alleleCountStats = new ACStats(eval,validation,Genotype.Type.values().length); + alleleCountSummary = new ACSummaryStats(eval, validation); + } + + if (sampleStats == null) { + if (eval != null) { + // initialize the concordance table + sampleStats = new SampleStats(eval,Genotype.Type.values().length); + sampleSummaryStats = new SampleSummaryStats(eval); + for (final VariantContext vc : missedValidationData) { + determineStats(null, vc); + } + missedValidationData = null; + } else { + // todo -- Eric, this results in a memory problem when eval is WEx data but you are using CG calls genome-wide + // todo -- perhaps you need should extend the evaluators with an initialize + // todo -- method that gets the header (or samples) for the first eval sites? + if (missedValidationData.size() > MAX_MISSED_VALIDATION_DATA) { + if (!warnedAboutValidationData) { + //logger.warn("Too many genotype sites missed before eval site appeared; ignoring"); + warnedAboutValidationData = true; + } + } else { + missedValidationData.add(validation); + } + return interesting; + } + } + + interesting = determineStats(eval, validation); + + return interesting; // we don't capture any interesting sites + } + + private String determineStats(final VariantContext eval, final VariantContext validation) { + + String interesting = null; + + final boolean validationIsValidVC = isValidVC(validation); + final String evalAC = ( vcHasGoodAC(eval) ) ? String.format("evalAC%d",getAC(eval)) : null ; + final String validationAC = ( vcHasGoodAC(validation) ) ? String.format("compAC%d",getAC(validation)) : null; + + // determine concordance for eval data + if (eval != null) { + for (final String sample : eval.getGenotypes().keySet()) { + final Genotype.Type called = eval.getGenotype(sample).getType(); + final Genotype.Type truth; + + if (!validationIsValidVC || !validation.hasGenotype(sample)) { + truth = Genotype.Type.NO_CALL; + } else { + truth = validation.getGenotype(sample).getType(); + // interesting = "ConcordanceStatus=FP"; + if (discordantInteresting && truth.ordinal() != called.ordinal()) + { + interesting = "ConcordanceStatus=" + truth + "/" + called; + } + } + + sampleStats.incrValue(sample, truth, called); + if ( evalAC != null && validationAC != null) { + alleleCountStats.incrValue(evalAC,truth,called); + alleleCountStats.incrValue(validationAC,truth,called); + } + } + } + // otherwise, mark no-calls for all samples + else { + final Genotype.Type called = Genotype.Type.NO_CALL; + + for (final String sample : validation.getGenotypes().keySet()) { + final Genotype.Type truth = validation.getGenotype(sample).getType(); + sampleStats.incrValue(sample, truth, called); + if ( evalAC != null ) { + alleleCountStats.incrValue(evalAC,truth,called); + } + // print out interesting sites + /* + if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) { + if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) { + super.getVEWalker().gcLog.printf("%s FN %s%n", group, validation); + } + if ( (called == Genotype.Type.HOM_VAR || called == Genotype.Type.HET) && truth == Genotype.Type.HOM_REF ) { + super.getVEWalker().gcLog.printf("%s FP %s%n", group, validation); + } + } + */ + } + } + + // determine allele count concordance () // this is really a FN rate estimate -- CH + if (validationIsValidVC && validation.isPolymorphic()) { + int trueAlleleCount = 0; + for (final Allele a : validation.getAlternateAlleles()) { + trueAlleleCount += validation.getChromosomeCount(a); + } + if (eval != null) { + alleleFreqStats.incrementFoundCount(trueAlleleCount); + } else { + alleleFreqStats.incrementMissedCount(trueAlleleCount); + } + } + + // TP & FP quality score histograms + if( eval != null && eval.isPolymorphic() && validationIsValidVC ) { + if( eval.getGenotypes().keySet().size() == 1 ) { // single sample calls + for( final String sample : eval.getGenotypes().keySet() ) { // only one sample + if( validation.hasGenotype(sample) ) { + final Genotype truth = validation.getGenotype(sample); + qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), !truth.isHomRef() ); + } + } + } else { // multi sample calls + qualityScoreHistograms.incrValue( eval.getPhredScaledQual(), validation.isPolymorphic() ); + } + + } + + return interesting; + } + + private static boolean isValidVC(final VariantContext vc) { + return (vc != null && !vc.isFiltered()); + } + + public void finalizeEvaluation() { + if( qualityScoreHistograms != null ) { + qualityScoreHistograms.organizeHistogramTables(); + } + + if( sampleSummaryStats != null && sampleStats != null ) { + sampleSummaryStats.generateSampleSummaryStats( sampleStats ); + } + + if ( alleleCountSummary != null && alleleCountStats != null ) { + alleleCountSummary.generateSampleSummaryStats( alleleCountStats ); + } + } + + private boolean vcHasGoodAC(VariantContext vc) { + return ( vc != null && vc.getAlternateAlleles().size() == 1 && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ); + + } + + private int getAC(VariantContext vc) { + if ( List.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass()) ) { + return ((List) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY)).get(0); + } else if ( Integer.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass())) { + return (Integer) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); + } else if ( String.class.isAssignableFrom(vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass()) ) { + // two ways of parsing + String ac = (String) vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY); + if ( ac.startsWith("[") ) { + return Integer.parseInt(ac.replaceAll("\\[","").replaceAll("\\]","")); + } else { + try { + return Integer.parseInt(ac); + } catch ( NumberFormatException e ) { + throw new UserException(String.format("The format of the AC field is improperly formatted: AC=%s",ac)); + } + } + } else { + throw new UserException(String.format("The format of the AC field does not appear to be of integer-list or String format, class was %s",vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY).getClass())); + } + } +} + +/** + * a table of sample names to genotype concordance figures + */ +class SampleStats implements TableType { + private final int nGenotypeTypes; + + // sample to concordance stats object + public final HashMap concordanceStats = new HashMap(); + + /** + * + * @return one row per sample + */ + public Object[] getRowKeys() { + return concordanceStats.keySet().toArray(new String[concordanceStats.size()]); + } + + /** + * increment the specified value + * @param sample the sample name + * @param truth the truth type + * @param called the called type + */ + public void incrValue(String sample, Genotype.Type truth, Genotype.Type called) { + if ( concordanceStats.containsKey(sample) ) + concordanceStats.get(sample)[truth.ordinal()][called.ordinal()]++; + else if ( called != Genotype.Type.NO_CALL ) + throw new UserException.CommandLineException("Sample " + sample + " has not been seen in a previous eval; this analysis module assumes that all samples are present in each variant context"); + } + + /** + * get the column keys + * @return a list of objects, in this case strings, that are the column names + */ + public Object[] getColumnKeys() { + return new String[]{"total_true_ref","%_ref/ref","n_ref/no-call", + "n_ref/ref","n_ref/het","n_ref/hom", + "total_true_het","%_het/het","n_het/no-call", + "n_het/ref","n_het/het","n_het/hom", + "total_true_hom","%_hom/hom","n_hom/no-call", + "n_hom/ref","n_hom/het","n_hom/hom"}; + } + + + public SampleStats(VariantContext vc, int nGenotypeTypes) { + this.nGenotypeTypes = nGenotypeTypes; + for (String sample : vc.getGenotypes().keySet()) + concordanceStats.put(sample, new long[nGenotypeTypes][nGenotypeTypes]); + } + + public SampleStats(int genotypeTypes) { + nGenotypeTypes = genotypeTypes; + } + + public Object getCell(int x, int y) { + // we have three rows of 6 right now for output (rows: ref, het, hom) + Genotype.Type type = Genotype.Type.values()[(y/6)+1]; // get the row type + // save some repeat work, get the total every time + long total = 0; + Object[] rowKeys = getRowKeys(); + for (int called = 0; called < nGenotypeTypes; called++) { + total += concordanceStats.get(rowKeys[x])[type.ordinal()][called]; + } + + // now get the cell they're interested in + switch (y % 6) { + case (0): // get the total_true for this type + return total; + case (1): + return total == 0 ? 0.0 : (100.0 * (double) concordanceStats.get(rowKeys[x])[type.ordinal()][type.ordinal()] / (double) total); + default: + return concordanceStats.get(rowKeys[x])[type.ordinal()][(y % 6) - 2]; + } + } + + public String getName() { + return "Sample Statistics"; + } +} + +/** + * Sample stats, but for AC + */ +class ACStats extends SampleStats { + private String[] rowKeys; + + public ACStats(VariantContext evalvc, VariantContext compvc, int nGenotypeTypes) { + super(nGenotypeTypes); + rowKeys = new String[1+2*evalvc.getGenotypes().size()+1+2*compvc.getGenotypes().size()]; + for ( int i = 0; i <= 2*evalvc.getGenotypes().size(); i++ ) { // todo -- assuming ploidy 2 here... + concordanceStats.put(String.format("evalAC%d",i),new long[nGenotypeTypes][nGenotypeTypes]); + rowKeys[i] = String.format("evalAC%d",i); + + } + + for ( int i = 0; i <= 2*compvc.getGenotypes().size(); i++ ) { + concordanceStats.put(String.format("compAC%d",i), new long[nGenotypeTypes][nGenotypeTypes]); + rowKeys[1+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); + } + } + + public String getName() { + return "Allele Count Statistics"; + } + + public Object[] getRowKeys() { + if ( rowKeys == null ) { + throw new StingException("RowKeys is null!"); + } + return rowKeys; + } +} + +/** + * a table of sample names to genotype concordance summary statistics + */ +class SampleSummaryStats implements TableType { + protected final static String ALL_SAMPLES_KEY = "allSamples"; + protected final static String[] COLUMN_KEYS = new String[]{ + "percent_comp_ref_called_var", + "percent_comp_het_called_het", + "percent_comp_het_called_var", + "percent_comp_hom_called_hom", + "percent_comp_hom_called_var", + "percent_non-reference_sensitivity", + "percent_overall_genotype_concordance", + "percent_non-reference_discrepancy_rate"}; + + // sample to concordance stats object + protected final HashMap concordanceSummary = new HashMap(); + + /** + * + * @return one row per sample + */ + public Object[] getRowKeys() { + return concordanceSummary.keySet().toArray(new String[concordanceSummary.size()]); + } + + /** + * get the column keys + * @return a list of objects, in this case strings, that are the column names + */ + public Object[] getColumnKeys() { + return COLUMN_KEYS; + } + + public SampleSummaryStats(final VariantContext vc) { + concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); + for( final String sample : vc.getGenotypes().keySet() ) { + concordanceSummary.put(sample, new double[COLUMN_KEYS.length]); + } + } + + public SampleSummaryStats() { + + } + + public Object getCell(int x, int y) { + final Object[] rowKeys = getRowKeys(); + return String.format("%.2f",concordanceSummary.get(rowKeys[x])[y]); + } + + /** + * Helper routine that sums up all columns / rows found in stats specified by all pairs in d1 x d2 + * + * @param stats + * @param d1 + * @param d2 + * @return + */ + private long sumStatsAllPairs( final long[][] stats, EnumSet d1, EnumSet d2 ) { + long sum = 0L; + + for(final Genotype.Type e1 : d1 ) { + for(final Genotype.Type e2 : d2 ) { + sum += stats[e1.ordinal()][e2.ordinal()]; + } + } + + return sum; + } + + private long sumStatsDiag( final long[][] stats, EnumSet d1) { + long sum = 0L; + + for(final Genotype.Type e1 : d1 ) { + sum += stats[e1.ordinal()][e1.ordinal()]; + } + + return sum; + } + + private double ratio(long numer, long denom) { + return denom != 0L ? 100.0 * ( ((double)numer) / ((double)denom) ) : 0.0; + } + + final long[] allSamplesNumerators = new long[COLUMN_KEYS.length]; + final long[] allSamplesDenominators = new long[COLUMN_KEYS.length]; + + private void updateSummaries(int i, double[] summary, long numer, long denom ) { + allSamplesNumerators[i] += numer; + allSamplesDenominators[i] += denom; + summary[i] = ratio(numer, denom); + } + + + /** + * Calculate the five summary stats per sample + * @param sampleStats The Map which holds concordance values per sample + */ + public void generateSampleSummaryStats( final SampleStats sampleStats ) { + EnumSet allVariantGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET); + EnumSet allCalledGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET, Genotype.Type.HOM_REF); + EnumSet allGenotypes = EnumSet.allOf(Genotype.Type.class); + + for( final String sample : concordanceSummary.keySet() ) { + if ( sample.equals(ALL_SAMPLES_KEY) ) continue; + + final long[][] stats = sampleStats.concordanceStats.get(sample); + final double[] summary = concordanceSummary.get(sample); + if( stats == null ) { throw new ReviewedStingException( "SampleStats and SampleSummaryStats contain different samples! sample = " + sample ); } + + long numer, denom; + + // Summary 0: % ref called as var + numer = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_REF), allVariantGenotypes); + denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_REF), allGenotypes); + updateSummaries(0, summary, numer, denom); + + // Summary 1: % het called as het + numer = stats[Genotype.Type.HET.ordinal()][Genotype.Type.HET.ordinal()]; + denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HET), allGenotypes); + updateSummaries(1, summary, numer, denom); + + // Summary 2: % het called as var + numer = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HET), allVariantGenotypes); + denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HET), allGenotypes); + updateSummaries(2, summary, numer, denom); + + // Summary 3: % homVar called as homVar + numer = stats[Genotype.Type.HOM_VAR.ordinal()][Genotype.Type.HOM_VAR.ordinal()]; + denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_VAR), allGenotypes); + updateSummaries(3, summary, numer, denom); + + // Summary 4: % homVars called as var + numer = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_VAR), allVariantGenotypes); + denom = sumStatsAllPairs(stats, EnumSet.of(Genotype.Type.HOM_VAR), allGenotypes); + updateSummaries(4, summary, numer, denom); + + // Summary 5: % non-ref called as non-ref + // MAD: this is known as the non-reference sensitivity (# non-ref according to comp found in eval / # non-ref in comp) + numer = sumStatsAllPairs(stats, allVariantGenotypes, allVariantGenotypes); + denom = sumStatsAllPairs(stats, allVariantGenotypes, allGenotypes); + updateSummaries(5, summary, numer, denom); + + // Summary 6: overall genotype concordance of sites called in eval track + // MAD: this is the tradition genotype concordance + numer = sumStatsDiag(stats, allCalledGenotypes); + denom = sumStatsAllPairs(stats, allCalledGenotypes, allCalledGenotypes); + updateSummaries(6, summary, numer, denom); + + // Summary 7: overall genotype concordance of sites called non-ref in eval track + long homrefConcords = stats[Genotype.Type.HOM_REF.ordinal()][Genotype.Type.HOM_REF.ordinal()]; + long diag = sumStatsDiag(stats, allVariantGenotypes); + long allNoHomRef = sumStatsAllPairs(stats, allCalledGenotypes, allCalledGenotypes) - homrefConcords; + numer = allNoHomRef - diag; + denom = allNoHomRef; + updateSummaries(7, summary, numer, denom); + } + + // update the final summary stats + final double[] allSamplesSummary = concordanceSummary.get(ALL_SAMPLES_KEY); + for ( int i = 0; i < allSamplesSummary.length; i++) { + allSamplesSummary[i] = ratio(allSamplesNumerators[i], allSamplesDenominators[i]); + } + + } + + public String getName() { + return "Sample Summary Statistics"; + } +} + +/** + * SampleSummaryStats .. but for allele counts + */ +class ACSummaryStats extends SampleSummaryStats { + private String[] rowKeys; + + public ACSummaryStats (final VariantContext evalvc, final VariantContext compvc) { + concordanceSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); + rowKeys = new String[3+2*evalvc.getGenotypes().size() + 2*compvc.getGenotypes().size()]; + rowKeys[0] = ALL_SAMPLES_KEY; + for( int i = 0; i <= 2*evalvc.getGenotypes().size() ; i ++ ) { + concordanceSummary.put(String.format("evalAC%d",i), new double[COLUMN_KEYS.length]); + rowKeys[i+1] = String.format("evalAC%d",i); + } + for( int i = 0; i <= 2*compvc.getGenotypes().size() ; i ++ ) { + concordanceSummary.put(String.format("compAC%d",i), new double[COLUMN_KEYS.length]); + rowKeys[2+2*evalvc.getGenotypes().size()+i] = String.format("compAC%d",i); + } + + } + + public String getName() { + return "Allele Count Summary Statistics"; + } + + public Object[] getRowKeys() { + if ( rowKeys == null) { + throw new StingException("rowKeys is null!!"); + } + return rowKeys; + } +} + +class CompACNames implements Comparator{ + + final Logger myLogger; + private boolean info = true; + + public CompACNames(Logger l) { + myLogger = l; + } + + public boolean equals(Object o) { + return ( o.getClass() == CompACNames.class ); + } + + public int compare(Object o1, Object o2) { + if ( info ) { + myLogger.info("Sorting AC names"); + info = false; + } + //System.out.printf("Objects %s %s get ranks %d %d%n",o1.toString(),o2.toString(),getRank(o1),getRank(o2)); + return getRank(o1) - getRank(o2); + } + + public int getRank(Object o) { + if ( o.getClass() != String.class ) { + return Integer.MIN_VALUE/4; + } else { + String s = (String) o; + if ( s.startsWith("eval") ) { + return Integer.MIN_VALUE/4 + 1 + parseAC(s); + } else if ( s.startsWith("comp") ) { + return 1+ parseAC(s); + } else { + return Integer.MIN_VALUE/4; + } + } + } + + public int parseAC(String s) { + String[] g = s.split("AC"); + return Integer.parseInt(g[1]); + } +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/GenotypePhasingEvaluator.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/GenotypePhasingEvaluator.java new file mode 100755 index 000000000..028a9f274 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/GenotypePhasingEvaluator.java @@ -0,0 +1,422 @@ +package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators; + +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFConstants; +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.phasing.*; +import org.broadinstitute.sting.gatk.walkers.varianteval.*; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.NewVariantEvalWalker; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.Analysis; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.DataPoint; +import org.broadinstitute.sting.utils.report.utils.TableType; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.*; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +@Analysis(name = "Genotype Phasing Evaluation", description = "Evaluates the phasing of genotypes in different tracks") +public class GenotypePhasingEvaluator extends org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvaluator { + protected final static Logger logger = Logger.getLogger(GenotypePhasingEvaluator.class); + + // a mapping from sample to stats + @DataPoint(description = "the phasing statistics for each sample") + SamplePhasingStatistics samplePhasingStatistics = null; + + SamplePreviousGenotypes samplePrevGenotypes = null; + + double minPhaseQuality = 10.0; + + public void initialize(NewVariantEvalWalker walker) { + this.samplePhasingStatistics = new SamplePhasingStatistics(walker.getMinPhaseQuality()); + this.samplePrevGenotypes = new SamplePreviousGenotypes(); + } + + public String getName() { + return "GenotypePhasingEvaluator"; + } + + public int getComparisonOrder() { + return 2; // we only need to see pairs of (comp, eval) + } + + public boolean enabled() { + return true; + } + + public String toString() { + return getName() + ":
"; + } + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { + Reasons interesting = new Reasons(); + if (ref == null) + return interesting.toString(); + GenomeLoc curLocus = ref.getLocus(); + + logger.debug("update2() locus: " + curLocus); + logger.debug("comp = " + comp + " eval = " + eval); + + Set allSamples = new HashSet(); + + Map compSampGenotypes = null; + if (isRelevantToPhasing(comp)) { + allSamples.addAll(comp.getSampleNames()); + compSampGenotypes = comp.getGenotypes(); + } + + Map evalSampGenotypes = null; + if (isRelevantToPhasing(eval)) { + allSamples.addAll(eval.getSampleNames()); + evalSampGenotypes = eval.getGenotypes(); + } + + for (String samp : allSamples) { + logger.debug("sample = " + samp); + + Genotype compSampGt = null; + if (compSampGenotypes != null) + compSampGt = compSampGenotypes.get(samp); + + Genotype evalSampGt = null; + if (evalSampGenotypes != null) + evalSampGt = evalSampGenotypes.get(samp); + + if (compSampGt == null || evalSampGt == null) { // Since either comp or eval (or both) are missing the site, the best we can do is hope to preserve phase [if the non-missing one preserves phase] + // Having an unphased site breaks the phasing for the sample [does NOT permit "transitive phasing"] - hence, must reset phasing knowledge for both comp and eval [put a null CompEvalGenotypes]: + if (isNonNullButUnphased(compSampGt) || isNonNullButUnphased(evalSampGt)) + samplePrevGenotypes.put(samp, null); + } + else { // Both comp and eval have a non-null Genotype at this site: + AllelePair compAllelePair = new AllelePair(compSampGt); + AllelePair evalAllelePair = new AllelePair(evalSampGt); + + boolean breakPhasing = false; + if (compSampGt.isHet() != evalSampGt.isHet() || compSampGt.isHom() != evalSampGt.isHom()) + breakPhasing = true; // since they are not both het or both hom + else { // both are het, or both are hom: + boolean topMatchesTopAndBottomMatchesBottom = (topMatchesTop(compAllelePair, evalAllelePair) && bottomMatchesBottom(compAllelePair, evalAllelePair)); + boolean topMatchesBottomAndBottomMatchesTop = (topMatchesBottom(compAllelePair, evalAllelePair) && bottomMatchesTop(compAllelePair, evalAllelePair)); + if (!topMatchesTopAndBottomMatchesBottom && !topMatchesBottomAndBottomMatchesTop) + breakPhasing = true; // since the 2 VCFs have different diploid genotypes for this sample + } + + if (breakPhasing) { + samplePrevGenotypes.put(samp, null); // nothing to do for this site, AND must remove any history for the future + } + else if (compSampGt.isHet() && evalSampGt.isHet()) { + /* comp and eval have the HET same Genotype at this site: + [Note that if both are hom, then nothing is done here, but the het history IS preserved]. + */ + CompEvalGenotypes prevCompAndEval = samplePrevGenotypes.get(samp); + if (prevCompAndEval != null && !prevCompAndEval.getLocus().onSameContig(curLocus)) // exclude curLocus if it is "phased" relative to a different chromosome + prevCompAndEval = null; + + // Replace the previous hets with the current hets: + samplePrevGenotypes.put(samp, curLocus, compSampGt, evalSampGt); + + if (prevCompAndEval != null) { + GenomeLoc prevLocus = prevCompAndEval.getLocus(); + logger.debug("Potentially phaseable het locus: " + curLocus + " [relative to previous het locus: " + prevLocus + "]"); + PhaseStats ps = samplePhasingStatistics.ensureSampleStats(samp); + + boolean compSampIsPhased = genotypesArePhasedAboveThreshold(compSampGt); + boolean evalSampIsPhased = genotypesArePhasedAboveThreshold(evalSampGt); + if (compSampIsPhased || evalSampIsPhased) { + if (!evalSampIsPhased) { + ps.onlyCompPhased++; + interesting.addReason("ONLY_COMP", samp, group, prevLocus, ""); + } + else if (!compSampIsPhased) { + ps.onlyEvalPhased++; + interesting.addReason("ONLY_EVAL", samp, group, prevLocus, ""); + } + else { // both comp and eval are phased: + AllelePair prevCompAllelePair = new AllelePair(prevCompAndEval.getCompGenotpye()); + AllelePair prevEvalAllelePair = new AllelePair(prevCompAndEval.getEvalGenotype()); + + // Sufficient to check only the top of comp, since we ensured that comp and eval have the same diploid genotypes for this sample: + boolean topsMatch = (topMatchesTop(prevCompAllelePair, prevEvalAllelePair) && topMatchesTop(compAllelePair, evalAllelePair)); + boolean topMatchesBottom = (topMatchesBottom(prevCompAllelePair, prevEvalAllelePair) && topMatchesBottom(compAllelePair, evalAllelePair)); + + if (topsMatch || topMatchesBottom) { + ps.phasesAgree++; + + Double compPQ = getPQ(compSampGt); + Double evalPQ = getPQ(evalSampGt); + if (compPQ != null && evalPQ != null && MathUtils.compareDoubles(compPQ, evalPQ) != 0) + interesting.addReason("PQ_CHANGE", samp, group, prevLocus, compPQ + " -> " + evalPQ); + } + else { + ps.phasesDisagree++; + logger.debug("SWITCHED locus: " + curLocus); + interesting.addReason("SWITCH", samp, group, prevLocus, toString(prevCompAllelePair, compAllelePair) + " -> " + toString(prevEvalAllelePair, evalAllelePair)); + } + } + } + else { + ps.neitherPhased++; + } + } + } + } + } + logger.debug("\n" + samplePhasingStatistics + "\n"); + + return interesting.toString(); + } + + public static boolean isRelevantToPhasing(VariantContext vc) { + return (vc != null && !vc.isFiltered()); + } + + public boolean isNonNullButUnphased(Genotype gt) { + return (gt != null && !genotypesArePhasedAboveThreshold(gt)); + } + + public boolean genotypesArePhasedAboveThreshold(Genotype gt) { + if (gt.isHom()) // Can always consider a hom site to be phased to its predecessor, since its successor will only be phased to it if it's hom or "truly" phased + return true; + + if (!gt.isPhased()) + return false; + + Double pq = getPQ(gt); + return (pq == null || pq >= minPhaseQuality); + } + + public static Double getPQ(Genotype gt) { + return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); + } + + public boolean topMatchesTop(AllelePair b1, AllelePair b2) { + return b1.getTopAllele().equals(b2.getTopAllele()); + } + + public boolean topMatchesBottom(AllelePair b1, AllelePair b2) { + return b1.getTopAllele().equals(b2.getBottomAllele()); + } + + public boolean bottomMatchesTop(AllelePair b1, AllelePair b2) { + return topMatchesBottom(b2, b1); + } + + public boolean bottomMatchesBottom(AllelePair b1, AllelePair b2) { + return b1.getBottomAllele().equals(b2.getBottomAllele()); + } + + public String toString(AllelePair prev, AllelePair cur) { + return prev.getTopAllele().getBaseString() + "+" + cur.getTopAllele().getBaseString() + "|" + prev.getBottomAllele().getBaseString() + "+" + cur.getBottomAllele().getBaseString(); + } + + public void finalizeEvaluation() { + } + + private static class Reasons { + private StringBuilder sb; + + public Reasons() { + sb = new StringBuilder(); + } + + public void addReason(String category, String sample, VariantEvalWalker.EvaluationContext evalGroup, GenomeLoc prevLoc, String reason) { + sb.append(category + "(" + sample + ", previous: " + prevLoc + " [" + evalGroup.compTrackName + ", " + evalGroup.evalTrackName + "]): " + reason + ";"); + } + + public String toString() { + if (sb.length() == 0) + return null; + + return "reasons=" + sb.toString(); + } + } +} + + + +class CompEvalGenotypes { + private GenomeLoc loc; + private Genotype compGt; + private Genotype evalGt; + + public CompEvalGenotypes(GenomeLoc loc, Genotype compGt, Genotype evalGt) { + this.loc = loc; + this.compGt = compGt; + this.evalGt = evalGt; + } + + public GenomeLoc getLocus() { + return loc; + } + + public Genotype getCompGenotpye() { + return compGt; + } + public Genotype getEvalGenotype() { + return evalGt; + } + + public void setCompGenotype(Genotype compGt) { + this.compGt = compGt; + } + + public void setEvalGenotype(Genotype evalGt) { + this.evalGt = evalGt; + } +} + +class SamplePreviousGenotypes { + private HashMap sampleGenotypes = null; + + public SamplePreviousGenotypes() { + this.sampleGenotypes = new HashMap(); + } + + public CompEvalGenotypes get(String sample) { + return sampleGenotypes.get(sample); + } + + public void put(String sample, CompEvalGenotypes compEvalGts) { + sampleGenotypes.put(sample, compEvalGts); + } + + public void put(String sample, GenomeLoc locus, Genotype compGt, Genotype evalGt) { + sampleGenotypes.put(sample, new CompEvalGenotypes(locus, compGt, evalGt)); + } +} + +class PhaseStats { + public int neitherPhased; + public int onlyCompPhased; + public int onlyEvalPhased; + public int phasesAgree; + public int phasesDisagree; + + public PhaseStats() { + this.neitherPhased = 0; + this.onlyCompPhased = 0; + this.onlyEvalPhased = 0; + this.phasesAgree = 0; + this.phasesDisagree = 0; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("Neither phased: " + neitherPhased + "\tOnly Comp: " + onlyCompPhased + "\tOnly Eval: " + onlyEvalPhased + "\tSame phase: " + phasesAgree + "\tOpposite phase: " + phasesDisagree); + return sb.toString(); + } + + public static String[] getFieldNamesArray() { + return new String[]{"total", "neither", "only_comp", "only_eval", "both", "match", "switch", "switch_rate"}; + } + + public Object getField(int index) { + switch (index) { + case (0): + return (neitherPhased + onlyCompPhased + onlyEvalPhased + phasesAgree + phasesDisagree); + case (1): + return neitherPhased; + case (2): + return onlyCompPhased; + case (3): + return onlyEvalPhased; + case (4): + return (phasesAgree + phasesDisagree); + case (5): + return phasesAgree; + case (6): + return phasesDisagree; + case (7): + return ((phasesDisagree == 0) ? 0 : ((double) phasesDisagree) / (phasesAgree + phasesDisagree)); + default: + return -1; + } + } +} + +/** + * a table of sample names to genotype phasing statistics + */ +class SamplePhasingStatistics implements TableType { + private HashMap sampleStats = null; + private double minPhaseQuality; + + public SamplePhasingStatistics(double minPhaseQuality) { + this.sampleStats = new HashMap(); + this.minPhaseQuality = minPhaseQuality; + } + + public PhaseStats ensureSampleStats(String samp) { + PhaseStats ps = sampleStats.get(samp); + if (ps == null) { + ps = new PhaseStats(); + sampleStats.put(samp, ps); + } + return ps; + } + + /** + * @return one row per sample + */ + public String[] getRowKeys() { + return sampleStats.keySet().toArray(new String[sampleStats.size()]); + } + + /** + * get the column keys + * + * @return a list of objects, in this case strings, that are the column names + */ + public String[] getColumnKeys() { + return PhaseStats.getFieldNamesArray(); + } + + public Object getCell(int x, int y) { + String[] rowKeys = getRowKeys(); + PhaseStats ps = sampleStats.get(rowKeys[x]); + return ps.getField(y); + } + + public String getName() { + return "Sample Phasing Statistics (for PQ >= " + minPhaseQuality + ")"; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + for (Map.Entry sampPhaseStatsEnt : sampleStats.entrySet()) { + String sample = sampPhaseStatsEnt.getKey(); + PhaseStats ps = sampPhaseStatsEnt.getValue(); + + sb.append(sample + "\t" + ps); + } + return sb.toString(); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelLengthHistogram.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelLengthHistogram.java new file mode 100755 index 000000000..7c3f55825 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelLengthHistogram.java @@ -0,0 +1,112 @@ +package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators; + +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.Analysis; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.DataPoint; +import org.broadinstitute.sting.utils.report.utils.TableType; + +/** + * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * + * @Author chartl + * @Date May 26, 2010 + */ +@Analysis(name = "Indel length histograms", description = "Shows the distrbution of insertion/deletion event lengths (negative for deletion, positive for insertion)") +public class IndelLengthHistogram extends VariantEvaluator { + private static final int SIZE_LIMIT = 50; + @DataPoint(description="Histogram of indel lengths") + IndelHistogram indelHistogram = new IndelHistogram(SIZE_LIMIT); + + /* + * Indel length histogram table object + */ + + static class IndelHistogram implements TableType { + private Integer[] colKeys; + private int limit; + private String[] rowKeys = {"EventLength"}; + private Integer[] indelHistogram; + + public IndelHistogram(int limit) { + colKeys = initColKeys(limit); + indelHistogram = initHistogram(limit); + this.limit = limit; + } + + public Object[] getColumnKeys() { + return colKeys; + } + + public Object[] getRowKeys() { + return rowKeys; + } + + public Object getCell(int row, int col) { + return indelHistogram[col]; + } + + private Integer[] initColKeys(int size) { + Integer[] cK = new Integer[size*2+1]; + int index = 0; + for ( int i = -size; i <= size; i ++ ) { + cK[index] = i; + index++; + } + + return cK; + } + + private Integer[] initHistogram(int size) { + Integer[] hist = new Integer[size*2+1]; + for ( int i = 0; i < 2*size+1; i ++ ) { + hist[i] = 0; + } + + return hist; + } + + public String getName() { return "indelHistTable"; } + + public void update(int eLength) { + indelHistogram[len2index(eLength)]++; + } + + private int len2index(int len) { + if ( len > limit || len < -limit ) { + throw new ReviewedStingException("Indel length exceeds limit of "+limit+" please increase indel limit size"); + } + return len + limit; + } + } + + public boolean enabled() { return false; } + + public String getName() { return "IndelLengthHistogram"; } + + public int getComparisonOrder() { return 1; } // need only the evals + + public String update1(VariantContext vc1, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + //System.out.println("Update1 called"); + if ( ! vc1.isBiallelic() && vc1.isIndel() ) { + //veWalker.getLogger().warn("[IndelLengthHistogram] Non-biallelic indel at "+ref.getLocus()+" ignored."); + return vc1.toString(); // biallelic sites are output + } + + if ( vc1.isIndel() ) { + //System.out.println("Is indel"); + if ( vc1.isInsertion() ) { + indelHistogram.update(vc1.getAlternateAllele(0).length()); + } else if ( vc1.isDeletion() ) { + indelHistogram.update(-vc1.getReference().length()); + } else { + throw new ReviewedStingException("Indel type that is not insertion or deletion."); + } + } + + return null; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelMetricsByAC.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelMetricsByAC.java new file mode 100755 index 000000000..b98413af3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelMetricsByAC.java @@ -0,0 +1,222 @@ +package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators; + +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.NewVariantEvalWalker; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.Analysis; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.DataPoint; +import org.broadinstitute.sting.utils.report.utils.TableType; + +import java.util.ArrayList; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author delangel + * @since Apr 11, 2010 + */ + +@Analysis(name = "Indel Metrics by allele count", description = "Shows various stats binned by allele count") +public class IndelMetricsByAC extends VariantEvaluator { + // a mapping from quality score histogram bin to Ti/Tv ratio + @DataPoint(description = "Indel Metrics by allele count") + IndelMetricsByAc metrics = null; + + int numSamples = 0; + + public void initialize(NewVariantEvalWalker walker) { + numSamples = walker.getNumSamples(); + } + + //@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count") + //AlleleCountStats alleleCountStats = null; + private static final int INDEL_SIZE_LIMIT = 100; + private static final int NUM_SCALAR_COLUMNS = 6; + static int len2Index(int ind) { + return ind+INDEL_SIZE_LIMIT+NUM_SCALAR_COLUMNS; + } + + static int index2len(int ind) { + return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS; + } + + protected final static String[] METRIC_COLUMNS; + static { + METRIC_COLUMNS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1]; + METRIC_COLUMNS[0] = "AC"; + METRIC_COLUMNS[1] = "nIns"; + METRIC_COLUMNS[2] = "nDels"; + METRIC_COLUMNS[3] = "n"; + METRIC_COLUMNS[4] = "nComplex"; + METRIC_COLUMNS[5] = "nLong"; + + for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++) + METRIC_COLUMNS[k] = "indel_size_len"+Integer.valueOf(index2len(k)); + } + + class IndelMetricsAtAC { + public int ac = -1, nIns =0, nDel = 0, nComplex = 0, nLong; + public int sizeCount[] = new int[2*INDEL_SIZE_LIMIT+1]; + + public IndelMetricsAtAC(int ac) { this.ac = ac; } + + public void update(VariantContext eval) { + int eventLength = 0; + if ( eval.isInsertion() ) { + eventLength = eval.getAlternateAllele(0).length(); + nIns++; + } else if ( eval.isDeletion() ) { + eventLength = -eval.getReference().length(); + nDel++; + } + else { + nComplex++; + } + if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) + sizeCount[len2Index(eventLength)]++; + else + nLong++; + + + + } + + // corresponding to METRIC_COLUMNS + public String getColumn(int i) { + if (i >= NUM_SCALAR_COLUMNS && i <=NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT) + return String.valueOf(sizeCount[i-NUM_SCALAR_COLUMNS]); + + switch (i) { + case 0: return String.valueOf(ac); + case 1: return String.valueOf(nIns); + case 2: return String.valueOf(nDel); + case 3: return String.valueOf(nIns + nDel); + case 4: return String.valueOf(nComplex); + case 5: return String.valueOf(nLong); + + default: + throw new ReviewedStingException("Unexpected column " + i); + } + } + } + + class IndelMetricsByAc implements TableType { + ArrayList metrics = new ArrayList(); + Object[] rows = null; + + public IndelMetricsByAc( int nchromosomes ) { + rows = new Object[nchromosomes+1]; + metrics = new ArrayList(nchromosomes+1); + for ( int i = 0; i < nchromosomes + 1; i++ ) { + metrics.add(new IndelMetricsAtAC(i)); + rows[i] = "ac" + i; + } + } + + public Object[] getRowKeys() { + return rows; + } + + public Object[] getColumnKeys() { + return METRIC_COLUMNS; + } + + public String getName() { + return "IndelMetricsByAc"; + } + + // + public String getCell(int ac, int y) { + return metrics.get(ac).getColumn(y); + } + + public String toString() { + String returnString = ""; + return returnString; + } + + public void incrValue( VariantContext eval ) { + int ac = -1; + + if ( eval.hasGenotypes() ) + ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); + else if ( eval.hasAttribute("AC") ) { + ac = Integer.valueOf(eval.getAttributeAsString("AC")); + } + + if ( ac != -1 ) + metrics.get(ac).update(eval); + } + } + + //public IndelMetricsByAC(VariantEvalWalker parent) { + //super(parent); + // don't do anything + //} + + public String getName() { + return "IndelMetricsByAC"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public boolean enabled() { + return true; + } + + public String toString() { + return getName(); + } + + public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String interesting = null; + + if (eval != null ) { + if ( metrics == null ) { + int nSamples = numSamples; + //int nSamples = 2; + if ( nSamples != -1 ) + metrics = new IndelMetricsByAc(2 * nSamples); + } + + if ( eval.isIndel() && eval.isBiallelic() && + metrics != null ) { + metrics.incrValue(eval); + } + } + + return interesting; // This module doesn't capture any interesting sites, so return null + } + + //public void finalizeEvaluation() { + // + //} +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelStatistics.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelStatistics.java new file mode 100755 index 000000000..59371aa81 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/IndelStatistics.java @@ -0,0 +1,514 @@ +package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators; + +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.NewVariantEvalWalker; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.Analysis; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.DataPoint; +import org.broadinstitute.sting.utils.report.utils.TableType; + +import java.util.Arrays; +import java.util.HashMap; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +@Analysis(name = "IndelStatistics", description = "Shows various indel metrics and statistics") +public class IndelStatistics extends VariantEvaluator { + @DataPoint(description = "Indel Statistics") + IndelStats indelStats = null; + + @DataPoint(description = "Indel Classification") + IndelClasses indelClasses = null; + + int numSamples = 0; + + public void initialize(NewVariantEvalWalker walker) { + numSamples = walker.getNumSamples(); + } + + private static final int INDEL_SIZE_LIMIT = 100; + private static final int NUM_SCALAR_COLUMNS = 10; + + static int len2Index(int ind) { + return ind+INDEL_SIZE_LIMIT+NUM_SCALAR_COLUMNS; + } + + static int index2len(int ind) { + return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS; + } + + static class IndelStats implements TableType { + protected final static String ALL_SAMPLES_KEY = "allSamples"; + protected final static String[] COLUMN_KEYS; + static { + COLUMN_KEYS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1]; + COLUMN_KEYS[0] = "heterozygosity"; + COLUMN_KEYS[1] = "number_of_insertions"; + COLUMN_KEYS[2] = "number_of_deletions"; + COLUMN_KEYS[3] = "number_het_insertions"; + COLUMN_KEYS[4] = "number_homozygous_insertions"; + COLUMN_KEYS[5] = "number_het_deletions"; + COLUMN_KEYS[6] = "number_homozygous_deletions"; + COLUMN_KEYS[7] = "number of homozygous reference sites"; + COLUMN_KEYS[8] = "number of complex events"; + COLUMN_KEYS[9] = "number of long indels"; + + for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++) + COLUMN_KEYS[k] = "indel_size_len"+Integer.valueOf(index2len(k)); + } + + // map of sample to statistics + protected final HashMap indelSummary = new HashMap(); + + public IndelStats(final VariantContext vc) { + indelSummary.put(ALL_SAMPLES_KEY, new double[COLUMN_KEYS.length]); + for( final String sample : vc.getGenotypes().keySet() ) { + indelSummary.put(sample, new double[COLUMN_KEYS.length]); + } + } + + /** + * + * @return one row per sample + */ + public Object[] getRowKeys() { + return indelSummary.keySet().toArray(new String[indelSummary.size()]); + } + public Object getCell(int x, int y) { + final Object[] rowKeys = getRowKeys(); + return String.format("%4.2f",indelSummary.get(rowKeys[x])[y]); + } + + /** + * get the column keys + * @return a list of objects, in this case strings, that are the column names + */ + public Object[] getColumnKeys() { + return COLUMN_KEYS; + } + + public String getName() { + return "IndelStats"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public String toString() { + return getName(); + } + + /* + * increment the specified value + */ + public void incrValue(VariantContext vc) { + int eventLength = 0; + boolean isInsertion = false, isDeletion = false; + + if ( vc.isInsertion() ) { + eventLength = vc.getAlternateAllele(0).length(); + indelSummary.get(ALL_SAMPLES_KEY)[1]++; + isInsertion = true; + } else if ( vc.isDeletion() ) { + indelSummary.get(ALL_SAMPLES_KEY)[2]++; + eventLength = -vc.getReference().length(); + isDeletion = true; + } + else { + indelSummary.get(ALL_SAMPLES_KEY)[8]++; + } + + // make sure event doesn't overstep array boundaries + if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) + indelSummary.get(ALL_SAMPLES_KEY)[len2Index(eventLength)]++; + else + indelSummary.get(ALL_SAMPLES_KEY)[9]++; + + + for( final String sample : vc.getGenotypes().keySet() ) { + if ( indelSummary.containsKey(sample) ) { + Genotype g = vc.getGenotype(sample); + boolean isVariant = (g.isCalled() && !g.isHomRef()); + if (isVariant) { + // update ins/del count + if (isInsertion) { + indelSummary.get(sample)[1]++; + } + else if (isDeletion) + indelSummary.get(sample)[2]++; + else + indelSummary.get(sample)[8]++; + + // update histogram + if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) + indelSummary.get(sample)[len2Index(eventLength)]++; + else + indelSummary.get(sample)[9]++; + + if (g.isHet()) + if (isInsertion) + indelSummary.get(sample)[3]++; + else + indelSummary.get(sample)[5]++; + else + if (isInsertion) + indelSummary.get(sample)[4]++; + else + indelSummary.get(sample)[6]++; + + + + } + else + indelSummary.get(sample)[7]++; + } + } + + + } + } + + static class IndelClasses implements TableType { + protected final static String ALL_SAMPLES_KEY = "allSamples"; + protected final static String[] COLUMN_KEYS; + + + + static { + COLUMN_KEYS= new String[41]; + COLUMN_KEYS[0] = "Novel_A"; + COLUMN_KEYS[1] = "Novel_C"; + COLUMN_KEYS[2] = "Novel_G"; + COLUMN_KEYS[3] = "Novel_T"; + COLUMN_KEYS[4] = "NOVEL_1"; + COLUMN_KEYS[5] = "NOVEL_2"; + COLUMN_KEYS[6] = "NOVEL_3"; + COLUMN_KEYS[7] = "NOVEL_4"; + COLUMN_KEYS[8] = "NOVEL_5"; + COLUMN_KEYS[9] = "NOVEL_6"; + COLUMN_KEYS[10] = "NOVEL_7"; + COLUMN_KEYS[11] = "NOVEL_8"; + COLUMN_KEYS[12] = "NOVEL_9"; + COLUMN_KEYS[13] = "NOVEL_10orMore"; + COLUMN_KEYS[14] = "RepeatExpansion_A"; + COLUMN_KEYS[15] = "RepeatExpansion_C"; + COLUMN_KEYS[16] = "RepeatExpansion_G"; + COLUMN_KEYS[17] = "RepeatExpansion_T"; + COLUMN_KEYS[18] = "RepeatExpansion_AC"; + COLUMN_KEYS[19] = "RepeatExpansion_AG"; + COLUMN_KEYS[20] = "RepeatExpansion_AT"; + COLUMN_KEYS[21] = "RepeatExpansion_CA"; + COLUMN_KEYS[22] = "RepeatExpansion_CG"; + COLUMN_KEYS[23] = "RepeatExpansion_CT"; + COLUMN_KEYS[24] = "RepeatExpansion_GA"; + COLUMN_KEYS[25] = "RepeatExpansion_GC"; + COLUMN_KEYS[26] = "RepeatExpansion_GT"; + COLUMN_KEYS[27] = "RepeatExpansion_TA"; + COLUMN_KEYS[28] = "RepeatExpansion_TC"; + COLUMN_KEYS[29] = "RepeatExpansion_TG"; + COLUMN_KEYS[30] = "RepeatExpansion_1"; + COLUMN_KEYS[31] = "RepeatExpansion_2"; + COLUMN_KEYS[32] = "RepeatExpansion_3"; + COLUMN_KEYS[33] = "RepeatExpansion_4"; + COLUMN_KEYS[34] = "RepeatExpansion_5"; + COLUMN_KEYS[35] = "RepeatExpansion_6"; + COLUMN_KEYS[36] = "RepeatExpansion_7"; + COLUMN_KEYS[37] = "RepeatExpansion_8"; + COLUMN_KEYS[38] = "RepeatExpansion_9"; + COLUMN_KEYS[39] = "RepeatExpansion_10orMore"; + COLUMN_KEYS[40] = "Other"; + + } + + private static final int START_IND_NOVEL = 4; + private static final int STOP_IND_NOVEL = 13; + private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29; + private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39; + private static final int IND_FOR_OTHER_EVENT = 40; + private static final int START_IND_NOVEL_PER_BASE = 0; + private static final int STOP_IND_NOVEL_PER_BASE = 3; + + + // map of sample to statistics + protected final HashMap indelClassSummary = new HashMap(); + + public IndelClasses(final VariantContext vc) { + indelClassSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]); + for( final String sample : vc.getGenotypes().keySet() ) { + indelClassSummary.put(sample, new int[COLUMN_KEYS.length]); + } + } + + /** + * + * @return one row per sample + */ + public Object[] getRowKeys() { + return indelClassSummary.keySet().toArray(new String[indelClassSummary.size()]); + } + public Object getCell(int x, int y) { + final Object[] rowKeys = getRowKeys(); + return String.format("%d",indelClassSummary.get(rowKeys[x])[y]); + } + + /** + * get the column keys + * @return a list of objects, in this case strings, that are the column names + */ + public Object[] getColumnKeys() { + return COLUMN_KEYS; + } + + public String getName() { + return "IndelClasses"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public String toString() { + return getName(); + } + + private void incrementSampleStat(VariantContext vc, int index) { + indelClassSummary.get(ALL_SAMPLES_KEY)[index]++; + for( final String sample : vc.getGenotypes().keySet() ) { + if ( indelClassSummary.containsKey(sample) ) { + Genotype g = vc.getGenotype(sample); + boolean isVariant = (g.isCalled() && !g.isHomRef()); + if (isVariant) + // update count + indelClassSummary.get(sample)[index]++; + + } + } + + } + /* + * increment the specified value + */ + private String findMinimalEvent(String eventString) { + + // for each length up to given string length, see if event string is a repetition of units of size N + boolean foundSubstring = false; + String minEvent = eventString; + for (int k=1; k < eventString.length(); k++) { + if (eventString.length() % k > 0) + continue; + String str = eventString.substring(0,k); + // now see if event string is a repetition of str + int numReps = eventString.length() / k; + String r = ""; + for (int j=0; j < numReps; j++) + r = r.concat(str); + + if (r.matches(eventString)) { + foundSubstring = true; + minEvent = str; + break; + } + + } + return minEvent; + } + public void incrValue(VariantContext vc, ReferenceContext ref) { + int eventLength = 0; + boolean isInsertion = false, isDeletion = false; + String indelAlleleString; + + if ( vc.isInsertion() ) { + isInsertion = true; + indelAlleleString = vc.getAlternateAllele(0).getDisplayString(); + } else if ( vc.isDeletion() ) { + isDeletion = true; + indelAlleleString = vc.getReference().getDisplayString(); + } + else { + incrementSampleStat(vc, IND_FOR_OTHER_EVENT); + + return; + } + + byte[] refBases = ref.getBases(); + + indelAlleleString = findMinimalEvent(indelAlleleString); + eventLength = indelAlleleString.length(); + + // See first if indel is a repetition of bases before current + int indStart = refBases.length/2-eventLength+1; + + boolean done = false; + int numRepetitions = 0; + while (!done) { + if (indStart < 0) + done = true; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart - eventLength; + } + else + done = true; + + } + } + + // now do it forward + done = false; + indStart = refBases.length/2+1; + while (!done) { + if (indStart + eventLength >= refBases.length) + break; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart + eventLength; + } + else + done = true; + + } + } + + if (numRepetitions == 0) { + //unrepeated sequence from surroundings + int ind = START_IND_NOVEL + (eventLength-1); + if (ind > STOP_IND_NOVEL) + ind = STOP_IND_NOVEL; + incrementSampleStat(vc, ind); + + if (eventLength == 1) { + // log single base indels additionally by base + String keyStr = "Novel_" + indelAlleleString; + int k; + for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + // log now event + incrementSampleStat(vc, k); + } + } + else { + int ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (numRepetitions-1); + if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS) + ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS; + incrementSampleStat(vc, ind); + + if (eventLength<=2) { + // for single or dinucleotide indels, we further log the base in which they occurred + String keyStr = "RepeatExpansion_" + indelAlleleString; + int k; + for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + // log now event + incrementSampleStat(vc, k); + } + + } + //g+ + /* + System.out.format("RefBefore: %s\n",new String(refBefore)); + System.out.format("RefAfter: %s\n",new String(refAfter)); + System.out.format("Indel Allele: %s\n", indelAlleleString); + System.out.format("Num Repetitions: %d\n", numRepetitions); + */ + } + + } + + //public IndelStatistics(VariantEvalWalker parent) { + //super(parent); + // don't do anything + //} + + public String getName() { + return "IndelStatistics"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public boolean enabled() { + return true; + } + + public String toString() { + return getName(); + } + + //public String update2(VariantContext eval, VariantContext validation, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantEvalWalker.EvaluationContext group) { + //return null; + //} + + public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + if (eval != null ) { + if ( indelStats == null ) { + int nSamples = numSamples; + + if ( nSamples != -1 ) + indelStats = new IndelStats(eval); + } + if ( indelClasses == null ) { + indelClasses = new IndelClasses(eval); + } + + if ( eval.isIndel() && eval.isBiallelic() ) { + if (indelStats != null ) + indelStats.incrValue(eval); + + if (indelClasses != null) + indelClasses.incrValue(eval, ref); + } + } + + return null; // This module doesn't capture any interesting sites, so return null + } + public String update0(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + return null; + } + public void finalizeEvaluation() { + // + int k=0; + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/MendelianViolationEvaluator.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/MendelianViolationEvaluator.java new file mode 100755 index 000000000..7cb38bc92 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/MendelianViolationEvaluator.java @@ -0,0 +1,186 @@ +package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators; + +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.Analysis; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.DataPoint; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.List; +import java.util.Arrays; +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +/** + * Mendelian violation detection and counting + *

+ * a violation looks like: + * Suppose dad = A/B and mom = C/D + * The child can be [A or B] / [C or D]. + * If the child doesn't match this, the site is a violation + *

+ * Some examples: + *

+ * mom = A/A, dad = C/C + * child can be A/C only + *

+ * mom = A/C, dad = C/C + * child can be A/C or C/C + *

+ * mom = A/C, dad = A/C + * child can be A/A, A/C, C/C + *

+ * The easiest way to do this calculation is to: + *

+ * Get alleles for mom => A/B + * Get alleles for dad => C/D + * Make allowed genotypes for child: A/C, A/D, B/C, B/D + * Check that the child is one of these. + */ +@Analysis(name = "Mendelian Violation Evaluator", description = "Mendelian Violation Evaluator") +public class MendelianViolationEvaluator extends VariantEvaluator { + + @DataPoint(description = "Number of mendelian variants found") + long nVariants; + + @DataPoint(description = "Number of mendelian violations found") + long nViolations; + + @DataPoint(description = "number of child hom ref calls where the parent was hom variant") + long KidHomRef_ParentHomVar; + @DataPoint(description = "number of child het calls where the parent was hom ref") + long KidHet_ParentsHomRef; + @DataPoint(description = "number of child het calls where the parent was hom variant") + long KidHet_ParentsHomVar; + @DataPoint(description = "number of child hom variant calls where the parent was hom ref") + long KidHomVar_ParentHomRef; + + TrioStructure trio; + + private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); + + public static class TrioStructure { + public String mom, dad, child; + } + + public static TrioStructure parseTrioDescription(String family) { + Matcher m = FAMILY_PATTERN.matcher(family); + if (m.matches()) { + TrioStructure trio = new TrioStructure(); + //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE); + trio.mom = m.group(1); + trio.dad = m.group(2); + trio.child = m.group(3); + return trio; + } else { + throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); + } + } + + // todo: fix + + //public MendelianViolationEvaluator(VariantEvalWalker parent) { + //super(parent); + + //if (enabled()) { + //trio = parseTrioDescription(parent.FAMILY_STRUCTURE); + //parent.getLogger().debug(String.format("Found a family pattern: %s mom=%s dad=%s child=%s", + //parent.FAMILY_STRUCTURE, trio.mom, trio.dad, trio.child)); + //} + //} + + public boolean enabled() { + //return getVEWalker().FAMILY_STRUCTURE != null; + return false; + } + + private double getQThreshold() { + //return getVEWalker().MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10; // we aren't 10x scaled in the GATK a la phred + return 0.0; + } + + public String getName() { + return "mendelian_violations"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (vc.isBiallelic() && vc.hasGenotypes()) { // todo -- currently limited to biallelic loci + Genotype momG = vc.getGenotype(trio.mom); + Genotype dadG = vc.getGenotype(trio.dad); + Genotype childG = vc.getGenotype(trio.child); + + if (includeGenotype(momG) && includeGenotype(dadG) && includeGenotype(childG)) { + nVariants++; + + if (momG == null || dadG == null || childG == null) + throw new IllegalArgumentException(String.format("VariantContext didn't contain genotypes for expected trio members: mom=%s dad=%s child=%s", trio.mom, trio.dad, trio.child)); + + // all genotypes are good, so let's see if child is a violation + + if (isViolation(vc, momG, dadG, childG)) { + nViolations++; + + String label; + if (childG.isHomRef() && (momG.isHomVar() || dadG.isHomVar())) { + label = "KidHomRef_ParentHomVar"; + KidHomRef_ParentHomVar++; + } else if (childG.isHet() && (momG.isHomRef() && dadG.isHomRef())) { + label = "KidHet_ParentsHomRef"; + KidHet_ParentsHomRef++; + } else if (childG.isHet() && (momG.isHomVar() && dadG.isHomVar())) { + label = "KidHet_ParentsHomVar"; + KidHet_ParentsHomVar++; + } else if (childG.isHomVar() && (momG.isHomRef() || dadG.isHomRef())) { + label = "KidHomVar_ParentHomRef"; + KidHomVar_ParentHomRef++; + } else { + throw new ReviewedStingException("BUG: unexpected child genotype class " + childG); + } + + return "MendelViolation=" + label; + } + } + } + + return null; // we don't capture any intersting sites + } + + private boolean includeGenotype(Genotype g) { + return g.getNegLog10PError() > getQThreshold() && g.isCalled(); + } + + public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG) { + return isViolation(vc, momG.getAlleles(), dadG.getAlleles(), childG.getAlleles()); + } + + public static boolean isViolation(VariantContext vc, TrioStructure trio ) { + return isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child) ); + } + + public static boolean isViolation(VariantContext vc, List momA, List dadA, List childA) { + //VariantContext momVC = vc.subContextFromGenotypes(momG); + //VariantContext dadVC = vc.subContextFromGenotypes(dadG); + int i = 0; + Genotype childG = new Genotype("kidG", childA); + for (Allele momAllele : momA) { + for (Allele dadAllele : dadA) { + if (momAllele.isCalled() && dadAllele.isCalled()) { + Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele)); + if (childG.sameGenotype(possibleChild, false)) { + return false; + } + } + } + } + + return true; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/PhaseStats.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/PhaseStats.java index 7e4419ecd..50addd946 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/PhaseStats.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/PhaseStats.java @@ -4,14 +4,14 @@ package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluato * Created by IntelliJ IDEA. User: kiran Date: Nov 29, 2010 Time: 3:25:59 PM To change this template use File | Settings * | File Templates. */ -class PhaseStats { +class NewPhaseStats { public int neitherPhased; public int onlyCompPhased; public int onlyEvalPhased; public int phasesAgree; public int phasesDisagree; - public PhaseStats() { + public NewPhaseStats() { this.neitherPhased = 0; this.onlyCompPhased = 0; this.onlyEvalPhased = 0; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/PrintMissingComp.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/PrintMissingComp.java new file mode 100755 index 000000000..616b1a30c --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/PrintMissingComp.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators; + +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.Analysis; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.DataPoint; + +@Analysis(name = "PrintMissingComp", description = "the overlap between eval and comp sites") +public class PrintMissingComp extends VariantEvaluator { + @DataPoint(description = "number of eval sites outside of comp sites") + long nMissing = 0; + + //public PrintMissingComp(VariantEvalWalker parent) { + // super(parent); + //} + + public String getName() { + return "PrintMissingComp"; + } + + public int getComparisonOrder() { + return 2; // we need to see each eval track and each comp track + } + + public boolean enabled() { + return true; + } + + + public String update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + boolean compIsGood = comp != null && comp.isNotFiltered() && comp.isSNP(); + boolean evalIsGood = eval != null && eval.isSNP(); + + if ( compIsGood & ! evalIsGood ) { + nMissing++; + return "MissingFrom" + comp.getSource(); + } else { + return null; + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/SamplePreviousGenotypes.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/SamplePreviousGenotypes.java index 0d5604efe..100f374b4 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/SamplePreviousGenotypes.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/SamplePreviousGenotypes.java @@ -9,10 +9,10 @@ import java.util.HashMap; * Created by IntelliJ IDEA. User: kiran Date: Nov 29, 2010 Time: 3:25:59 PM To change this template use File | Settings * | File Templates. */ -class SamplePreviousGenotypes { +class NewSamplePreviousGenotypes { private HashMap sampleGenotypes = null; - public SamplePreviousGenotypes() { + public NewSamplePreviousGenotypes() { this.sampleGenotypes = new HashMap(); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/SimpleMetricsByAC.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/SimpleMetricsByAC.java new file mode 100755 index 000000000..e1c6fefad --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/SimpleMetricsByAC.java @@ -0,0 +1,178 @@ +package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators; + +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.NewVariantEvalWalker; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.stratifications.Sample; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.Analysis; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.DataPoint; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.util.StateKey; +import org.broadinstitute.sting.utils.report.utils.TableType; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; + +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author depristo + * @since Apr 11, 2010 + */ + +@Analysis(name = "Quality Metrics by allele count", description = "Shows various stats binned by allele count") +public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval { + // a mapping from quality score histogram bin to Ti/Tv ratio + @DataPoint(description = "TiTv by allele count") + MetricsByAc metrics = null; + + private final static Object[] METRIC_COLUMNS = {"AC", "nTi", "nTv", "n", "TiTv"}; + private int numSamples; + + class MetricsAtAC { + public int ac = -1, nTi = 0, nTv = 0; + + public MetricsAtAC(int ac) { this.ac = ac; } + + public void update(VariantContext eval) { + if ( VariantContextUtils.isTransition(eval) ) + nTi++; + else + nTv++; + } + + // corresponding to METRIC_COLUMNS + public String getColumn(int i) { + switch (i) { + case 0: return String.valueOf(ac); + case 1: return String.valueOf(nTi); + case 2: return String.valueOf(nTv); + case 3: return String.valueOf(nTi + nTv); + case 4: return String.valueOf(ratio(nTi, nTv)); + default: + throw new ReviewedStingException("Unexpected column " + i); + } + } + } + + class MetricsByAc implements TableType { + ArrayList metrics = new ArrayList(); + Object[] rows = null; + + public MetricsByAc( int nchromosomes ) { + rows = new Object[nchromosomes+1]; + metrics = new ArrayList(nchromosomes+1); + for ( int i = 0; i < nchromosomes + 1; i++ ) { + metrics.add(new MetricsAtAC(i)); + rows[i] = "ac" + i; + } + } + + public Object[] getRowKeys() { + return rows; + } + + public Object[] getColumnKeys() { + return METRIC_COLUMNS; + } + + public String getName() { + return "MetricsByAc"; + } + + public String getCell(int ac, int y) { + return metrics.get(ac).getColumn(y); + } + + public String toString() { + return ""; + } + + public void incrValue( VariantContext eval ) { + int ac = -1; + + if ( eval.hasGenotypes() ) + ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); + else if ( eval.hasAttribute("AC") ) { + ac = Integer.valueOf(eval.getAttributeAsString("AC")); + } + + if ( ac != -1 ) + metrics.get(ac).update(eval); + } + } + + public void initialize(NewVariantEvalWalker walker) { + numSamples = walker.getNumSamples(); + metrics = new MetricsByAc(2*numSamples); + } + + public String getName() { + return "SimpleMetricsByAC"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public boolean enabled() { + return true; + } + + public String toString() { + return getName(); + } + + public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String interesting = null; + + if (eval != null ) { + if ( metrics == null ) { + int nSamples = numSamples; + + if ( nSamples != -1 ) + metrics = new MetricsByAc(2 * nSamples); + } + + if ( eval.isSNP() && + eval.isBiallelic() && + metrics != null ) { + metrics.incrValue(eval); + } + } + + return interesting; // This module doesn't capture any interesting sites, so return null + } + + @Override + public boolean stateIsApplicable(StateKey stateKey) { + String className = Sample.class.getSimpleName(); + + return !(stateKey.containsKey(className) && !stateKey.get(className).equalsIgnoreCase("all")); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/VariantEvaluator.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/VariantEvaluator.java index a18547b7b..8593a0d5c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/VariantEvaluator.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/VariantEvaluator.java @@ -4,9 +4,13 @@ import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.NewVariantEvalWalker; import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.util.NewEvaluationContext; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.util.StateKey; public abstract class VariantEvaluator { + public void initialize(NewVariantEvalWalker walker) {} + public abstract boolean enabled(); // Should return the number of VariantContexts expected as inputs to update. Can be 1 or 2 @@ -46,4 +50,8 @@ public abstract class VariantEvaluator { return ((double)num) / (Math.max(denom, 1)); } + public boolean stateIsApplicable(StateKey stateKey) { + return true; + } + } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/VariantQualityScore.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/VariantQualityScore.java index 494b42bfb..e6b53e3b2 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/VariantQualityScore.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/evaluators/VariantQualityScore.java @@ -25,221 +25,234 @@ package org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.Analysis; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.tags.DataPoint; +import org.broadinstitute.sting.utils.report.utils.TableType; + +import java.util.ArrayList; +import java.util.HashMap; + /** * @author rpoplin * @since Apr 6, 2010 */ -//@Analysis(name = "Variant Quality Score", description = "Shows various stats of sets of variants binned by variant quality score") -//public class VariantQualityScore extends VariantEvaluator { -// -// // a mapping from quality score histogram bin to Ti/Tv ratio -// @DataPoint(name="TiTv by Quality", description = "the Ti/Tv ratio broken out by variant quality") -// TiTvStats titvStats; -// -// @DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count") -// AlleleCountStats alleleCountStats = null; -// -// static class TiTvStats implements TableType { -// final static int NUM_BINS = 20; -// final HashMap> qualByIsTransition = new HashMap>(); // A hashMap holds all the qualities until we are able to bin them appropriately -// final long transitionByQuality[] = new long[NUM_BINS]; -// final long transversionByQuality[] = new long[NUM_BINS]; -// final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out -// -// public Object[] getRowKeys() { -// return new String[]{"sample"}; -// } -// -// public Object[] getColumnKeys() { -// final String columnKeys[] = new String[NUM_BINS]; -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// columnKeys[iii] = "titvBin" + iii; -// } -// return columnKeys; -// } -// -// public String getName() { -// return "TiTvStats"; -// } -// -// public String getCell(int x, int y) { -// return String.valueOf(titvByQuality[y]); -// } -// -// public String toString() { -// StringBuffer returnString = new StringBuffer(); -// // output the ti/tv array -// returnString.append("titvByQuality: "); -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// returnString.append(titvByQuality[iii]); -// returnString.append(" "); -// } -// return returnString.toString(); -// } -// -// public void incrValue( final double qual, final boolean isTransition ) { -// final Integer qualKey = Math.round((float) qual); -// final long numTransition = (isTransition ? 1L : 0L); -// final long numTransversion = (isTransition ? 0L : 1L); -// if( qualByIsTransition.containsKey(qualKey) ) { -// Pair transitionPair = qualByIsTransition.get(qualKey); -// transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion); -// qualByIsTransition.put(qualKey, transitionPair); -// } else { -// qualByIsTransition.put(qualKey, new Pair(numTransition,numTransversion)); -// } -// } -// -// public void organizeTiTvTables() { -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// transitionByQuality[iii] = 0L; -// transversionByQuality[iii] = 0L; -// titvByQuality[iii] = 0.0; -// } -// -// int maxQual = 0; -// -// // Calculate the maximum quality score in order to normalize and histogram -// for( final Integer qual : qualByIsTransition.keySet() ) { -// if( qual > maxQual ) { -// maxQual = qual; -// } -// } -// -// final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); -// -// for( final Integer qual : qualByIsTransition.keySet() ) { -// final int index = (int)Math.floor( ((double) qual) / binSize ); -// if( index >= 0 ) { // BUGBUG: why is there overflow here? -// Pair transitionPair = qualByIsTransition.get(qual); -// transitionByQuality[index] += transitionPair.getFirst(); -// transversionByQuality[index] += transitionPair.getSecond(); -// } -// } -// -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio -// titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]); -// } else { -// titvByQuality[iii] = 0.0; -// } -// } -// -// } -// } -// -// class AlleleCountStats implements TableType { -// final HashMap> qualityListMap = new HashMap>(); -// final HashMap qualityMap = new HashMap(); -// -// public Object[] getRowKeys() { -// final int NUM_BINS = qualityListMap.keySet().size(); -// final String rowKeys[] = new String[NUM_BINS]; -// int iii = 0; -// for( final Integer key : qualityListMap.keySet() ) { -// rowKeys[iii] = "AC" + key; -// iii++; -// } -// return rowKeys; -// -// } -// -// public Object[] getColumnKeys() { -// return new String[]{"alleleCount","avgQual"}; -// } -// -// public String getName() { -// return "AlleleCountStats"; -// } -// -// public String getCell(int x, int y) { -// int iii = 0; -// for( final Integer key : qualityListMap.keySet() ) { -// if(iii == x) { -// if(y == 0) { return String.valueOf(key); } -// else { return String.valueOf(qualityMap.get(key)); } -// } -// iii++; -// } -// return null; -// } -// -// public String toString() { -// String returnString = ""; -// // output the quality map -// returnString += "AlleleCountStats: "; -// //for( int iii = 0; iii < NUM_BINS; iii++ ) { -// // returnString += titvByQuality[iii] + " "; -// //} -// return returnString; -// } -// -// public void incrValue( final double qual, final int alleleCount ) { -// ArrayList list = qualityListMap.get(alleleCount); -// if(list==null) { list = new ArrayList(); } -// list.add(qual); -// qualityListMap.put(alleleCount, list); -// } -// -// public void organizeAlleleCountTables() { -// for( final Integer key : qualityListMap.keySet() ) { -// final ArrayList list = qualityListMap.get(key); -// double meanQual = 0.0; -// final double numQuals = (double)list.size(); -// for( Double qual : list ) { -// meanQual += qual / numQuals; -// } -// qualityMap.put(key, meanQual); -// } -// } -// } -// -// public VariantQualityScore(VariantEvalWalker parent) { -// super(parent); -// titvStats = null; -// } -// -// public String getName() { -// return "VariantQualityScore"; -// } -// -// public int getComparisonOrder() { -// return 1; // we only need to see each eval track -// } -// -// public boolean enabled() { -// return true; -// } -// -// public String toString() { -// return getName(); -// } -// -// public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { -// final String interesting = null; -// -// if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) -// if( titvStats == null ) { titvStats = new TiTvStats(); } -// titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); -// -// if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); } -// int alternateAlleleCount = 0; -// for (final Allele a : eval.getAlternateAlleles()) { -// alternateAlleleCount += eval.getChromosomeCount(a); -// } -// alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount); -// } -// -// return interesting; // This module doesn't capture any interesting sites, so return null -// } -// -// public void finalizeEvaluation() { -// if( titvStats != null ) { -// titvStats.organizeTiTvTables(); -// } -// if( alleleCountStats != null ) { -// alleleCountStats.organizeAlleleCountTables(); -// } -// } -//} \ No newline at end of file +@Analysis(name = "Variant Quality Score", description = "Shows various stats of sets of variants binned by variant quality score") +public class VariantQualityScore extends VariantEvaluator { + + // a mapping from quality score histogram bin to Ti/Tv ratio + @DataPoint(description = "the Ti/Tv ratio broken out by variant quality") + TiTvStats titvStats = null; + + @DataPoint(description = "average variant quality for each allele count") + AlleleCountStats alleleCountStats = null; + + static class TiTvStats implements TableType { + final static int NUM_BINS = 20; + final HashMap> qualByIsTransition = new HashMap>(); // A hashMap holds all the qualities until we are able to bin them appropriately + final long transitionByQuality[] = new long[NUM_BINS]; + final long transversionByQuality[] = new long[NUM_BINS]; + final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out + + public Object[] getRowKeys() { + return new String[]{"sample"}; + } + + public Object[] getColumnKeys() { + final String columnKeys[] = new String[NUM_BINS]; + for( int iii = 0; iii < NUM_BINS; iii++ ) { + columnKeys[iii] = "titvBin" + iii; + } + return columnKeys; + } + + public String getName() { + return "TiTvStats"; + } + + public String getCell(int x, int y) { + return String.valueOf(titvByQuality[y]); + } + + public String toString() { + StringBuffer returnString = new StringBuffer(); + // output the ti/tv array + returnString.append("titvByQuality: "); + for( int iii = 0; iii < NUM_BINS; iii++ ) { + returnString.append(titvByQuality[iii]); + returnString.append(" "); + } + return returnString.toString(); + } + + public void incrValue( final double qual, final boolean isTransition ) { + final Integer qualKey = Math.round((float) qual); + final long numTransition = (isTransition ? 1L : 0L); + final long numTransversion = (isTransition ? 0L : 1L); + if( qualByIsTransition.containsKey(qualKey) ) { + Pair transitionPair = qualByIsTransition.get(qualKey); + transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion); + qualByIsTransition.put(qualKey, transitionPair); + } else { + qualByIsTransition.put(qualKey, new Pair(numTransition,numTransversion)); + } + } + + public void organizeTiTvTables() { + for( int iii = 0; iii < NUM_BINS; iii++ ) { + transitionByQuality[iii] = 0L; + transversionByQuality[iii] = 0L; + titvByQuality[iii] = 0.0; + } + + int maxQual = 0; + + // Calculate the maximum quality score in order to normalize and histogram + for( final Integer qual : qualByIsTransition.keySet() ) { + if( qual > maxQual ) { + maxQual = qual; + } + } + + final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); + + for( final Integer qual : qualByIsTransition.keySet() ) { + final int index = (int)Math.floor( ((double) qual) / binSize ); + if( index >= 0 ) { // BUGBUG: why is there overflow here? + Pair transitionPair = qualByIsTransition.get(qual); + transitionByQuality[index] += transitionPair.getFirst(); + transversionByQuality[index] += transitionPair.getSecond(); + } + } + + for( int iii = 0; iii < NUM_BINS; iii++ ) { + if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio + titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]); + } else { + titvByQuality[iii] = 0.0; + } + } + + } + } + + class AlleleCountStats implements TableType { + final HashMap> qualityListMap = new HashMap>(); + final HashMap qualityMap = new HashMap(); + + public Object[] getRowKeys() { + final int NUM_BINS = qualityListMap.keySet().size(); + final String rowKeys[] = new String[NUM_BINS]; + int iii = 0; + for( final Integer key : qualityListMap.keySet() ) { + rowKeys[iii] = "AC" + key; + iii++; + } + return rowKeys; + + } + + public Object[] getColumnKeys() { + return new String[]{"alleleCount","avgQual"}; + } + + public String getName() { + return "AlleleCountStats"; + } + + public String getCell(int x, int y) { + int iii = 0; + for( final Integer key : qualityListMap.keySet() ) { + if(iii == x) { + if(y == 0) { return String.valueOf(key); } + else { return String.valueOf(qualityMap.get(key)); } + } + iii++; + } + return null; + } + + public String toString() { + String returnString = ""; + // output the quality map + returnString += "AlleleCountStats: "; + //for( int iii = 0; iii < NUM_BINS; iii++ ) { + // returnString += titvByQuality[iii] + " "; + //} + return returnString; + } + + public void incrValue( final double qual, final int alleleCount ) { + ArrayList list = qualityListMap.get(alleleCount); + if(list==null) { list = new ArrayList(); } + list.add(qual); + qualityListMap.put(alleleCount, list); + } + + public void organizeAlleleCountTables() { + for( final Integer key : qualityListMap.keySet() ) { + final ArrayList list = qualityListMap.get(key); + double meanQual = 0.0; + final double numQuals = (double)list.size(); + for( Double qual : list ) { + meanQual += qual / numQuals; + } + qualityMap.put(key, meanQual); + } + } + } + + //public VariantQualityScore(VariantEvalWalker parent) { + //super(parent); + //} + + public String getName() { + return "VariantQualityScore"; + } + + public int getComparisonOrder() { + return 1; // we only need to see each eval track + } + + public boolean enabled() { + return true; + } + + public String toString() { + return getName(); + } + + public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String interesting = null; + + if( eval != null && eval.isSNP() && eval.isBiallelic() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) + if( titvStats == null ) { titvStats = new TiTvStats(); } + titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); + + if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); } + int alternateAlleleCount = 0; + for (final Allele a : eval.getAlternateAlleles()) { + alternateAlleleCount += eval.getChromosomeCount(a); + } + alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount); + } + + return interesting; // This module doesn't capture any interesting sites, so return null + } + + public void finalizeEvaluation() { + if( titvStats != null ) { + titvStats.organizeTiTvTables(); + } + if( alleleCountStats != null ) { + alleleCountStats.organizeAlleleCountTables(); + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/stratifications/CpG.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/stratifications/CpG.java index a6df31b7b..3763519aa 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/stratifications/CpG.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/stratifications/CpG.java @@ -7,7 +7,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils import java.util.ArrayList; import java.util.Set; -public class CpG extends VariantStratifier { +public class CpG extends VariantStratifier implements StandardStratification { private ArrayList states; @Override diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/stratifications/Filter.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/stratifications/Filter.java index 02b8792d0..3a58b2675 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/stratifications/Filter.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/stratifications/Filter.java @@ -7,7 +7,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils import java.util.ArrayList; import java.util.Set; -public class Filter extends VariantStratifier implements StandardStratification { +public class Filter extends VariantStratifier { // needs to know the variant context private ArrayList states; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/util/NewEvaluationContext.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/util/NewEvaluationContext.java index 2dc8c64b7..11079870e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/util/NewEvaluationContext.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/newvarianteval/util/NewEvaluationContext.java @@ -4,6 +4,7 @@ import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.NewVariantEvalWalker; import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.evaluators.VariantEvaluator; import org.broadinstitute.sting.playground.gatk.walkers.newvarianteval.stratifications.VariantStratifier; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -26,13 +27,17 @@ public class NewEvaluationContext extends HashMap { return value; } - public void addEvaluationClassList(Set> evaluationClasses) { + public void addEvaluationClassList(NewVariantEvalWalker walker, StateKey stateKey, Set> evaluationClasses) { evaluationInstances = new TreeMap(); for ( Class c : evaluationClasses ) { try { VariantEvaluator eval = c.newInstance(); - evaluationInstances.put(c.getSimpleName(), eval); + eval.initialize(walker); + + if (eval.stateIsApplicable(stateKey)) { + evaluationInstances.put(c.getSimpleName(), eval); + } } catch (InstantiationException e) { throw new StingException("Unable to instantiate eval module '" + c.getSimpleName() + "'"); } catch (IllegalAccessException e) {