From 17ff5bb09436b8805657859b787f4f7f8714cc08 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sat, 2 Jul 2011 09:55:35 -0400 Subject: [PATCH] Variant records coming out of the VQSR are now annotated with which input annotation was most divergent from the Gaussian mixture model. This gives a general sense for why each variant was removed from the callset. --- .../ApplyRecalibration.java | 8 +++++++- .../GaussianMixtureModel.java | 13 +++++++++++++ .../VariantDataManager.java | 4 +++- .../variantrecalibration/VariantDatum.java | 1 + .../VariantRecalibrator.java | 2 ++ .../VariantRecalibratorEngine.java | 17 +++++++++++++++++ ...iantRecalibrationWalkersIntegrationTest.java | 4 ++-- 7 files changed, 45 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 9877781d1..02d850211 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -55,7 +55,6 @@ import java.util.*; public class ApplyRecalibration extends RodWalker { - ///////////////////////////// // Inputs ///////////////////////////// @@ -86,6 +85,7 @@ public class ApplyRecalibration extends RodWalker { final private List tranches = new ArrayList(); final private Set inputNames = new HashSet(); final private NestedHashMap lodMap = new NestedHashMap(); + final private NestedHashMap annotationMap = new NestedHashMap(); final private Set ignoreInputFilterSet = new TreeSet(); //--------------------------------------------------------------------------------------------------------------- @@ -124,6 +124,7 @@ public class ApplyRecalibration extends RodWalker { final Set hInfo = new HashSet(); hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); + hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); final TreeSet samples = new TreeSet(); samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)); @@ -149,6 +150,7 @@ public class ApplyRecalibration extends RodWalker { for ( final String line : new XReadLines( RECAL_FILE ) ) { final String[] vals = line.split(","); lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys + annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys } } catch ( FileNotFoundException e ) { throw new UserException.CouldNotReadInputFile(RECAL_FILE, e); @@ -174,11 +176,15 @@ public class ApplyRecalibration extends RodWalker { String filterString = null; final Map attrs = new HashMap(vc.getAttributes()); final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); + final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); if( lod == null ) { throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); } + // Annotate the new record with its VQSLOD and the worst performing annotation attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); + attrs.put(VariantRecalibrator.CULPRIT_KEY, worstAnnotation); + for( int i = tranches.size() - 1; i >= 0; i-- ) { final Tranche tranche = tranches.get(i); if( lod >= tranche.minVQSLod ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index 9ffe7be7a..a09a30145 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -190,6 +190,19 @@ public class GaussianMixtureModel { return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) } + public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) { + if(datum.isNull[iii]) { return null; } + + final Normal normal = new Normal(0.0, 1.0, null); + final double[] pVarInGaussianLog10 = new double[gaussians.size()]; + int gaussianIndex = 0; + for( final MultivariateGaussian gaussian : gaussians ) { + normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) ); + pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) ); + } + return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) + } + public double evaluateDatumMarginalized( final VariantDatum datum ) { int numVals = 0; double sumPVarInGaussian = 0.0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 2fd1326fe..efff5b780 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -254,7 +254,9 @@ public class VariantDataManager { public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { for( final VariantDatum datum : data ) { - RECAL_FILE.println(String.format("%s,%d,%d,%.4f", datum.contig, datum.start, datum.stop, datum.lod)); + RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", + datum.contig, datum.start, datum.stop, datum.lod, + (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"))); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index ac875b645..38e2affb6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -24,6 +24,7 @@ public class VariantDatum implements Comparable { public String contig; public int start; public int stop; + public int worstAnnotation; public MultivariateGaussian assignment; // used in K-means implementation public int compareTo( final VariantDatum other ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index e651b62e0..b5fda4443 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -60,6 +60,7 @@ import java.util.*; public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { public static final String VQS_LOD_KEY = "VQSLOD"; + public static final String CULPRIT_KEY = "culprit"; @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); @@ -232,6 +233,7 @@ public class VariantRecalibrator extends RodWalker randomData = dataManager.getRandomDataForPlotting( 6000 ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java index a0fbc572d..81e4d190c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -57,6 +57,23 @@ public class VariantRecalibratorEngine { } } + public void calculateWorstPerformingAnnotation( final List data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) { + for( final VariantDatum datum : data ) { + int worstAnnotation = -1; + double minProb = Double.MAX_VALUE; + for( int iii = 0; iii < datum.annotations.length; iii++ ) { + final Double goodProbLog10 = goodModel.evaluateDatumInOneDimension(datum, iii); + final Double badProbLog10 = badModel.evaluateDatumInOneDimension(datum, iii); + if( goodProbLog10 != null && badProbLog10 != null ) { + final double prob = goodProbLog10 - badProbLog10; + if(prob < minProb) { minProb = prob; worstAnnotation = iii; } + } + } + datum.worstAnnotation = worstAnnotation; + } + } + + ///////////////////////////// // Private Methods used for generating a GaussianMixtureModel ///////////////////////////// diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index eb6a1a4c6..9600046da 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -26,8 +26,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf", "d33212a84368e821cbedecd4f59756d6", // tranches - "a35cd067f378442eee8cd5edeea92be0", // recal file - "126d52843f4a57199ee97750ffc16a07"); // cut VCF + "4652dca41222bebdf9d9fda343b2a835", // recal file + "5350b1a4c1250cf3b77ca45327c04711"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() {