From 564fe36d22ee19beebae4f79799c97c8428f2566 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 5 Apr 2013 09:28:46 -0400 Subject: [PATCH 1/2] VariantRecalibrator's VQSR.vcf now contains NEG/POS labels -- It's useful to know which sites have been used in the training of the model. The recal_file emitted by VR now contains VCF info field annotations labeling each site that was used in the positive or negative training models with POSITIVE_TRAINING_SITE and/or NEGATIVE_TRAINING_SITE -- Update MD5s, which all changed now that the recal file and the resulting applied vcfs all have these pos / neg labels --- .../ApplyRecalibration.java | 6 ++++++ .../VariantDataManager.java | 18 ++++++++---------- .../VariantRecalibrator.java | 2 ++ ...antRecalibrationWalkersIntegrationTest.java | 18 +++++++++--------- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 7de0c7e60..e15b99824 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -200,6 +200,8 @@ public class ApplyRecalibration extends RodWalker implements T hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); + hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants")); + hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants")); } //--------------------------------------------------------------------------------------------------------------- @@ -243,6 +245,10 @@ public class ApplyRecalibration extends RodWalker implements T // Annotate the new record with its VQSLOD and the worst performing annotation builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); + if ( recalDatum.hasAttribute(VariantRecalibrator.POSITIVE_LABEL_KEY)) + builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); + if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY)) + builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); for( int i = tranches.size() - 1; i >= 0; i-- ) { final Tranche tranche = tranches.get(i); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 3f6b6ed09..a830a801e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -335,19 +335,17 @@ public class VariantDataManager { }} ); // create dummy alleles to be used - final List alleles = new ArrayList(2); - alleles.add(Allele.create("N", true)); - alleles.add(Allele.create("", false)); - - // to be used for the important INFO tags - final HashMap attributes = new HashMap(3); + final List alleles = Arrays.asList(Allele.create("N", true), Allele.create("", false)); for( final VariantDatum datum : data ) { - attributes.put(VCFConstants.END_KEY, datum.loc.getStop()); - attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); - attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles); + builder.attribute(VCFConstants.END_KEY, datum.loc.getStop()); + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); + builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); + + if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); + if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); - VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes); recalWriter.add(builder.make()); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 320328ab1..b0970cce2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -135,6 +135,8 @@ public class VariantRecalibrator extends RodWalker Date: Thu, 11 Apr 2013 10:52:59 -0400 Subject: [PATCH 2/2] Improvements to the VariantRecalibrator R plots -- VariantRecalibrator now emits plots with denormlized values (original values) instead of their normalized (x - mu / sigma) which helps to understand the distribution of values that are good and bad --- .../VariantDataManager.java | 62 ++++++++++++------- .../VariantRecalibrator.java | 10 ++- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index a830a801e..40032a886 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -127,30 +127,46 @@ public class VariantDataManager { } } - public void addTrainingSet( final TrainingSet trainingSet ) { - trainingSets.add( trainingSet ); - } + /** + * Convert a normalized point to it's original annotation value + * + * norm = (orig - mu) / sigma + * orig = norm * sigma + mu + * + * @param normalizedValue the normalized value of the ith annotation + * @param annI the index of the annotation value + * @return the denormalized value for the annotation + */ + public double denormalizeDatum(final double normalizedValue, final int annI) { + final double mu = meanVector[annI]; + final double sigma = varianceVector[annI]; + return normalizedValue * sigma + mu; + } - public boolean checkHasTrainingSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isTraining ) { return true; } - } - return false; - } + public void addTrainingSet( final TrainingSet trainingSet ) { + trainingSets.add( trainingSet ); + } - public boolean checkHasTruthSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isTruth ) { return true; } - } - return false; - } + public boolean checkHasTrainingSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTraining ) { return true; } + } + return false; + } - public boolean checkHasKnownSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isKnown ) { return true; } - } - return false; - } + public boolean checkHasTruthSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isTruth ) { return true; } + } + return false; + } + + public boolean checkHasKnownSet() { + for( final TrainingSet trainingSet : trainingSets ) { + if( trainingSet.isKnown ) { return true; } + } + return false; + } public ExpandingArrayList getTrainingData() { final ExpandingArrayList trainingData = new ExpandingArrayList(); @@ -260,7 +276,7 @@ public class VariantDataManager { value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); if( Double.isInfinite(value) ) { value = Double.NaN; } if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM - value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); + value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } @@ -297,7 +313,7 @@ public class VariantDataManager { private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && checkVariationClass( evalVC, trainVC ) && - (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); + (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); } protected static boolean checkVariationClass( final VariantContext evalVC, final VariantContext trainVC ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index b0970cce2..bee695e2a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -435,14 +435,20 @@ public class VariantRecalibrator extends RodWalker