From 564fe36d22ee19beebae4f79799c97c8428f2566 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 5 Apr 2013 09:28:46 -0400 Subject: [PATCH] VariantRecalibrator's VQSR.vcf now contains NEG/POS labels -- It's useful to know which sites have been used in the training of the model. The recal_file emitted by VR now contains VCF info field annotations labeling each site that was used in the positive or negative training models with POSITIVE_TRAINING_SITE and/or NEGATIVE_TRAINING_SITE -- Update MD5s, which all changed now that the recal file and the resulting applied vcfs all have these pos / neg labels --- .../ApplyRecalibration.java | 6 ++++++ .../VariantDataManager.java | 18 ++++++++---------- .../VariantRecalibrator.java | 2 ++ ...antRecalibrationWalkersIntegrationTest.java | 18 +++++++++--------- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 7de0c7e60..e15b99824 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -200,6 +200,8 @@ public class ApplyRecalibration extends RodWalker implements T hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); + hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants")); + hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants")); } //--------------------------------------------------------------------------------------------------------------- @@ -243,6 +245,10 @@ public class ApplyRecalibration extends RodWalker implements T // Annotate the new record with its VQSLOD and the worst performing annotation builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); + if ( recalDatum.hasAttribute(VariantRecalibrator.POSITIVE_LABEL_KEY)) + builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); + if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY)) + builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); for( int i = tranches.size() - 1; i >= 0; i-- ) { final Tranche tranche = tranches.get(i); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 3f6b6ed09..a830a801e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -335,19 +335,17 @@ public class VariantDataManager { }} ); // create dummy alleles to be used - final List alleles = new ArrayList(2); - alleles.add(Allele.create("N", true)); - alleles.add(Allele.create("", false)); - - // to be used for the important INFO tags - final HashMap attributes = new HashMap(3); + final List alleles = Arrays.asList(Allele.create("N", true), Allele.create("", false)); for( final VariantDatum datum : data ) { - attributes.put(VCFConstants.END_KEY, datum.loc.getStop()); - attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); - attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles); + builder.attribute(VCFConstants.END_KEY, datum.loc.getStop()); + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); + builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); + + if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true); + if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); - VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes); recalWriter.add(builder.make()); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 320328ab1..b0970cce2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -135,6 +135,8 @@ public class VariantRecalibrator extends RodWalker