From 1677a0a458e83075e2b0c0e14d45f33d39690593 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 14 Jun 2013 15:56:13 -0400 Subject: [PATCH] Simpler FILTER and info field encoding for BeagleOutputToVCF -- Previous version created FILTERs for each possible alt allele when that site was set to monomorphic by BEAGLE. So if you had a A/C SNP in the original file and beagle thought it was AC=0, then you'd get a record with BGL_RM_WAS_A in the FILTER field. This obviously would cause problems for indels, as so the tool was blowing up in this case. Now beagle sets the filter field to BGL_SET_TO_MONOMORPHIC and sets the info field annotation OriginalAltAllele to A instead. This works in general with any type of allele. -- Here's an example output line from the previous and current versions: old: 20 64150 rs7274499 C . 3041.68 BGL_RM_WAS_A AN=566;DB;DP=1069;Dels=0.00;HRun=0;HaplotypeScore=238.33;LOD=3.5783;MQ=83.74;MQ0=0;NumGenotypesChanged=1;OQ=1949.35;QD=10.95;SB=-6918.88 new: 20 64062 . G . 100.39 BGL_SET_TO_MONOMORPHIC AN=566;DP=1108;Dels=0.00;HRun=2;HaplotypeScore=221.59;LOD=-0.5051;MQ=85.69;MQ0=0;NumGenotypesChanged=1;OQ=189.66;OriginalAltAllele=A;QD=15.81;SB=-6087.15 -- update MD5s to reflect these changes -- [delivers #50847721] --- .../gatk/walkers/beagle/BeagleIntegrationTest.java | 4 ++-- .../gatk/walkers/beagle/BeagleOutputToVCF.java | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 69a5fc65f..5601d66fb 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -62,7 +62,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("c5522304abf0633041c7772dd7dafcea")); + "-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("989449fa3e262b88ba126867fa3ad9fb")); spec.disableShadowBCF(); executeTest("test BeagleOutputToVCF", spec); } @@ -96,7 +96,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("d8906b67c7f9fdb5b37b8e9e050982d3")); + "-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("e036636fcd6a748ede4a70ea47941d47")); spec.disableShadowBCF(); executeTest("testBeagleChangesSitesToRef",spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java index 15bd79586..7d5ad9b8a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCF.java @@ -129,6 +129,9 @@ public class BeagleOutputToVCF extends RodWalker { private final double MIN_PROB_ERROR = 0.000001; private final double MAX_GENOTYPE_QUALITY = -6.0; + private final static String BEAGLE_MONO_FILTER_STRING = "BGL_SET_TO_MONOMORPHIC"; + private final static String ORIGINAL_ALT_ALLELE_INFO_KEY = "OriginalAltAllele"; + public void initialize() { // setup the header fields @@ -138,10 +141,8 @@ public class BeagleOutputToVCF extends RodWalker { hInfo.add(new VCFFormatHeaderLine("OG",1, VCFHeaderLineType.String, "Original Genotype input to Beagle")); hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beagle on each site")); hInfo.add(new VCFInfoHeaderLine("NumGenotypesChanged", 1, VCFHeaderLineType.Integer, "The number of genotypes changed by Beagle")); - hInfo.add(new VCFFilterHeaderLine("BGL_RM_WAS_A", "This 'A' site was set to monomorphic by Beagle")); - hInfo.add(new VCFFilterHeaderLine("BGL_RM_WAS_C", "This 'C' site was set to monomorphic by Beagle")); - hInfo.add(new VCFFilterHeaderLine("BGL_RM_WAS_G", "This 'G' site was set to monomorphic by Beagle")); - hInfo.add(new VCFFilterHeaderLine("BGL_RM_WAS_T", "This 'T' site was set to monomorphic by Beagle")); + hInfo.add(new VCFInfoHeaderLine(ORIGINAL_ALT_ALLELE_INFO_KEY, 1, VCFHeaderLineType.String, "The original alt allele for a site set to monomorphic by Beagle")); + hInfo.add(new VCFFilterHeaderLine(BEAGLE_MONO_FILTER_STRING, "This site was set to monomorphic by Beagle")); if ( comp.isBound() ) { hInfo.add(new VCFInfoHeaderLine("ACH", 1, VCFHeaderLineType.Integer, "Allele Count from Comparison ROD at this site")); @@ -335,9 +336,8 @@ public class BeagleOutputToVCF extends RodWalker { final VariantContextBuilder builder = new VariantContextBuilder(vc_input).source("outputvcf").genotypes(genotypes); if ( ! ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) ) { - Set removedFilters = vc_input.filtersWereApplied() ? new HashSet(vc_input.getFilters()) : new HashSet(1); - removedFilters.add(String.format("BGL_RM_WAS_%s",vc_input.getAlternateAllele(0))); - builder.alleles(new HashSet(Arrays.asList(vc_input.getReference()))).filters(removedFilters); + builder.attribute(ORIGINAL_ALT_ALLELE_INFO_KEY, vc_input.getAlternateAllele(0)); + builder.alleles(Collections.singleton(vc_input.getReference())).filter(BEAGLE_MONO_FILTER_STRING); } // re-compute chromosome counts