Simpler FILTER and info field encoding for BeagleOutputToVCF

-- Previous version created FILTERs for each possible alt allele when that site was set to monomorphic by BEAGLE.  So if you had a A/C SNP in the original file and beagle thought it was AC=0, then you'd get a record with BGL_RM_WAS_A in the FILTER field.  This obviously would cause problems for indels, as so the tool was blowing up in this case.  Now beagle sets the filter field to BGL_SET_TO_MONOMORPHIC and sets the info field annotation OriginalAltAllele to A instead.  This works in general with any type of allele.
 -- Here's an example output line from the previous and current versions:
 old: 20    64150   rs7274499       C       .       3041.68 BGL_RM_WAS_A    AN=566;DB;DP=1069;Dels=0.00;HRun=0;HaplotypeScore=238.33;LOD=3.5783;MQ=83.74;MQ0=0;NumGenotypesChanged=1;OQ=1949.35;QD=10.95;SB=-6918.88
 new: 20    64062   .       G       .       100.39  BGL_SET_TO_MONOMORPHIC  AN=566;DP=1108;Dels=0.00;HRun=2;HaplotypeScore=221.59;LOD=-0.5051;MQ=85.69;MQ0=0;NumGenotypesChanged=1;OQ=189.66;OriginalAltAllele=A;QD=15.81;SB=-6087.15
-- update MD5s to reflect these changes
-- [delivers #50847721]
This commit is contained in:
Mark DePristo 2013-06-14 15:56:13 -04:00
parent b72880cc94
commit 1677a0a458
2 changed files with 9 additions and 9 deletions

View File

@ -62,7 +62,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " +
"--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " +
"--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " +
"-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("c5522304abf0633041c7772dd7dafcea"));
"-o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, Arrays.asList("989449fa3e262b88ba126867fa3ad9fb"));
spec.disableShadowBCF();
executeTest("test BeagleOutputToVCF", spec);
}
@ -96,7 +96,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
"--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
"--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
"-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("d8906b67c7f9fdb5b37b8e9e050982d3"));
"-L 20:1-70000 -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING",1,Arrays.asList("e036636fcd6a748ede4a70ea47941d47"));
spec.disableShadowBCF();
executeTest("testBeagleChangesSitesToRef",spec);
}

View File

@ -129,6 +129,9 @@ public class BeagleOutputToVCF extends RodWalker<Integer, Integer> {
private final double MIN_PROB_ERROR = 0.000001;
private final double MAX_GENOTYPE_QUALITY = -6.0;
private final static String BEAGLE_MONO_FILTER_STRING = "BGL_SET_TO_MONOMORPHIC";
private final static String ORIGINAL_ALT_ALLELE_INFO_KEY = "OriginalAltAllele";
public void initialize() {
// setup the header fields
@ -138,10 +141,8 @@ public class BeagleOutputToVCF extends RodWalker<Integer, Integer> {
hInfo.add(new VCFFormatHeaderLine("OG",1, VCFHeaderLineType.String, "Original Genotype input to Beagle"));
hInfo.add(new VCFInfoHeaderLine("R2", 1, VCFHeaderLineType.Float, "r2 Value reported by Beagle on each site"));
hInfo.add(new VCFInfoHeaderLine("NumGenotypesChanged", 1, VCFHeaderLineType.Integer, "The number of genotypes changed by Beagle"));
hInfo.add(new VCFFilterHeaderLine("BGL_RM_WAS_A", "This 'A' site was set to monomorphic by Beagle"));
hInfo.add(new VCFFilterHeaderLine("BGL_RM_WAS_C", "This 'C' site was set to monomorphic by Beagle"));
hInfo.add(new VCFFilterHeaderLine("BGL_RM_WAS_G", "This 'G' site was set to monomorphic by Beagle"));
hInfo.add(new VCFFilterHeaderLine("BGL_RM_WAS_T", "This 'T' site was set to monomorphic by Beagle"));
hInfo.add(new VCFInfoHeaderLine(ORIGINAL_ALT_ALLELE_INFO_KEY, 1, VCFHeaderLineType.String, "The original alt allele for a site set to monomorphic by Beagle"));
hInfo.add(new VCFFilterHeaderLine(BEAGLE_MONO_FILTER_STRING, "This site was set to monomorphic by Beagle"));
if ( comp.isBound() ) {
hInfo.add(new VCFInfoHeaderLine("ACH", 1, VCFHeaderLineType.Integer, "Allele Count from Comparison ROD at this site"));
@ -335,9 +336,8 @@ public class BeagleOutputToVCF extends RodWalker<Integer, Integer> {
final VariantContextBuilder builder = new VariantContextBuilder(vc_input).source("outputvcf").genotypes(genotypes);
if ( ! ( beagleVarCounts > 0 || DONT_FILTER_MONOMORPHIC_SITES ) ) {
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);
removedFilters.add(String.format("BGL_RM_WAS_%s",vc_input.getAlternateAllele(0)));
builder.alleles(new HashSet<Allele>(Arrays.asList(vc_input.getReference()))).filters(removedFilters);
builder.attribute(ORIGINAL_ALT_ALLELE_INFO_KEY, vc_input.getAlternateAllele(0));
builder.alleles(Collections.singleton(vc_input.getReference())).filter(BEAGLE_MONO_FILTER_STRING);
}
// re-compute chromosome counts