From 5b087c98979813def02d6fd17c8dad73f5ff36a8 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 25 Aug 2014 11:57:30 -0400 Subject: [PATCH] Changed the functionality of the physical phasing in the HC: now hom vars are output as 0|1. We do this for technical reasons, mostly because we don't genotype in the HC anymore; it's all done downstream by GenotypeGVCFs so we can't be sure that the genotype will be hom var. Also, there are steps in the downstream pipeline where genotypes can change, so assuming anything in the HC is a bad idea, and if we have phasing info in the het state, we want to propagate that forward. Now, PGT tag fixing happens downstream in GenotypeGVCFs. While I was in there I also cleaned up the code a bit and fixed a bug where annotation was happening before genotype creation when using the --includeNonVariantSites argument. Added tests accordingly. --- .../HaplotypeCallerGenotypingEngine.java | 19 ++++--- .../walkers/variantutils/GenotypeGVCFs.java | 53 ++++++++++++++----- .../HaplotypeCallerGVCFIntegrationTest.java | 4 +- ...plotypeCallerGenotypingEngineUnitTest.java | 9 ++-- .../GenotypeGVCFsIntegrationTest.java | 11 +++- 5 files changed, 71 insertions(+), 25 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index f3b7e68ec..7f19861bc 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -73,6 +73,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine NO_CALL = Collections.singletonList(Allele.NO_CALL); private static final int ALLELE_EXTENSION = 2; + private static final String phase01 = "0|1"; + private static final String phase10 = "1|0"; private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; @@ -404,14 +406,19 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine(uniqueCounter, callIsOnAllHaps ? "1|1" : "0|1")); - phaseSetMapping.put(comp, new Pair<>(uniqueCounter, compIsOnAllHaps ? "1|1" : "0|1")); + // An important note: even for homozygous variants we are setting the phase as "0|1" here. + // We do this because we cannot possibly know for sure at this time that the genotype for this + // sample will actually be homozygous downstream: there are steps in the pipeline that are liable + // to change the genotypes. Because we can't make those assumptions here, we have decided to output + // the phase as if the call is heterozygous and then "fix" it downstream as needed. + phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01)); + phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase01)); uniqueCounter++; } // otherwise it's part of an existing group so use that group's unique ID else if ( ! phaseSetMapping.containsKey(comp) ) { final Pair callPhase = phaseSetMapping.get(call); - phaseSetMapping.put(comp, new Pair<>(callPhase.first, compIsOnAllHaps ? "1|1" : callPhase.second)); + phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second)); } } // if the variants are apart on *all* haplotypes, record that fact @@ -431,14 +438,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine(uniqueCounter, "0|1")); - phaseSetMapping.put(comp, new Pair<>(uniqueCounter, "1|0")); + phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01)); + phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase10)); uniqueCounter++; } // otherwise it's part of an existing group so use that group's unique ID else if ( ! phaseSetMapping.containsKey(comp) ){ final Pair callPhase = phaseSetMapping.get(call); - phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals("0|1") ? "1|0" : "0|1")); + phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals(phase01) ? phase10 : phase01)); } } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index 6d83bbbf0..16071e84a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -52,6 +52,7 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine; import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; +import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; @@ -218,15 +219,7 @@ public class GenotypeGVCFs extends RodWalker attrs = new HashMap<>(originalVC.getAttributes()); - attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); - attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); - if (regenotypedVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY)) - attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, regenotypedVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY)); - - result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make(); + result = addGenotypingAnnotations(originalVC.getAttributes(), regenotypedVC); } // if it turned monomorphic then we either need to ignore or fix such sites @@ -237,18 +230,47 @@ public class GenotypeGVCFs extends RodWalker originalAttributes, final VariantContext newVC) { + // we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations + final Map attrs = new HashMap<>(originalAttributes); + attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); + attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + if (newVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY)) + attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, newVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY)); + + return new VariantContextBuilder(newVC).attributes(attrs).make(); + } + + /** * Cleans up genotype-level annotations that need to be updated. * 1. move MIN_DP to DP if present * 2. propagate DP to AD if not present * 3. remove SB if present + * 4. change the PGT value from "0|1" to "1|1" for homozygous variant genotypes * * @param VC the VariantContext with the Genotypes to fix * @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic @@ -273,6 +295,11 @@ public class GenotypeGVCFs extends RodWalker(haplotypeMap), 2, 3, 1, 2, 1}); + // test 2 hets + haplotypeMap.remove(vc3); + tests.add(new Object[]{Arrays.asList(vc2, vc4), new HashMap<>(haplotypeMap), 1, 2, 1, 2, 0}); + // test 2 with opposite phase final Set haplotypes1 = new HashSet<>(); haplotypes1.add(pos1); haplotypeMap.put(vc1, haplotypes1); - haplotypeMap.remove(vc3); tests.add(new Object[]{Arrays.asList(vc1, vc2, vc4), new HashMap<>(haplotypeMap), 2, 3, 1, 1, 2}); // test homs around a het @@ -498,7 +501,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { haplotypeMap.put(vc2, haplotypes2hom); haplotypeMap.put(vc3, haplotypes3het); haplotypeMap.put(vc4, haplotypes4hom); - tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 1, 0}); + tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0}); // test hets around a hom final Set haplotypes2het = new HashSet<>(); @@ -511,7 +514,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest { haplotypeMap.put(vc2, haplotypes2het); haplotypeMap.put(vc3, haplotypes3hom); haplotypeMap.put(vc4, haplotypes4het); - tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 0}); + tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0}); // test no phased variants around a hom final Set haplotypes2incomplete = new HashSet<>(); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 158533450..f4ebb596f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -61,6 +61,15 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { return baseTestString(" -V " + privateTestDir + "gvcf.basepairResolution.vcf " + args, b37KGReference); } + @Test(enabled = true) + public void testUpdatePGT() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference), + 1, + Arrays.asList("bc3d3eff337836af245b81f52b94d70c")); + executeTest("testUpdatePGT", spec); + } + @Test(enabled = true) public void combineSingleSamplePipelineGVCF() { WalkerTestSpec spec = new WalkerTestSpec( @@ -81,7 +90,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Arrays.asList("f1dd2b9ab422c0b806392464e466ed91")); + Arrays.asList("4193e7c4e6fc889b4aa5a326899b1a4e")); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); }