Changed the functionality of the physical phasing in the HC: now hom vars are output as 0|1.

We do this for technical reasons, mostly because we don't genotype in the HC anymore; it's all
done downstream by GenotypeGVCFs so we can't be sure that the genotype will be hom var.  Also,
there are steps in the downstream pipeline where genotypes can change, so assuming anything in
the HC is a bad idea, and if we have phasing info in the het state, we want to propagate that forward.

Now, PGT tag fixing happens downstream in GenotypeGVCFs.
While I was in there I also cleaned up the code a bit and fixed a bug where annotation was happening
before genotype creation when using the --includeNonVariantSites argument.

Added tests accordingly.
This commit is contained in:
Eric Banks 2014-08-25 11:57:30 -04:00
parent 9324121af4
commit 5b087c9897
5 changed files with 71 additions and 25 deletions

View File

@ -73,6 +73,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
private final static List<Allele> NO_CALL = Collections.singletonList(Allele.NO_CALL);
private static final int ALLELE_EXTENSION = 2;
private static final String phase01 = "0|1";
private static final String phase10 = "1|0";
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
@ -404,14 +406,19 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
return 0;
}
phaseSetMapping.put(call, new Pair<>(uniqueCounter, callIsOnAllHaps ? "1|1" : "0|1"));
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, compIsOnAllHaps ? "1|1" : "0|1"));
// An important note: even for homozygous variants we are setting the phase as "0|1" here.
// We do this because we cannot possibly know for sure at this time that the genotype for this
// sample will actually be homozygous downstream: there are steps in the pipeline that are liable
// to change the genotypes. Because we can't make those assumptions here, we have decided to output
// the phase as if the call is heterozygous and then "fix" it downstream as needed.
phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01));
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase01));
uniqueCounter++;
}
// otherwise it's part of an existing group so use that group's unique ID
else if ( ! phaseSetMapping.containsKey(comp) ) {
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
phaseSetMapping.put(comp, new Pair<>(callPhase.first, compIsOnAllHaps ? "1|1" : callPhase.second));
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second));
}
}
// if the variants are apart on *all* haplotypes, record that fact
@ -431,14 +438,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
return 0;
}
phaseSetMapping.put(call, new Pair<>(uniqueCounter, "0|1"));
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, "1|0"));
phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01));
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase10));
uniqueCounter++;
}
// otherwise it's part of an existing group so use that group's unique ID
else if ( ! phaseSetMapping.containsKey(comp) ){
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals("0|1") ? "1|0" : "0|1"));
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals(phase01) ? phase10 : phase01));
}
}
}

View File

@ -52,6 +52,7 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine;
import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList;
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList;
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
@ -218,15 +219,7 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
return null;
regenotypedVC = GATKVariantContextUtils.reverseTrimAlleles(regenotypedVC);
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
final Map<String, Object> attrs = new HashMap<>(originalVC.getAttributes());
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
if (regenotypedVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY))
attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, regenotypedVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY));
result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make();
result = addGenotypingAnnotations(originalVC.getAttributes(), regenotypedVC);
}
// if it turned monomorphic then we either need to ignore or fix such sites
@ -237,18 +230,47 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
createRefGTs = true;
}
// re-annotate it
result = annotationEngine.annotateContext(tracker, ref, null, result);
// Re-annotate and fix/remove some of the original annotations.
// Note that the order of these actions matters and is different for polymorphic and monomorphic sites.
// For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later.
// For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
// We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes.
if ( createRefGTs ) {
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
result = annotationEngine.annotateContext(tracker, ref, null, result);
} else {
result = annotationEngine.annotateContext(tracker, ref, null, result);
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, false)).make();
}
// fix some of the annotations
return new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, createRefGTs)).make();
return result;
}
/**
* Add genotyping-based annotations to the new VC
*
* @param originalAttributes the non-null annotations from the original VC
* @param newVC the new non-null VC
* @return a non-null VC
*/
private VariantContext addGenotypingAnnotations(final Map<String, Object> originalAttributes, final VariantContext newVC) {
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
final Map<String, Object> attrs = new HashMap<>(originalAttributes);
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
if (newVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY))
attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, newVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY));
return new VariantContextBuilder(newVC).attributes(attrs).make();
}
/**
* Cleans up genotype-level annotations that need to be updated.
* 1. move MIN_DP to DP if present
* 2. propagate DP to AD if not present
* 3. remove SB if present
* 4. change the PGT value from "0|1" to "1|1" for homozygous variant genotypes
*
* @param VC the VariantContext with the Genotypes to fix
* @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic
@ -273,6 +295,11 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
// remove SB
attrs.remove("SB");
// update PGT for hom vars
if ( oldGT.isHomVar() && oldGT.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_GT_KEY) ) {
attrs.put(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_GT_KEY, "1|1");
}
// create AD if it's not there
if ( !oldGT.hasAD() && VC.isVariant() ) {
final int[] AD = new int[VC.getNAlleles()];

View File

@ -86,8 +86,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "9db87ae56df22456f3c08024384f3e5e"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "90dc8950e28e042097817d785022482e"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a64e2d8e60a1be9cb0b4306c85e96393"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ad0b6274991d451a6491f3118f0e7b2b"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "956938fd1b5c7bf1b717326a7dd46d91"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "d5c07fa3edca496a84fd17cecad06230"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "66019a0914f905522da6bd3b557a57d1"});

View File

@ -479,11 +479,14 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
haplotypeMap.put(vc4, haplotypes24);
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 1});
// test 2 hets
haplotypeMap.remove(vc3);
tests.add(new Object[]{Arrays.asList(vc2, vc4), new HashMap<>(haplotypeMap), 1, 2, 1, 2, 0});
// test 2 with opposite phase
final Set<Haplotype> haplotypes1 = new HashSet<>();
haplotypes1.add(pos1);
haplotypeMap.put(vc1, haplotypes1);
haplotypeMap.remove(vc3);
tests.add(new Object[]{Arrays.asList(vc1, vc2, vc4), new HashMap<>(haplotypeMap), 2, 3, 1, 1, 2});
// test homs around a het
@ -498,7 +501,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
haplotypeMap.put(vc2, haplotypes2hom);
haplotypeMap.put(vc3, haplotypes3het);
haplotypeMap.put(vc4, haplotypes4hom);
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 1, 0});
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0});
// test hets around a hom
final Set<Haplotype> haplotypes2het = new HashSet<>();
@ -511,7 +514,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
haplotypeMap.put(vc2, haplotypes2het);
haplotypeMap.put(vc3, haplotypes3hom);
haplotypeMap.put(vc4, haplotypes4het);
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 0});
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0});
// test no phased variants around a hom
final Set<Haplotype> haplotypes2incomplete = new HashSet<>();

View File

@ -61,6 +61,15 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
return baseTestString(" -V " + privateTestDir + "gvcf.basepairResolution.vcf " + args, b37KGReference);
}
@Test(enabled = true)
public void testUpdatePGT() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference),
1,
Arrays.asList("bc3d3eff337836af245b81f52b94d70c"));
executeTest("testUpdatePGT", spec);
}
@Test(enabled = true)
public void combineSingleSamplePipelineGVCF() {
WalkerTestSpec spec = new WalkerTestSpec(
@ -81,7 +90,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
1,
Arrays.asList("f1dd2b9ab422c0b806392464e466ed91"));
Arrays.asList("4193e7c4e6fc889b4aa5a326899b1a4e"));
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
}