Merge pull request #717 from broadinstitute/eb_change_phasing_of_hom_vars

Changed the functionality of the physical phasing in the HC: now hom var...
This commit is contained in:
Eric Banks 2014-08-25 21:41:11 -04:00
commit b654590ed6
5 changed files with 71 additions and 25 deletions

View File

@ -73,6 +73,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
private final static List<Allele> NO_CALL = Collections.singletonList(Allele.NO_CALL);
private static final int ALLELE_EXTENSION = 2;
private static final String phase01 = "0|1";
private static final String phase10 = "1|0";
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
@ -404,14 +406,19 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
return 0;
}
phaseSetMapping.put(call, new Pair<>(uniqueCounter, callIsOnAllHaps ? "1|1" : "0|1"));
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, compIsOnAllHaps ? "1|1" : "0|1"));
// An important note: even for homozygous variants we are setting the phase as "0|1" here.
// We do this because we cannot possibly know for sure at this time that the genotype for this
// sample will actually be homozygous downstream: there are steps in the pipeline that are liable
// to change the genotypes. Because we can't make those assumptions here, we have decided to output
// the phase as if the call is heterozygous and then "fix" it downstream as needed.
phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01));
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase01));
uniqueCounter++;
}
// otherwise it's part of an existing group so use that group's unique ID
else if ( ! phaseSetMapping.containsKey(comp) ) {
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
phaseSetMapping.put(comp, new Pair<>(callPhase.first, compIsOnAllHaps ? "1|1" : callPhase.second));
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second));
}
}
// if the variants are apart on *all* haplotypes, record that fact
@ -431,14 +438,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
return 0;
}
phaseSetMapping.put(call, new Pair<>(uniqueCounter, "0|1"));
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, "1|0"));
phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01));
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase10));
uniqueCounter++;
}
// otherwise it's part of an existing group so use that group's unique ID
else if ( ! phaseSetMapping.containsKey(comp) ){
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals("0|1") ? "1|0" : "0|1"));
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals(phase01) ? phase10 : phase01));
}
}
}

View File

@ -52,6 +52,7 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine;
import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList;
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList;
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
@ -218,15 +219,7 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
return null;
regenotypedVC = GATKVariantContextUtils.reverseTrimAlleles(regenotypedVC);
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
final Map<String, Object> attrs = new HashMap<>(originalVC.getAttributes());
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
if (regenotypedVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY))
attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, regenotypedVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY));
result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make();
result = addGenotypingAnnotations(originalVC.getAttributes(), regenotypedVC);
}
// if it turned monomorphic then we either need to ignore or fix such sites
@ -237,18 +230,47 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
createRefGTs = true;
}
// re-annotate it
result = annotationEngine.annotateContext(tracker, ref, null, result);
// Re-annotate and fix/remove some of the original annotations.
// Note that the order of these actions matters and is different for polymorphic and monomorphic sites.
// For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later.
// For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
// We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes.
if ( createRefGTs ) {
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
result = annotationEngine.annotateContext(tracker, ref, null, result);
} else {
result = annotationEngine.annotateContext(tracker, ref, null, result);
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, false)).make();
}
// fix some of the annotations
return new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, createRefGTs)).make();
return result;
}
/**
* Add genotyping-based annotations to the new VC
*
* @param originalAttributes the non-null annotations from the original VC
* @param newVC the new non-null VC
* @return a non-null VC
*/
private VariantContext addGenotypingAnnotations(final Map<String, Object> originalAttributes, final VariantContext newVC) {
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
final Map<String, Object> attrs = new HashMap<>(originalAttributes);
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
if (newVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY))
attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, newVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY));
return new VariantContextBuilder(newVC).attributes(attrs).make();
}
/**
* Cleans up genotype-level annotations that need to be updated.
* 1. move MIN_DP to DP if present
* 2. propagate DP to AD if not present
* 3. remove SB if present
* 4. change the PGT value from "0|1" to "1|1" for homozygous variant genotypes
*
* @param VC the VariantContext with the Genotypes to fix
* @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic
@ -273,6 +295,11 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
// remove SB
attrs.remove("SB");
// update PGT for hom vars
if ( oldGT.isHomVar() && oldGT.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_GT_KEY) ) {
attrs.put(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_GT_KEY, "1|1");
}
// create AD if it's not there
if ( !oldGT.hasAD() && VC.isVariant() ) {
final int[] AD = new int[VC.getNAlleles()];

View File

@ -86,8 +86,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "9db87ae56df22456f3c08024384f3e5e"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "90dc8950e28e042097817d785022482e"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a64e2d8e60a1be9cb0b4306c85e96393"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ad0b6274991d451a6491f3118f0e7b2b"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "956938fd1b5c7bf1b717326a7dd46d91"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "d5c07fa3edca496a84fd17cecad06230"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "66019a0914f905522da6bd3b557a57d1"});

View File

@ -479,11 +479,14 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
haplotypeMap.put(vc4, haplotypes24);
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 1});
// test 2 hets
haplotypeMap.remove(vc3);
tests.add(new Object[]{Arrays.asList(vc2, vc4), new HashMap<>(haplotypeMap), 1, 2, 1, 2, 0});
// test 2 with opposite phase
final Set<Haplotype> haplotypes1 = new HashSet<>();
haplotypes1.add(pos1);
haplotypeMap.put(vc1, haplotypes1);
haplotypeMap.remove(vc3);
tests.add(new Object[]{Arrays.asList(vc1, vc2, vc4), new HashMap<>(haplotypeMap), 2, 3, 1, 1, 2});
// test homs around a het
@ -498,7 +501,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
haplotypeMap.put(vc2, haplotypes2hom);
haplotypeMap.put(vc3, haplotypes3het);
haplotypeMap.put(vc4, haplotypes4hom);
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 1, 0});
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0});
// test hets around a hom
final Set<Haplotype> haplotypes2het = new HashSet<>();
@ -511,7 +514,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
haplotypeMap.put(vc2, haplotypes2het);
haplotypeMap.put(vc3, haplotypes3hom);
haplotypeMap.put(vc4, haplotypes4het);
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 0});
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0});
// test no phased variants around a hom
final Set<Haplotype> haplotypes2incomplete = new HashSet<>();

View File

@ -61,6 +61,15 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
return baseTestString(" -V " + privateTestDir + "gvcf.basepairResolution.vcf " + args, b37KGReference);
}
@Test(enabled = true)
public void testUpdatePGT() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference),
1,
Arrays.asList("bc3d3eff337836af245b81f52b94d70c"));
executeTest("testUpdatePGT", spec);
}
@Test(enabled = true)
public void combineSingleSamplePipelineGVCF() {
WalkerTestSpec spec = new WalkerTestSpec(
@ -81,7 +90,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
1,
Arrays.asList("f1dd2b9ab422c0b806392464e466ed91"));
Arrays.asList("4193e7c4e6fc889b4aa5a326899b1a4e"));
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
}