Merge pull request #717 from broadinstitute/eb_change_phasing_of_hom_vars
Changed the functionality of the physical phasing in the HC: now hom var...
This commit is contained in:
commit
b654590ed6
|
|
@ -73,6 +73,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
|
|||
|
||||
private final static List<Allele> NO_CALL = Collections.singletonList(Allele.NO_CALL);
|
||||
private static final int ALLELE_EXTENSION = 2;
|
||||
private static final String phase01 = "0|1";
|
||||
private static final String phase10 = "1|0";
|
||||
|
||||
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
|
||||
|
||||
|
|
@ -404,14 +406,19 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
|
|||
return 0;
|
||||
}
|
||||
|
||||
phaseSetMapping.put(call, new Pair<>(uniqueCounter, callIsOnAllHaps ? "1|1" : "0|1"));
|
||||
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, compIsOnAllHaps ? "1|1" : "0|1"));
|
||||
// An important note: even for homozygous variants we are setting the phase as "0|1" here.
|
||||
// We do this because we cannot possibly know for sure at this time that the genotype for this
|
||||
// sample will actually be homozygous downstream: there are steps in the pipeline that are liable
|
||||
// to change the genotypes. Because we can't make those assumptions here, we have decided to output
|
||||
// the phase as if the call is heterozygous and then "fix" it downstream as needed.
|
||||
phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01));
|
||||
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase01));
|
||||
uniqueCounter++;
|
||||
}
|
||||
// otherwise it's part of an existing group so use that group's unique ID
|
||||
else if ( ! phaseSetMapping.containsKey(comp) ) {
|
||||
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
|
||||
phaseSetMapping.put(comp, new Pair<>(callPhase.first, compIsOnAllHaps ? "1|1" : callPhase.second));
|
||||
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second));
|
||||
}
|
||||
}
|
||||
// if the variants are apart on *all* haplotypes, record that fact
|
||||
|
|
@ -431,14 +438,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
|
|||
return 0;
|
||||
}
|
||||
|
||||
phaseSetMapping.put(call, new Pair<>(uniqueCounter, "0|1"));
|
||||
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, "1|0"));
|
||||
phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01));
|
||||
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase10));
|
||||
uniqueCounter++;
|
||||
}
|
||||
// otherwise it's part of an existing group so use that group's unique ID
|
||||
else if ( ! phaseSetMapping.containsKey(comp) ){
|
||||
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
|
||||
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals("0|1") ? "1|0" : "0|1"));
|
||||
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals(phase01) ? phase10 : phase01));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine;
|
|||
import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList;
|
||||
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList;
|
||||
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils;
|
||||
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
|
||||
import org.broadinstitute.gatk.utils.commandline.*;
|
||||
import org.broadinstitute.gatk.engine.CommandLineGATK;
|
||||
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
|
||||
|
|
@ -218,15 +219,7 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
|
|||
return null;
|
||||
|
||||
regenotypedVC = GATKVariantContextUtils.reverseTrimAlleles(regenotypedVC);
|
||||
|
||||
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
|
||||
final Map<String, Object> attrs = new HashMap<>(originalVC.getAttributes());
|
||||
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
|
||||
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
|
||||
if (regenotypedVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY))
|
||||
attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, regenotypedVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY));
|
||||
|
||||
result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make();
|
||||
result = addGenotypingAnnotations(originalVC.getAttributes(), regenotypedVC);
|
||||
}
|
||||
|
||||
// if it turned monomorphic then we either need to ignore or fix such sites
|
||||
|
|
@ -237,18 +230,47 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
|
|||
createRefGTs = true;
|
||||
}
|
||||
|
||||
// re-annotate it
|
||||
result = annotationEngine.annotateContext(tracker, ref, null, result);
|
||||
// Re-annotate and fix/remove some of the original annotations.
|
||||
// Note that the order of these actions matters and is different for polymorphic and monomorphic sites.
|
||||
// For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later.
|
||||
// For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
|
||||
// We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes.
|
||||
if ( createRefGTs ) {
|
||||
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
|
||||
result = annotationEngine.annotateContext(tracker, ref, null, result);
|
||||
} else {
|
||||
result = annotationEngine.annotateContext(tracker, ref, null, result);
|
||||
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, false)).make();
|
||||
}
|
||||
|
||||
// fix some of the annotations
|
||||
return new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, createRefGTs)).make();
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add genotyping-based annotations to the new VC
|
||||
*
|
||||
* @param originalAttributes the non-null annotations from the original VC
|
||||
* @param newVC the new non-null VC
|
||||
* @return a non-null VC
|
||||
*/
|
||||
private VariantContext addGenotypingAnnotations(final Map<String, Object> originalAttributes, final VariantContext newVC) {
|
||||
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
|
||||
final Map<String, Object> attrs = new HashMap<>(originalAttributes);
|
||||
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
|
||||
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
|
||||
if (newVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY))
|
||||
attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, newVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY));
|
||||
|
||||
return new VariantContextBuilder(newVC).attributes(attrs).make();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Cleans up genotype-level annotations that need to be updated.
|
||||
* 1. move MIN_DP to DP if present
|
||||
* 2. propagate DP to AD if not present
|
||||
* 3. remove SB if present
|
||||
* 4. change the PGT value from "0|1" to "1|1" for homozygous variant genotypes
|
||||
*
|
||||
* @param VC the VariantContext with the Genotypes to fix
|
||||
* @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic
|
||||
|
|
@ -273,6 +295,11 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
|
|||
// remove SB
|
||||
attrs.remove("SB");
|
||||
|
||||
// update PGT for hom vars
|
||||
if ( oldGT.isHomVar() && oldGT.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_GT_KEY) ) {
|
||||
attrs.put(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_GT_KEY, "1|1");
|
||||
}
|
||||
|
||||
// create AD if it's not there
|
||||
if ( !oldGT.hasAD() && VC.isVariant() ) {
|
||||
final int[] AD = new int[VC.getNAlleles()];
|
||||
|
|
|
|||
|
|
@ -86,8 +86,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
|||
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "9db87ae56df22456f3c08024384f3e5e"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "90dc8950e28e042097817d785022482e"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a64e2d8e60a1be9cb0b4306c85e96393"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ad0b6274991d451a6491f3118f0e7b2b"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "956938fd1b5c7bf1b717326a7dd46d91"});
|
||||
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "d5c07fa3edca496a84fd17cecad06230"});
|
||||
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "66019a0914f905522da6bd3b557a57d1"});
|
||||
|
||||
|
|
|
|||
|
|
@ -479,11 +479,14 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
|
|||
haplotypeMap.put(vc4, haplotypes24);
|
||||
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 1});
|
||||
|
||||
// test 2 hets
|
||||
haplotypeMap.remove(vc3);
|
||||
tests.add(new Object[]{Arrays.asList(vc2, vc4), new HashMap<>(haplotypeMap), 1, 2, 1, 2, 0});
|
||||
|
||||
// test 2 with opposite phase
|
||||
final Set<Haplotype> haplotypes1 = new HashSet<>();
|
||||
haplotypes1.add(pos1);
|
||||
haplotypeMap.put(vc1, haplotypes1);
|
||||
haplotypeMap.remove(vc3);
|
||||
tests.add(new Object[]{Arrays.asList(vc1, vc2, vc4), new HashMap<>(haplotypeMap), 2, 3, 1, 1, 2});
|
||||
|
||||
// test homs around a het
|
||||
|
|
@ -498,7 +501,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
|
|||
haplotypeMap.put(vc2, haplotypes2hom);
|
||||
haplotypeMap.put(vc3, haplotypes3het);
|
||||
haplotypeMap.put(vc4, haplotypes4hom);
|
||||
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 1, 0});
|
||||
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0});
|
||||
|
||||
// test hets around a hom
|
||||
final Set<Haplotype> haplotypes2het = new HashSet<>();
|
||||
|
|
@ -511,7 +514,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
|
|||
haplotypeMap.put(vc2, haplotypes2het);
|
||||
haplotypeMap.put(vc3, haplotypes3hom);
|
||||
haplotypeMap.put(vc4, haplotypes4het);
|
||||
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 0});
|
||||
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0});
|
||||
|
||||
// test no phased variants around a hom
|
||||
final Set<Haplotype> haplotypes2incomplete = new HashSet<>();
|
||||
|
|
|
|||
|
|
@ -61,6 +61,15 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
|
|||
return baseTestString(" -V " + privateTestDir + "gvcf.basepairResolution.vcf " + args, b37KGReference);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testUpdatePGT() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference),
|
||||
1,
|
||||
Arrays.asList("bc3d3eff337836af245b81f52b94d70c"));
|
||||
executeTest("testUpdatePGT", spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void combineSingleSamplePipelineGVCF() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
|
|
@ -81,7 +90,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
|
|||
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
|
||||
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
|
||||
1,
|
||||
Arrays.asList("f1dd2b9ab422c0b806392464e466ed91"));
|
||||
Arrays.asList("4193e7c4e6fc889b4aa5a326899b1a4e"));
|
||||
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue