Merge pull request #717 from broadinstitute/eb_change_phasing_of_hom_vars
Changed the functionality of the physical phasing in the HC: now hom var...
This commit is contained in:
commit
b654590ed6
|
|
@ -73,6 +73,8 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
|
||||||
|
|
||||||
private final static List<Allele> NO_CALL = Collections.singletonList(Allele.NO_CALL);
|
private final static List<Allele> NO_CALL = Collections.singletonList(Allele.NO_CALL);
|
||||||
private static final int ALLELE_EXTENSION = 2;
|
private static final int ALLELE_EXTENSION = 2;
|
||||||
|
private static final String phase01 = "0|1";
|
||||||
|
private static final String phase10 = "1|0";
|
||||||
|
|
||||||
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
|
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
|
||||||
|
|
||||||
|
|
@ -404,14 +406,19 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
phaseSetMapping.put(call, new Pair<>(uniqueCounter, callIsOnAllHaps ? "1|1" : "0|1"));
|
// An important note: even for homozygous variants we are setting the phase as "0|1" here.
|
||||||
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, compIsOnAllHaps ? "1|1" : "0|1"));
|
// We do this because we cannot possibly know for sure at this time that the genotype for this
|
||||||
|
// sample will actually be homozygous downstream: there are steps in the pipeline that are liable
|
||||||
|
// to change the genotypes. Because we can't make those assumptions here, we have decided to output
|
||||||
|
// the phase as if the call is heterozygous and then "fix" it downstream as needed.
|
||||||
|
phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01));
|
||||||
|
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase01));
|
||||||
uniqueCounter++;
|
uniqueCounter++;
|
||||||
}
|
}
|
||||||
// otherwise it's part of an existing group so use that group's unique ID
|
// otherwise it's part of an existing group so use that group's unique ID
|
||||||
else if ( ! phaseSetMapping.containsKey(comp) ) {
|
else if ( ! phaseSetMapping.containsKey(comp) ) {
|
||||||
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
|
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
|
||||||
phaseSetMapping.put(comp, new Pair<>(callPhase.first, compIsOnAllHaps ? "1|1" : callPhase.second));
|
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// if the variants are apart on *all* haplotypes, record that fact
|
// if the variants are apart on *all* haplotypes, record that fact
|
||||||
|
|
@ -431,14 +438,14 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
phaseSetMapping.put(call, new Pair<>(uniqueCounter, "0|1"));
|
phaseSetMapping.put(call, new Pair<>(uniqueCounter, phase01));
|
||||||
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, "1|0"));
|
phaseSetMapping.put(comp, new Pair<>(uniqueCounter, phase10));
|
||||||
uniqueCounter++;
|
uniqueCounter++;
|
||||||
}
|
}
|
||||||
// otherwise it's part of an existing group so use that group's unique ID
|
// otherwise it's part of an existing group so use that group's unique ID
|
||||||
else if ( ! phaseSetMapping.containsKey(comp) ){
|
else if ( ! phaseSetMapping.containsKey(comp) ){
|
||||||
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
|
final Pair<Integer, String> callPhase = phaseSetMapping.get(call);
|
||||||
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals("0|1") ? "1|0" : "0|1"));
|
phaseSetMapping.put(comp, new Pair<>(callPhase.first, callPhase.second.equals(phase01) ? phase10 : phase01));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,7 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine;
|
||||||
import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList;
|
import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList;
|
||||||
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList;
|
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList;
|
||||||
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils;
|
import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils;
|
||||||
|
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
|
||||||
import org.broadinstitute.gatk.utils.commandline.*;
|
import org.broadinstitute.gatk.utils.commandline.*;
|
||||||
import org.broadinstitute.gatk.engine.CommandLineGATK;
|
import org.broadinstitute.gatk.engine.CommandLineGATK;
|
||||||
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
|
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
|
||||||
|
|
@ -218,15 +219,7 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
regenotypedVC = GATKVariantContextUtils.reverseTrimAlleles(regenotypedVC);
|
regenotypedVC = GATKVariantContextUtils.reverseTrimAlleles(regenotypedVC);
|
||||||
|
result = addGenotypingAnnotations(originalVC.getAttributes(), regenotypedVC);
|
||||||
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
|
|
||||||
final Map<String, Object> attrs = new HashMap<>(originalVC.getAttributes());
|
|
||||||
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
|
|
||||||
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, regenotypedVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
|
|
||||||
if (regenotypedVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY))
|
|
||||||
attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, regenotypedVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY));
|
|
||||||
|
|
||||||
result = new VariantContextBuilder(regenotypedVC).attributes(attrs).make();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// if it turned monomorphic then we either need to ignore or fix such sites
|
// if it turned monomorphic then we either need to ignore or fix such sites
|
||||||
|
|
@ -237,18 +230,47 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
|
||||||
createRefGTs = true;
|
createRefGTs = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// re-annotate it
|
// Re-annotate and fix/remove some of the original annotations.
|
||||||
|
// Note that the order of these actions matters and is different for polymorphic and monomorphic sites.
|
||||||
|
// For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later.
|
||||||
|
// For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
|
||||||
|
// We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes.
|
||||||
|
if ( createRefGTs ) {
|
||||||
|
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
|
||||||
result = annotationEngine.annotateContext(tracker, ref, null, result);
|
result = annotationEngine.annotateContext(tracker, ref, null, result);
|
||||||
|
} else {
|
||||||
// fix some of the annotations
|
result = annotationEngine.annotateContext(tracker, ref, null, result);
|
||||||
return new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, createRefGTs)).make();
|
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, false)).make();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add genotyping-based annotations to the new VC
|
||||||
|
*
|
||||||
|
* @param originalAttributes the non-null annotations from the original VC
|
||||||
|
* @param newVC the new non-null VC
|
||||||
|
* @return a non-null VC
|
||||||
|
*/
|
||||||
|
private VariantContext addGenotypingAnnotations(final Map<String, Object> originalAttributes, final VariantContext newVC) {
|
||||||
|
// we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations
|
||||||
|
final Map<String, Object> attrs = new HashMap<>(originalAttributes);
|
||||||
|
attrs.put(VCFConstants.MLE_ALLELE_COUNT_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
|
||||||
|
attrs.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, newVC.getAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY));
|
||||||
|
if (newVC.hasAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY))
|
||||||
|
attrs.put(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, newVC.getAttribute(GenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY));
|
||||||
|
|
||||||
|
return new VariantContextBuilder(newVC).attributes(attrs).make();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleans up genotype-level annotations that need to be updated.
|
* Cleans up genotype-level annotations that need to be updated.
|
||||||
* 1. move MIN_DP to DP if present
|
* 1. move MIN_DP to DP if present
|
||||||
* 2. propagate DP to AD if not present
|
* 2. propagate DP to AD if not present
|
||||||
* 3. remove SB if present
|
* 3. remove SB if present
|
||||||
|
* 4. change the PGT value from "0|1" to "1|1" for homozygous variant genotypes
|
||||||
*
|
*
|
||||||
* @param VC the VariantContext with the Genotypes to fix
|
* @param VC the VariantContext with the Genotypes to fix
|
||||||
* @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic
|
* @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic
|
||||||
|
|
@ -273,6 +295,11 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
|
||||||
// remove SB
|
// remove SB
|
||||||
attrs.remove("SB");
|
attrs.remove("SB");
|
||||||
|
|
||||||
|
// update PGT for hom vars
|
||||||
|
if ( oldGT.isHomVar() && oldGT.hasExtendedAttribute(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_GT_KEY) ) {
|
||||||
|
attrs.put(HaplotypeCaller.HAPLOTYPE_CALLER_PHASING_GT_KEY, "1|1");
|
||||||
|
}
|
||||||
|
|
||||||
// create AD if it's not there
|
// create AD if it's not there
|
||||||
if ( !oldGT.hasAD() && VC.isVariant() ) {
|
if ( !oldGT.hasAD() && VC.isVariant() ) {
|
||||||
final int[] AD = new int[VC.getNAlleles()];
|
final int[] AD = new int[VC.getNAlleles()];
|
||||||
|
|
|
||||||
|
|
@ -86,8 +86,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||||
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "9db87ae56df22456f3c08024384f3e5e"});
|
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "9db87ae56df22456f3c08024384f3e5e"});
|
||||||
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "90dc8950e28e042097817d785022482e"});
|
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ad0b6274991d451a6491f3118f0e7b2b"});
|
||||||
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a64e2d8e60a1be9cb0b4306c85e96393"});
|
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "956938fd1b5c7bf1b717326a7dd46d91"});
|
||||||
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "d5c07fa3edca496a84fd17cecad06230"});
|
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "d5c07fa3edca496a84fd17cecad06230"});
|
||||||
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "66019a0914f905522da6bd3b557a57d1"});
|
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "66019a0914f905522da6bd3b557a57d1"});
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -479,11 +479,14 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
|
||||||
haplotypeMap.put(vc4, haplotypes24);
|
haplotypeMap.put(vc4, haplotypes24);
|
||||||
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 1});
|
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 1});
|
||||||
|
|
||||||
|
// test 2 hets
|
||||||
|
haplotypeMap.remove(vc3);
|
||||||
|
tests.add(new Object[]{Arrays.asList(vc2, vc4), new HashMap<>(haplotypeMap), 1, 2, 1, 2, 0});
|
||||||
|
|
||||||
// test 2 with opposite phase
|
// test 2 with opposite phase
|
||||||
final Set<Haplotype> haplotypes1 = new HashSet<>();
|
final Set<Haplotype> haplotypes1 = new HashSet<>();
|
||||||
haplotypes1.add(pos1);
|
haplotypes1.add(pos1);
|
||||||
haplotypeMap.put(vc1, haplotypes1);
|
haplotypeMap.put(vc1, haplotypes1);
|
||||||
haplotypeMap.remove(vc3);
|
|
||||||
tests.add(new Object[]{Arrays.asList(vc1, vc2, vc4), new HashMap<>(haplotypeMap), 2, 3, 1, 1, 2});
|
tests.add(new Object[]{Arrays.asList(vc1, vc2, vc4), new HashMap<>(haplotypeMap), 2, 3, 1, 1, 2});
|
||||||
|
|
||||||
// test homs around a het
|
// test homs around a het
|
||||||
|
|
@ -498,7 +501,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
|
||||||
haplotypeMap.put(vc2, haplotypes2hom);
|
haplotypeMap.put(vc2, haplotypes2hom);
|
||||||
haplotypeMap.put(vc3, haplotypes3het);
|
haplotypeMap.put(vc3, haplotypes3het);
|
||||||
haplotypeMap.put(vc4, haplotypes4hom);
|
haplotypeMap.put(vc4, haplotypes4hom);
|
||||||
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 1, 0});
|
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0});
|
||||||
|
|
||||||
// test hets around a hom
|
// test hets around a hom
|
||||||
final Set<Haplotype> haplotypes2het = new HashSet<>();
|
final Set<Haplotype> haplotypes2het = new HashSet<>();
|
||||||
|
|
@ -511,7 +514,7 @@ public class HaplotypeCallerGenotypingEngineUnitTest extends BaseTest {
|
||||||
haplotypeMap.put(vc2, haplotypes2het);
|
haplotypeMap.put(vc2, haplotypes2het);
|
||||||
haplotypeMap.put(vc3, haplotypes3hom);
|
haplotypeMap.put(vc3, haplotypes3hom);
|
||||||
haplotypeMap.put(vc4, haplotypes4het);
|
haplotypeMap.put(vc4, haplotypes4het);
|
||||||
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 0});
|
tests.add(new Object[]{calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0});
|
||||||
|
|
||||||
// test no phased variants around a hom
|
// test no phased variants around a hom
|
||||||
final Set<Haplotype> haplotypes2incomplete = new HashSet<>();
|
final Set<Haplotype> haplotypes2incomplete = new HashSet<>();
|
||||||
|
|
|
||||||
|
|
@ -61,6 +61,15 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
|
||||||
return baseTestString(" -V " + privateTestDir + "gvcf.basepairResolution.vcf " + args, b37KGReference);
|
return baseTestString(" -V " + privateTestDir + "gvcf.basepairResolution.vcf " + args, b37KGReference);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test(enabled = true)
|
||||||
|
public void testUpdatePGT() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference),
|
||||||
|
1,
|
||||||
|
Arrays.asList("bc3d3eff337836af245b81f52b94d70c"));
|
||||||
|
executeTest("testUpdatePGT", spec);
|
||||||
|
}
|
||||||
|
|
||||||
@Test(enabled = true)
|
@Test(enabled = true)
|
||||||
public void combineSingleSamplePipelineGVCF() {
|
public void combineSingleSamplePipelineGVCF() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
|
@ -81,7 +90,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
|
||||||
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
|
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
|
||||||
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
|
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
|
||||||
1,
|
1,
|
||||||
Arrays.asList("f1dd2b9ab422c0b806392464e466ed91"));
|
Arrays.asList("4193e7c4e6fc889b4aa5a326899b1a4e"));
|
||||||
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
|
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue