From 34d11fe40c91a4659b1524857755e9bf36325849 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 28 Jan 2014 09:11:44 -0500 Subject: [PATCH] Added the consensus mode used for the 1000 Genomes Project to the HaplotypeCaller. -- All the provided alleles are added to the assembly graph as potential haplotypes but they aren't forcibly genotyped like in GGA mode. -- Added integration test for this mode --- .../walkers/haplotypecaller/HaplotypeCaller.java | 13 +++++++++++-- ...erComplexAndSymbolicVariantsIntegrationTest.java | 13 +++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 56f12d020..b6eddab51 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -289,6 +289,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) protected boolean dontRecoverDanglingTails = false; + @Advanced + @Argument(fullName="consensus", shortName="consensus", doc="In 1000G consensus mode. Inject all provided alleles to the assembly graph but don't forcibly genotype all of them.", required = false) + protected boolean consensusMode = false; + // ----------------------------------------------------------------------------------------------- // general advanced arguments to control haplotype caller behavior // ----------------------------------------------------------------------------------------------- @@ -575,7 +579,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine - UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES + UAC.OutputMode = SCAC.GenotypingMode.equals(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); @@ -598,6 +602,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); } + if( SCAC.GenotypingMode.equals(GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) && consensusMode ) { + throw new UserException("HaplotypeCaller cannot be run in both GENOTYPE_GIVEN_ALLELES mode and in consensus mode. Please choose one or the other."); + } + // initialize the output VCF header final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); @@ -878,7 +886,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In regionForGenotyping.getLocation(), getToolkit().getGenomeLocParser(), metaDataTracker, - activeAllelesToGenotype, emitReferenceConfidence() ); + ( consensusMode ? Collections.emptyList() : activeAllelesToGenotype ), + emitReferenceConfidence() ); // TODO -- must disable if we are doing NCT, or set the output type of ! presorted if ( bamWriter != null ) { diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 2838648d5..ac0f5671c 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -96,4 +96,17 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", "f50e0b35e2240b19b1b8b6dfa0cf9796"); } + + private void HCTestComplexConsensusMode(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -consensus -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -alleles " + validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerComplexConsensusMode: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerMultiSampleConsensusModeComplex() { + HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538 -L 20:133041-133161 -L 20:300207-300337", + "21e521d51b826450d348e5201684ffe4"); + } + }