From f6025d25aeabd7c52ef89c6202438f0a40199ee1 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 16 May 2013 10:04:11 -0400 Subject: [PATCH] Feature requested by Reich lab and Paavo lab in Leipzig for ancient DNA processing: -- When doing cross-species comparisons and studying population history and ancient DNA data, having SOME measure of confidence is needed at every single site that doesn't depend on the reference base, even in a naive per-site SNP mode. Old versions of GATK provided GQ and some wrong PL values at reference sites but these were wrong. This commit addresses this need by adding a new UG command line argument, -allSitePLs, that, if enabled will: a) Emit all 3 ALT snp alleles in the ALT column. b) Emit all corresponding 10 PL values. It's up to the user to process these PL values downstream to make sense of these. Note that, in order to follow VCF spec, the QUAL field in a reference call when there are non-null ALT alleles present will be zero, so QUAL will be useless and filtering will need to be done based on other fields. -- Tweaks and fixes to processing pipelines for Reich lab. --- .../SNPGenotypeLikelihoodsCalculationModel.java | 12 ++++++++++-- .../genotyper/UnifiedArgumentCollection.java | 17 ++++++++++++++++- .../genotyper/UnifiedGenotyperEngine.java | 12 ++++++++++-- .../UnifiedGenotyperIntegrationTest.java | 8 ++++++++ .../picard/CollectGcBiasMetrics.scala | 3 +-- 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index ce5f94478..360f88e51 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -147,9 +147,17 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // if we only want variants, then we don't need to calculate genotype likelihoods if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) return builder.make(); + // if user requires all PLs at all sites, add all possible alt alleles + else if (UAC.annotateAllSitesWithPLs) { + for ( final byte base : BaseUtils.BASES ) { + if ( base != refBase ) + alleles.add(Allele.create(base)); + } + } - // otherwise, choose any alternate allele (it doesn't really matter) - alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0))); + else + // otherwise, choose any alternate allele (it doesn't really matter) + alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0))); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index e346b10b7..b96b5733f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -52,6 +52,9 @@ import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.VariantContext; +import java.util.Collections; +import java.util.List; + public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) @@ -95,6 +98,18 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; + /** + * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites. + * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any). + * WARNINGS: + * - This feature will inflate VCF file size considerably. + * - All SNP ALT alleles will be emitted with corresponding 10 PL values. + * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used + */ + @Advanced + @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false) + public boolean annotateAllSitesWithPLs = false; + // indel-related arguments /** * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site. @@ -247,7 +262,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; this.pairHMM = uac.pairHMM; - + this.annotateAllSitesWithPLs = uac.annotateAllSitesWithPLs; // todo- arguments to remove this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 3d9f75d45..9f3368cf8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -168,6 +168,13 @@ public class UnifiedGenotyperEngine { filter.add(LOW_QUAL_FILTER_NAME); determineGLModelsToUse(); + + // do argument checking + if (UAC.annotateAllSitesWithPLs) { + if (!modelsToUse.contains(GenotypeLikelihoodsCalculationModel.Model.SNP)) + throw new IllegalArgumentException("Invalid genotype likelihood model specification: Only diploid SNP model can be used in conjunction with option allSitePLs"); + + } } /** @@ -439,7 +446,8 @@ public class UnifiedGenotyperEngine { bestGuessIsRef = false; } // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele - else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || + UAC.annotateAllSitesWithPLs) { myAlleles.add(alternateAllele); alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); } @@ -449,7 +457,7 @@ public class UnifiedGenotyperEngine { // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice final double phredScaledConfidence = - Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES + Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || UAC.annotateAllSitesWithPLs ? -10 * AFresult.getLog10PosteriorOfAFEq0() : -10 * AFresult.getLog10PosteriorOfAFGT0()); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 300d7f5da..3eb9b4e1c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -156,6 +156,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { } + @Test + public void emitPLsAtAllSites() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1, + Arrays.asList("7cc55db8693759e059a05bc4398f6f69")); + executeTest("test all site PLs 1", spec1); + + } // -------------------------------------------------------------------------------------------------------------- // // testing heterozygosity diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala index 5d887016e..7c4c3f26a 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala @@ -52,6 +52,5 @@ class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaC override def commandLine = super.commandLine + required("SUMMARY_OUTPUT=" + output) + required("CHART_OUTPUT=" + output+".pdf") + - required("REFERENCE_SEQUENCE=" + reference) + - required("ASSUME_SORTED=true") + required("REFERENCE_SEQUENCE=" + reference) }