diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index 3ffefcf0e..c036ce740 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -70,22 +70,16 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ /** * The expected heterozygosity value used to compute prior probability that a locus is non-reference. * - * The default priors are for provided for humans: + * From the heterozygosity we calculate the probability of N samples being hom-ref at a site as 1 - sum_i_2N (hets / i) + * where hets is this case is analogous to the parameter theta from population genetics. See https://en.wikipedia.org/wiki/Coalescent_theory for more details. * - * het = 1e-3 + * Note that heterozygosity as used here is the population genetics concept. (See http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics. + * We also suggest the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on the theory.) That is, a hets value of 0.001 + * implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp. * - * which means that the probability of N samples being hom-ref at a site is: + * The default priors provided for humans (hets = 1e-3) * - * 1 - sum_i_2N (het / i) - * - * Note that heterozygosity as used here is the population genetics concept: - * - * http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics - * - * That is, a hets value of 0.01 implies that two randomly chosen chromosomes from the population of organisms - * would differ from each other (one being A and the other B) at a rate of 1 in 100 bp. - * - * Note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype, + * Also note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype, * which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there * may be a AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK * only uses this posterior probability in determining the prob. that a site is polymorphic. So changing the @@ -95,13 +89,13 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ * The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy, * which determines how many chromosomes each individual in the species carries. */ - @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept", required = false) + @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY; /** * This argument informs the prior probability of having an indel at a site. */ - @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept", required = false) + @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY; /** @@ -135,12 +129,13 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ * see e.g. Waterson (1975) or Tajima (1996). * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N * - * There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate, + * There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate, * as for example when the ancestral status of the reference allele is not known. - * By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns: + * By using this argument, the user can manually specify a list of probabilities for each AC>1 to be used as priors for genotyping, + * with the following restrictions: * a) User must specify 2N values, where N is the number of samples. * b) Only diploid calls supported. - * c) Probability values are specified in double format, in linear space. + * c) Probability values are specified in Double format, in linear space (not log10 space or Phred-scale). * d) No negative values allowed. * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one. * f) If user-defined values add to more than one, an error will be produced. diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index ab6bc3b77..c3f44a1f0 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -238,10 +238,13 @@ public abstract class GenotypingEngine= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING); } - /** - * Based in the model used, returns the appropriate heterozygosity argument value. - * @param model genotyping model. - * - * @return a valid heterozygosity in (0,1). - */ - private double getModelTheta(final GenotypeLikelihoodsCalculationModel.Model model) { - switch (model) { - case SNP: - case GENERALPLOIDYSNP: - return configuration.genotypeArgs.snpHeterozygosity; - case INDEL: - case GENERALPLOIDYINDEL: - return configuration.genotypeArgs.indelHeterozygosity; - default: - throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model); - } - } - /** * Checks whether the variant context has too many alternative alleles for progress to genotyping the site. @@ -473,7 +457,7 @@ public abstract class GenotypingEngine contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { + protected final VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double log10OfTheta, boolean ignoreCoveredSamples, double initialPofRef) { if ( contexts == null ) return null; @@ -487,7 +471,7 @@ public abstract class GenotypingEngine= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING, false); @@ -523,16 +507,13 @@ public abstract class GenotypingEngine= 1) throw new IllegalArgumentException("theta must be greater than 0 and less than 1"); - final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth); + protected final double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double log10OfTheta) { + final double log10PofNonRef = log10OfTheta + getRefBinomialProbLog10(depth); return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef)); } @@ -558,14 +539,19 @@ public abstract class GenotypingEngine inputPriors) { + public static AFPriorProvider composeAlleleFrequencyPriorProvider(final int N, final double heterozygosity, final List inputPriors) { if (!inputPriors.isEmpty()) { // user-specified priors if (inputPriors.size() != N) throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); + for (final Double prior : inputPriors) { + if (prior <= 0 || prior >= 1) throw new UserException.BadArgumentValue("inputPrior","inputPrior vector values must be greater than 0 and less than 1"); + } return new CustomAFPriorProvider(inputPriors); } else diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorTestBuilder.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorTestBuilder.java index 21cb74d40..d5abda74b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorTestBuilder.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorTestBuilder.java @@ -52,7 +52,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine; +import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; import htsjdk.variant.variantcontext.*; @@ -111,6 +113,7 @@ public class AFCalculatorTestBuilder { public double[] makePriors() { final int nPriorValues = 2*nSamples+1; + final double human_theta = 0.001; switch ( priorType ) { case flat: @@ -118,8 +121,9 @@ public class AFCalculatorTestBuilder { //TODO break dependency with human... avoid special reference to this species. case human: - final double[] humanPriors = new double[nPriorValues]; - GenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); + + final AFPriorProvider log10priorProvider = GenotypingEngine.composeAlleleFrequencyPriorProvider(2*nSamples, human_theta, new ArrayList()); + final double[] humanPriors = log10priorProvider.forTotalPloidy(2*nSamples); return humanPriors; default: throw new RuntimeException("Unexpected type " + priorType); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java index a2c0e06ce..61cb79428 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java @@ -99,12 +99,13 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest { final List tests = new ArrayList<>(); // this functionality can be adapted to provide input data for whatever you might want in your data + //Note that this copies code from GenotypingEngine::estimateLog10ReferenceConfidenceForOneSample to provide expected values final double p = Math.log10(0.5); - for ( final double theta : Arrays.asList(0.1, 0.01, 0.001) ) { + for ( final double log10ofTheta : Arrays.asList(0.0, -1.0, -2.0, -3.0) ) { for ( final int depth : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) { - final double log10PofNonRef = Math.log10(theta / 2.0) + MathUtils.log10BinomialProbability(depth, 0, p); + final double log10PofNonRef = log10ofTheta + MathUtils.log10BinomialProbability(depth, 0, p); final double log10POfRef = MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef)); - tests.add(new Object[]{depth, theta, log10POfRef}); + tests.add(new Object[]{depth, log10ofTheta, log10POfRef}); } } @@ -112,8 +113,8 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest { } @Test(dataProvider = "ReferenceQualityCalculation") - public void testReferenceQualityCalculation(final int depth, final double theta, final double expected) { - final double ref = getEngine().estimateLog10ReferenceConfidenceForOneSample(depth, theta); + public void testReferenceQualityCalculation(final int depth, final double log10ofTheta, final double expected) { + final double ref = getEngine().estimateLog10ReferenceConfidenceForOneSample(depth, log10ofTheta); Assert.assertTrue(MathUtils.goodLog10Probability(ref), "Reference calculation wasn't a well formed log10 prob " + ref); Assert.assertEquals(ref, expected, TOLERANCE, "Failed reference confidence for single sample"); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationUnitTest.java index 65529093c..eba969d90 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationUnitTest.java @@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import htsjdk.variant.variantcontext.*; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider; import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.MathUtils; @@ -197,10 +198,14 @@ public class AFCalculationUnitTest extends BaseTest { for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { List calcs = createAFCalculators(Arrays.asList(AFCalculatorImplementation.values()), MAX_ALT_ALLELES, PLOIDY); + //number of entries in the priors array, one for AC=[0,2*nSamples] final int nPriorValues = 2*nSamples+1; + //total number of chromosomes in our samples -- here we're assuming diploid + final int totalPloidy = 2*nSamples; + final double theta = 0.001; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors - final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); + final AFPriorProvider log10priorProvider = UnifiedGenotypingEngine.composeAlleleFrequencyPriorProvider(totalPloidy, theta, new ArrayList()); + final double[] humanPriors = log10priorProvider.forTotalPloidy(totalPloidy); for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { for ( AFCalculator model : calcs ) { @@ -609,16 +614,16 @@ public class AFCalculationUnitTest extends BaseTest { final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000); final double[] flatPriors = new double[]{0.0,0.0,0.0}; - final double[] noPriors = new double[3]; // test that function computeAlleleFrequency correctly operates when the flat prior option is set // computeAlleleFrequencyPriors takes linear priors final ArrayList inputPrior = new ArrayList(); inputPrior.add(1.0/3); inputPrior.add(1.0/3); - UnifiedGenotypingEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0, inputPrior); + final AFPriorProvider log10priorProvider = UnifiedGenotypingEngine.composeAlleleFrequencyPriorProvider(2, 0.0, inputPrior); + final double[] noPriors = log10priorProvider.forTotalPloidy(2); GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior"); - GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior"); + GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), noPriors, "noPrior"); final AFCalculationResult resultTrackerFlat = cfgFlatPrior.execute(); final AFCalculationResult resultTrackerNoPrior = cfgNoPrior.execute(); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java index 3e4861cd9..e314d95c6 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -149,7 +149,7 @@ public class GATKVariantContextUtils { } /** - * Calculates the total ploidy of a variant context as the sum of all plodies across genotypes. + * Calculates the total ploidy of a variant context as the sum of all ploidies across genotypes. * @param vc the target variant context. * @param defaultPloidy the default ploidy to be assume when there is no ploidy information for a genotype. * @return never {@code null}.