Merge pull request #1133 from broadinstitute/ldg_fixInputPriors

Make sure inputPriors get used everywhere if they are specified
2015-09-12 11:24:31 -04:00 · 2015-09-12 11:24:31 -04:00 · 3db0686dc4
parent 5fe568c664 53b506a0b8
commit 3db0686dc4
6 changed files with 52 additions and 61 deletions
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java
@ -70,22 +70,16 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
    /**
     * The expected heterozygosity value used to compute prior probability that a locus is non-reference.
     *
-     * The default priors are for provided for humans:
+     * From the heterozygosity we calculate the probability of N samples being hom-ref at a site as 1 - sum_i_2N (hets / i)
+     * where hets is this case is analogous to the parameter theta from population genetics. See https://en.wikipedia.org/wiki/Coalescent_theory for more details.
     *
-     * het = 1e-3
+     * Note that heterozygosity as used here is the population genetics concept. (See http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics. 
+     * We also suggest the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on the theory.) That is, a hets value of 0.001 
+     * implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp.
     *
-     * which means that the probability of N samples being hom-ref at a site is:
+     * The default priors provided for humans (hets = 1e-3)
     *
-     * 1 - sum_i_2N (het / i)
-     *
-     * Note that heterozygosity as used here is the population genetics concept:
-     *
-     * http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics
-     *
-     * That is, a hets value of 0.01 implies that two randomly chosen chromosomes from the population of organisms
-     * would differ from each other (one being A and the other B) at a rate of 1 in 100 bp.
-     *
-     * Note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype,
+     * Also note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype,
     * which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there
     * may be a AB het genotype.  The posterior probability of this AB genotype would use the het prior, but the GATK
     * only uses this posterior probability in determining the prob. that a site is polymorphic.  So changing the
@ -95,13 +89,13 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
     * The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy,
     * which determines how many chromosomes each individual in the species carries.
     */
-    @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus.  See the GATKDocs for full details on the meaning of this population genetics concept", required = false)
+    @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
    public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY;

    /**
     * This argument informs the prior probability of having an indel at a site.
     */
-    @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling.  See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept", required = false)
+    @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
    public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY;

    /**
@ -135,12 +129,13 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
     * see e.g. Waterson (1975) or Tajima (1996).
     * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N
     *
-     * There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate,
+     * There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate,
     * as for example when the ancestral status of the reference allele is not known.
-     * By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns:
+     * By using this argument, the user can manually specify a list of probabilities for each AC>1 to be used as priors for genotyping,
+     * with the following restrictions:
     * a) User must specify 2N values, where N is the number of samples.
     * b) Only diploid calls supported.
-     * c) Probability values are specified in double format, in linear space.
+     * c) Probability values are specified in Double format, in linear space (not log10 space or Phred-scale).
     * d) No negative values allowed.
     * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
     * f) If user-defined values add to more than one, an error will be produced.
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java
@ -238,10 +238,13 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
        final double phredScaledConfidence = (-10.0 * log10Confidence) + 0.0;

        // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
-        if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission())
+        if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission()) {
            // technically, at this point our confidence in a reference call isn't accurately estimated
            //  because it didn't take into account samples with no data, so let's get a better estimate
-            return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getModelTheta(model), true, PoFGT0);
+            double[] AFpriors = getAlleleFrequencyPriors(vc, defaultPloidy, model);
+            final int INDEX_FOR_AC_EQUALS_1 = 1;
+            return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, AFpriors[INDEX_FOR_AC_EQUALS_1], true, PoFGT0);
+        }

        // start constructing the resulting VC
        final GenomeLocParser genomeLocParser = this.genomeLocParser != null || refContext == null ? this.genomeLocParser : refContext.getGenomeLocParser();
@ -375,25 +378,6 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
                        && QualityUtils.phredScaleErrorRate(PofF) >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING);
    }

-    /**
-     * Based in the model used, returns the appropriate heterozygosity argument value.
-     * @param model genotyping model.
-     *
-     * @return a valid heterozygosity in (0,1).
-     */
-    private double getModelTheta(final GenotypeLikelihoodsCalculationModel.Model model) {
-        switch (model) {
-            case SNP:
-            case GENERALPLOIDYSNP:
-                return configuration.genotypeArgs.snpHeterozygosity;
-            case INDEL:
-            case GENERALPLOIDYINDEL:
-                return configuration.genotypeArgs.indelHeterozygosity;
-            default:
-                throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
-        }
-    }
-

    /**
     * Checks whether the variant context has too many alternative alleles for progress to genotyping the site.
@ -473,7 +457,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
     */
    protected abstract boolean forceSiteEmission();

-    protected final VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
+    protected final VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double log10OfTheta, boolean ignoreCoveredSamples, double initialPofRef) {
        if ( contexts == null )
            return null;

@ -487,7 +471,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
            if ( ignoreCoveredSamples && context != null )
                continue;
            final int depth = context == null ? 0 : context.getBasePileup().depthOfCoverage();
-            log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta);
+            log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, log10OfTheta);
        }

        return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING, false);
@ -523,16 +507,13 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
     * Assumes the sample is diploid
     *
     * @param depth the depth of the sample
-     * @param theta the heterozygosity of this species (between 0 and 1)
-     *
-     * @throws IllegalArgumentException if {@code depth} is less than 0 or {@code theta} is not in the (0,1) range.
+     * @param log10OfTheta the heterozygosity of this species (in log10-space)
     *
     * @return a valid log10 probability of the sample being hom-ref
     */
    @Ensures("MathUtils.goodLog10Probability(result)")
-    protected final double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) {
-        if (theta <= 0 || theta >= 1) throw new IllegalArgumentException("theta must be greater than 0 and less than 1");
-        final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth);
+    protected final double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double log10OfTheta) {
+        final double log10PofNonRef = log10OfTheta + getRefBinomialProbLog10(depth);
        return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef));
    }

@ -558,14 +539,19 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
     * @param heterozygosity                   default heterozygosity to use, if inputPriors is empty
     * @param inputPriors                      Input priors to use (in which case heterozygosity is ignored)
     *
+     * @throws IllegalArgumentException if {@code inputPriors} has size != {@code N} or any entry in {@code inputPriors} is not in the (0,1) range.
+     *
     * @return never {@code null}.
     */
-    private static AFPriorProvider composeAlleleFrequencyPriorProvider(final int N, final double heterozygosity, final List<Double> inputPriors) {
+    public static AFPriorProvider composeAlleleFrequencyPriorProvider(final int N, final double heterozygosity, final List<Double> inputPriors) {

        if (!inputPriors.isEmpty()) {
            // user-specified priors
            if (inputPriors.size() != N)
                throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
+            for (final Double prior : inputPriors) {
+                if (prior <= 0 || prior >= 1) throw new UserException.BadArgumentValue("inputPrior","inputPrior vector values must be greater than 0 and less than 1");
+            }
            return new CustomAFPriorProvider(inputPriors);
        }
        else
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorTestBuilder.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorTestBuilder.java
@ -52,7 +52,9 @@
 package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;

 import org.apache.commons.lang.ArrayUtils;
+import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider;
 import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine;
+import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine;
 import org.broadinstitute.gatk.utils.MathUtils;
 import org.broadinstitute.gatk.utils.Utils;
 import htsjdk.variant.variantcontext.*;
@ -111,6 +113,7 @@ public class AFCalculatorTestBuilder {

    public double[] makePriors() {
        final int nPriorValues = 2*nSamples+1;
+        final double human_theta = 0.001;

        switch ( priorType ) {
            case flat:
@ -118,8 +121,9 @@ public class AFCalculatorTestBuilder {

            //TODO break dependency with human... avoid special reference to this species.
            case human:
-                final double[] humanPriors = new double[nPriorValues];
-                GenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
+
+                final AFPriorProvider log10priorProvider = GenotypingEngine.composeAlleleFrequencyPriorProvider(2*nSamples, human_theta, new ArrayList<Double>());
+                final double[] humanPriors = log10priorProvider.forTotalPloidy(2*nSamples);
                return humanPriors;
            default:
                throw new RuntimeException("Unexpected type " + priorType);
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java
@ -99,12 +99,13 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest {
        final List<Object[]> tests = new ArrayList<>();

        // this functionality can be adapted to provide input data for whatever you might want in your data
+        //Note that this copies code from GenotypingEngine::estimateLog10ReferenceConfidenceForOneSample to provide expected values
        final double p = Math.log10(0.5);
-        for ( final double theta : Arrays.asList(0.1, 0.01, 0.001) ) {
+        for ( final double log10ofTheta : Arrays.asList(0.0, -1.0, -2.0, -3.0) ) {
            for ( final int depth : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) {
-                final double log10PofNonRef = Math.log10(theta / 2.0) + MathUtils.log10BinomialProbability(depth, 0, p);
+                final double log10PofNonRef = log10ofTheta + MathUtils.log10BinomialProbability(depth, 0, p);
                final double log10POfRef = MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef));
-                tests.add(new Object[]{depth, theta, log10POfRef});
+                tests.add(new Object[]{depth, log10ofTheta, log10POfRef});
            }
        }

@ -112,8 +113,8 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest {
    }

    @Test(dataProvider = "ReferenceQualityCalculation")
-    public void testReferenceQualityCalculation(final int depth, final double theta, final double expected) {
-        final double ref = getEngine().estimateLog10ReferenceConfidenceForOneSample(depth, theta);
+    public void testReferenceQualityCalculation(final int depth, final double log10ofTheta, final double expected) {
+        final double ref = getEngine().estimateLog10ReferenceConfidenceForOneSample(depth, log10ofTheta);
        Assert.assertTrue(MathUtils.goodLog10Probability(ref), "Reference calculation wasn't a well formed log10 prob " + ref);
        Assert.assertEquals(ref, expected, TOLERANCE, "Failed reference confidence for single sample");
    }
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationUnitTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationUnitTest.java
@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;

 import htsjdk.variant.variantcontext.*;
 import org.apache.commons.lang.ArrayUtils;
+import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider;
 import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine;
 import org.broadinstitute.gatk.utils.BaseTest;
 import org.broadinstitute.gatk.utils.MathUtils;
@ -197,10 +198,14 @@ public class AFCalculationUnitTest extends BaseTest {
        for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
            List<AFCalculator> calcs = createAFCalculators(Arrays.asList(AFCalculatorImplementation.values()), MAX_ALT_ALLELES, PLOIDY);

+            //number of entries in the priors array, one for AC=[0,2*nSamples]
            final int nPriorValues = 2*nSamples+1;
+            //total number of chromosomes in our samples -- here we're assuming diploid
+            final int totalPloidy = 2*nSamples;
+            final double theta = 0.001;
            final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true);  // flat priors
-            final double[] humanPriors = new double[nPriorValues];
-            UnifiedGenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
+            final AFPriorProvider log10priorProvider = UnifiedGenotypingEngine.composeAlleleFrequencyPriorProvider(totalPloidy, theta, new ArrayList<Double>());
+            final double[] humanPriors = log10priorProvider.forTotalPloidy(totalPloidy);

            for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
                for ( AFCalculator model : calcs ) {
@ -609,16 +614,16 @@ public class AFCalculationUnitTest extends BaseTest {
            final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);

            final double[] flatPriors = new double[]{0.0,0.0,0.0};
-            final double[] noPriors = new double[3];
            // test that function computeAlleleFrequency correctly operates when the flat prior option is set
            // computeAlleleFrequencyPriors takes linear priors
            final ArrayList<Double> inputPrior = new ArrayList<Double>();
            inputPrior.add(1.0/3);
            inputPrior.add(1.0/3);
-            UnifiedGenotypingEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0, inputPrior);
+            final AFPriorProvider log10priorProvider = UnifiedGenotypingEngine.composeAlleleFrequencyPriorProvider(2, 0.0, inputPrior);
+            final double[] noPriors = log10priorProvider.forTotalPloidy(2);

            GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
-            GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
+            GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), noPriors, "noPrior");
            final AFCalculationResult resultTrackerFlat = cfgFlatPrior.execute();
            final AFCalculationResult resultTrackerNoPrior = cfgNoPrior.execute();

--- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
+++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java
@ -149,7 +149,7 @@ public class GATKVariantContextUtils {
    }

    /**
-     * Calculates the total ploidy of a variant context as the sum of all plodies across genotypes.
+     * Calculates the total ploidy of a variant context as the sum of all ploidies across genotypes.
     * @param vc the target variant context.
     * @param defaultPloidy the default ploidy to be assume when there is no ploidy information for a genotype.
     * @return never {@code null}.