Make sure inputPriors get used if they are specified

Fix usage of AF prior (i.e. theta) in probability of non-reference calculation
Refactored duplicate functions
Updated docs for heterozygosity
This commit is contained in:
Laura Gauthier 2015-08-07 14:56:49 -04:00
parent 5fe568c664
commit 53b506a0b8
6 changed files with 52 additions and 61 deletions

View File

@ -70,22 +70,16 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
/**
* The expected heterozygosity value used to compute prior probability that a locus is non-reference.
*
* The default priors are for provided for humans:
* From the heterozygosity we calculate the probability of N samples being hom-ref at a site as 1 - sum_i_2N (hets / i)
* where hets is this case is analogous to the parameter theta from population genetics. See https://en.wikipedia.org/wiki/Coalescent_theory for more details.
*
* het = 1e-3
* Note that heterozygosity as used here is the population genetics concept. (See http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics.
* We also suggest the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on the theory.) That is, a hets value of 0.001
* implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp.
*
* which means that the probability of N samples being hom-ref at a site is:
* The default priors provided for humans (hets = 1e-3)
*
* 1 - sum_i_2N (het / i)
*
* Note that heterozygosity as used here is the population genetics concept:
*
* http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics
*
* That is, a hets value of 0.01 implies that two randomly chosen chromosomes from the population of organisms
* would differ from each other (one being A and the other B) at a rate of 1 in 100 bp.
*
* Note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype,
* Also note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype,
* which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there
* may be a AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK
* only uses this posterior probability in determining the prob. that a site is polymorphic. So changing the
@ -95,13 +89,13 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
* The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy,
* which determines how many chromosomes each individual in the species carries.
*/
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept", required = false)
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY;
/**
* This argument informs the prior probability of having an indel at a site.
*/
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept", required = false)
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY;
/**
@ -135,12 +129,13 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
* see e.g. Waterson (1975) or Tajima (1996).
* This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N
*
* There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate,
* There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate,
* as for example when the ancestral status of the reference allele is not known.
* By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns:
* By using this argument, the user can manually specify a list of probabilities for each AC>1 to be used as priors for genotyping,
* with the following restrictions:
* a) User must specify 2N values, where N is the number of samples.
* b) Only diploid calls supported.
* c) Probability values are specified in double format, in linear space.
* c) Probability values are specified in Double format, in linear space (not log10 space or Phred-scale).
* d) No negative values allowed.
* e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
* f) If user-defined values add to more than one, an error will be produced.

View File

@ -238,10 +238,13 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
final double phredScaledConfidence = (-10.0 * log10Confidence) + 0.0;
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission())
if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission()) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getModelTheta(model), true, PoFGT0);
double[] AFpriors = getAlleleFrequencyPriors(vc, defaultPloidy, model);
final int INDEX_FOR_AC_EQUALS_1 = 1;
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, AFpriors[INDEX_FOR_AC_EQUALS_1], true, PoFGT0);
}
// start constructing the resulting VC
final GenomeLocParser genomeLocParser = this.genomeLocParser != null || refContext == null ? this.genomeLocParser : refContext.getGenomeLocParser();
@ -375,25 +378,6 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
&& QualityUtils.phredScaleErrorRate(PofF) >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING);
}
/**
* Based in the model used, returns the appropriate heterozygosity argument value.
* @param model genotyping model.
*
* @return a valid heterozygosity in (0,1).
*/
private double getModelTheta(final GenotypeLikelihoodsCalculationModel.Model model) {
switch (model) {
case SNP:
case GENERALPLOIDYSNP:
return configuration.genotypeArgs.snpHeterozygosity;
case INDEL:
case GENERALPLOIDYINDEL:
return configuration.genotypeArgs.indelHeterozygosity;
default:
throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
}
}
/**
* Checks whether the variant context has too many alternative alleles for progress to genotyping the site.
@ -473,7 +457,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
*/
protected abstract boolean forceSiteEmission();
protected final VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
protected final VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double log10OfTheta, boolean ignoreCoveredSamples, double initialPofRef) {
if ( contexts == null )
return null;
@ -487,7 +471,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
if ( ignoreCoveredSamples && context != null )
continue;
final int depth = context == null ? 0 : context.getBasePileup().depthOfCoverage();
log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta);
log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, log10OfTheta);
}
return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING, false);
@ -523,16 +507,13 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
* Assumes the sample is diploid
*
* @param depth the depth of the sample
* @param theta the heterozygosity of this species (between 0 and 1)
*
* @throws IllegalArgumentException if {@code depth} is less than 0 or {@code theta} is not in the (0,1) range.
* @param log10OfTheta the heterozygosity of this species (in log10-space)
*
* @return a valid log10 probability of the sample being hom-ref
*/
@Ensures("MathUtils.goodLog10Probability(result)")
protected final double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) {
if (theta <= 0 || theta >= 1) throw new IllegalArgumentException("theta must be greater than 0 and less than 1");
final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth);
protected final double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double log10OfTheta) {
final double log10PofNonRef = log10OfTheta + getRefBinomialProbLog10(depth);
return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef));
}
@ -558,14 +539,19 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
* @param heterozygosity default heterozygosity to use, if inputPriors is empty
* @param inputPriors Input priors to use (in which case heterozygosity is ignored)
*
* @throws IllegalArgumentException if {@code inputPriors} has size != {@code N} or any entry in {@code inputPriors} is not in the (0,1) range.
*
* @return never {@code null}.
*/
private static AFPriorProvider composeAlleleFrequencyPriorProvider(final int N, final double heterozygosity, final List<Double> inputPriors) {
public static AFPriorProvider composeAlleleFrequencyPriorProvider(final int N, final double heterozygosity, final List<Double> inputPriors) {
if (!inputPriors.isEmpty()) {
// user-specified priors
if (inputPriors.size() != N)
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
for (final Double prior : inputPriors) {
if (prior <= 0 || prior >= 1) throw new UserException.BadArgumentValue("inputPrior","inputPrior vector values must be greater than 0 and less than 1");
}
return new CustomAFPriorProvider(inputPriors);
}
else

View File

@ -52,7 +52,9 @@
package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine;
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.Utils;
import htsjdk.variant.variantcontext.*;
@ -111,6 +113,7 @@ public class AFCalculatorTestBuilder {
public double[] makePriors() {
final int nPriorValues = 2*nSamples+1;
final double human_theta = 0.001;
switch ( priorType ) {
case flat:
@ -118,8 +121,9 @@ public class AFCalculatorTestBuilder {
//TODO break dependency with human... avoid special reference to this species.
case human:
final double[] humanPriors = new double[nPriorValues];
GenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
final AFPriorProvider log10priorProvider = GenotypingEngine.composeAlleleFrequencyPriorProvider(2*nSamples, human_theta, new ArrayList<Double>());
final double[] humanPriors = log10priorProvider.forTotalPloidy(2*nSamples);
return humanPriors;
default:
throw new RuntimeException("Unexpected type " + priorType);

View File

@ -99,12 +99,13 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest {
final List<Object[]> tests = new ArrayList<>();
// this functionality can be adapted to provide input data for whatever you might want in your data
//Note that this copies code from GenotypingEngine::estimateLog10ReferenceConfidenceForOneSample to provide expected values
final double p = Math.log10(0.5);
for ( final double theta : Arrays.asList(0.1, 0.01, 0.001) ) {
for ( final double log10ofTheta : Arrays.asList(0.0, -1.0, -2.0, -3.0) ) {
for ( final int depth : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) {
final double log10PofNonRef = Math.log10(theta / 2.0) + MathUtils.log10BinomialProbability(depth, 0, p);
final double log10PofNonRef = log10ofTheta + MathUtils.log10BinomialProbability(depth, 0, p);
final double log10POfRef = MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef));
tests.add(new Object[]{depth, theta, log10POfRef});
tests.add(new Object[]{depth, log10ofTheta, log10POfRef});
}
}
@ -112,8 +113,8 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest {
}
@Test(dataProvider = "ReferenceQualityCalculation")
public void testReferenceQualityCalculation(final int depth, final double theta, final double expected) {
final double ref = getEngine().estimateLog10ReferenceConfidenceForOneSample(depth, theta);
public void testReferenceQualityCalculation(final int depth, final double log10ofTheta, final double expected) {
final double ref = getEngine().estimateLog10ReferenceConfidenceForOneSample(depth, log10ofTheta);
Assert.assertTrue(MathUtils.goodLog10Probability(ref), "Reference calculation wasn't a well formed log10 prob " + ref);
Assert.assertEquals(ref, expected, TOLERANCE, "Failed reference confidence for single sample");
}

View File

@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import htsjdk.variant.variantcontext.*;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider;
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine;
import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.MathUtils;
@ -197,10 +198,14 @@ public class AFCalculationUnitTest extends BaseTest {
for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
List<AFCalculator> calcs = createAFCalculators(Arrays.asList(AFCalculatorImplementation.values()), MAX_ALT_ALLELES, PLOIDY);
//number of entries in the priors array, one for AC=[0,2*nSamples]
final int nPriorValues = 2*nSamples+1;
//total number of chromosomes in our samples -- here we're assuming diploid
final int totalPloidy = 2*nSamples;
final double theta = 0.001;
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
final AFPriorProvider log10priorProvider = UnifiedGenotypingEngine.composeAlleleFrequencyPriorProvider(totalPloidy, theta, new ArrayList<Double>());
final double[] humanPriors = log10priorProvider.forTotalPloidy(totalPloidy);
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
for ( AFCalculator model : calcs ) {
@ -609,16 +614,16 @@ public class AFCalculationUnitTest extends BaseTest {
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
final double[] flatPriors = new double[]{0.0,0.0,0.0};
final double[] noPriors = new double[3];
// test that function computeAlleleFrequency correctly operates when the flat prior option is set
// computeAlleleFrequencyPriors takes linear priors
final ArrayList<Double> inputPrior = new ArrayList<Double>();
inputPrior.add(1.0/3);
inputPrior.add(1.0/3);
UnifiedGenotypingEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0, inputPrior);
final AFPriorProvider log10priorProvider = UnifiedGenotypingEngine.composeAlleleFrequencyPriorProvider(2, 0.0, inputPrior);
final double[] noPriors = log10priorProvider.forTotalPloidy(2);
GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), noPriors, "noPrior");
final AFCalculationResult resultTrackerFlat = cfgFlatPrior.execute();
final AFCalculationResult resultTrackerNoPrior = cfgNoPrior.execute();

View File

@ -149,7 +149,7 @@ public class GATKVariantContextUtils {
}
/**
* Calculates the total ploidy of a variant context as the sum of all plodies across genotypes.
* Calculates the total ploidy of a variant context as the sum of all ploidies across genotypes.
* @param vc the target variant context.
* @param defaultPloidy the default ploidy to be assume when there is no ploidy information for a genotype.
* @return never {@code null}.