Merge pull request #1133 from broadinstitute/ldg_fixInputPriors
Make sure inputPriors get used everywhere if they are specified
This commit is contained in:
commit
3db0686dc4
|
|
@ -70,22 +70,16 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
|
|||
/**
|
||||
* The expected heterozygosity value used to compute prior probability that a locus is non-reference.
|
||||
*
|
||||
* The default priors are for provided for humans:
|
||||
* From the heterozygosity we calculate the probability of N samples being hom-ref at a site as 1 - sum_i_2N (hets / i)
|
||||
* where hets is this case is analogous to the parameter theta from population genetics. See https://en.wikipedia.org/wiki/Coalescent_theory for more details.
|
||||
*
|
||||
* het = 1e-3
|
||||
* Note that heterozygosity as used here is the population genetics concept. (See http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics.
|
||||
* We also suggest the book "Population Genetics: A Concise Guide" by John H. Gillespie for further details on the theory.) That is, a hets value of 0.001
|
||||
* implies that two randomly chosen chromosomes from the population of organisms would differ from each other at a rate of 1 in 1000 bp.
|
||||
*
|
||||
* which means that the probability of N samples being hom-ref at a site is:
|
||||
* The default priors provided for humans (hets = 1e-3)
|
||||
*
|
||||
* 1 - sum_i_2N (het / i)
|
||||
*
|
||||
* Note that heterozygosity as used here is the population genetics concept:
|
||||
*
|
||||
* http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics
|
||||
*
|
||||
* That is, a hets value of 0.01 implies that two randomly chosen chromosomes from the population of organisms
|
||||
* would differ from each other (one being A and the other B) at a rate of 1 in 100 bp.
|
||||
*
|
||||
* Note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype,
|
||||
* Also note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype,
|
||||
* which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there
|
||||
* may be a AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK
|
||||
* only uses this posterior probability in determining the prob. that a site is polymorphic. So changing the
|
||||
|
|
@ -95,13 +89,13 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
|
|||
* The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy,
|
||||
* which determines how many chromosomes each individual in the species carries.
|
||||
*/
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept", required = false)
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double snpHeterozygosity = HomoSapiensConstants.SNP_HETEROZYGOSITY;
|
||||
|
||||
/**
|
||||
* This argument informs the prior probability of having an indel at a site.
|
||||
*/
|
||||
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept", required = false)
|
||||
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
|
||||
public double indelHeterozygosity = HomoSapiensConstants.INDEL_HETEROZYGOSITY;
|
||||
|
||||
/**
|
||||
|
|
@ -135,12 +129,13 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
|
|||
* see e.g. Waterson (1975) or Tajima (1996).
|
||||
* This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N
|
||||
*
|
||||
* There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate,
|
||||
* There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate,
|
||||
* as for example when the ancestral status of the reference allele is not known.
|
||||
* By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns:
|
||||
* By using this argument, the user can manually specify a list of probabilities for each AC>1 to be used as priors for genotyping,
|
||||
* with the following restrictions:
|
||||
* a) User must specify 2N values, where N is the number of samples.
|
||||
* b) Only diploid calls supported.
|
||||
* c) Probability values are specified in double format, in linear space.
|
||||
* c) Probability values are specified in Double format, in linear space (not log10 space or Phred-scale).
|
||||
* d) No negative values allowed.
|
||||
* e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
|
||||
* f) If user-defined values add to more than one, an error will be produced.
|
||||
|
|
|
|||
|
|
@ -238,10 +238,13 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
|
|||
final double phredScaledConfidence = (-10.0 * log10Confidence) + 0.0;
|
||||
|
||||
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
|
||||
if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission())
|
||||
if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission()) {
|
||||
// technically, at this point our confidence in a reference call isn't accurately estimated
|
||||
// because it didn't take into account samples with no data, so let's get a better estimate
|
||||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getModelTheta(model), true, PoFGT0);
|
||||
double[] AFpriors = getAlleleFrequencyPriors(vc, defaultPloidy, model);
|
||||
final int INDEX_FOR_AC_EQUALS_1 = 1;
|
||||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, AFpriors[INDEX_FOR_AC_EQUALS_1], true, PoFGT0);
|
||||
}
|
||||
|
||||
// start constructing the resulting VC
|
||||
final GenomeLocParser genomeLocParser = this.genomeLocParser != null || refContext == null ? this.genomeLocParser : refContext.getGenomeLocParser();
|
||||
|
|
@ -375,25 +378,6 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
|
|||
&& QualityUtils.phredScaleErrorRate(PofF) >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING);
|
||||
}
|
||||
|
||||
/**
|
||||
* Based in the model used, returns the appropriate heterozygosity argument value.
|
||||
* @param model genotyping model.
|
||||
*
|
||||
* @return a valid heterozygosity in (0,1).
|
||||
*/
|
||||
private double getModelTheta(final GenotypeLikelihoodsCalculationModel.Model model) {
|
||||
switch (model) {
|
||||
case SNP:
|
||||
case GENERALPLOIDYSNP:
|
||||
return configuration.genotypeArgs.snpHeterozygosity;
|
||||
case INDEL:
|
||||
case GENERALPLOIDYINDEL:
|
||||
return configuration.genotypeArgs.indelHeterozygosity;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether the variant context has too many alternative alleles for progress to genotyping the site.
|
||||
|
|
@ -473,7 +457,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
|
|||
*/
|
||||
protected abstract boolean forceSiteEmission();
|
||||
|
||||
protected final VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
|
||||
protected final VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double log10OfTheta, boolean ignoreCoveredSamples, double initialPofRef) {
|
||||
if ( contexts == null )
|
||||
return null;
|
||||
|
||||
|
|
@ -487,7 +471,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
|
|||
if ( ignoreCoveredSamples && context != null )
|
||||
continue;
|
||||
final int depth = context == null ? 0 : context.getBasePileup().depthOfCoverage();
|
||||
log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta);
|
||||
log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, log10OfTheta);
|
||||
}
|
||||
|
||||
return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_CALLING, false);
|
||||
|
|
@ -523,16 +507,13 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
|
|||
* Assumes the sample is diploid
|
||||
*
|
||||
* @param depth the depth of the sample
|
||||
* @param theta the heterozygosity of this species (between 0 and 1)
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code depth} is less than 0 or {@code theta} is not in the (0,1) range.
|
||||
* @param log10OfTheta the heterozygosity of this species (in log10-space)
|
||||
*
|
||||
* @return a valid log10 probability of the sample being hom-ref
|
||||
*/
|
||||
@Ensures("MathUtils.goodLog10Probability(result)")
|
||||
protected final double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) {
|
||||
if (theta <= 0 || theta >= 1) throw new IllegalArgumentException("theta must be greater than 0 and less than 1");
|
||||
final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth);
|
||||
protected final double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double log10OfTheta) {
|
||||
final double log10PofNonRef = log10OfTheta + getRefBinomialProbLog10(depth);
|
||||
return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef));
|
||||
}
|
||||
|
||||
|
|
@ -558,14 +539,19 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
|
|||
* @param heterozygosity default heterozygosity to use, if inputPriors is empty
|
||||
* @param inputPriors Input priors to use (in which case heterozygosity is ignored)
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code inputPriors} has size != {@code N} or any entry in {@code inputPriors} is not in the (0,1) range.
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
private static AFPriorProvider composeAlleleFrequencyPriorProvider(final int N, final double heterozygosity, final List<Double> inputPriors) {
|
||||
public static AFPriorProvider composeAlleleFrequencyPriorProvider(final int N, final double heterozygosity, final List<Double> inputPriors) {
|
||||
|
||||
if (!inputPriors.isEmpty()) {
|
||||
// user-specified priors
|
||||
if (inputPriors.size() != N)
|
||||
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
|
||||
for (final Double prior : inputPriors) {
|
||||
if (prior <= 0 || prior >= 1) throw new UserException.BadArgumentValue("inputPrior","inputPrior vector values must be greater than 0 and less than 1");
|
||||
}
|
||||
return new CustomAFPriorProvider(inputPriors);
|
||||
}
|
||||
else
|
||||
|
|
|
|||
|
|
@ -52,7 +52,9 @@
|
|||
package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider;
|
||||
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingEngine;
|
||||
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine;
|
||||
import org.broadinstitute.gatk.utils.MathUtils;
|
||||
import org.broadinstitute.gatk.utils.Utils;
|
||||
import htsjdk.variant.variantcontext.*;
|
||||
|
|
@ -111,6 +113,7 @@ public class AFCalculatorTestBuilder {
|
|||
|
||||
public double[] makePriors() {
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
final double human_theta = 0.001;
|
||||
|
||||
switch ( priorType ) {
|
||||
case flat:
|
||||
|
|
@ -118,8 +121,9 @@ public class AFCalculatorTestBuilder {
|
|||
|
||||
//TODO break dependency with human... avoid special reference to this species.
|
||||
case human:
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
GenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
|
||||
|
||||
final AFPriorProvider log10priorProvider = GenotypingEngine.composeAlleleFrequencyPriorProvider(2*nSamples, human_theta, new ArrayList<Double>());
|
||||
final double[] humanPriors = log10priorProvider.forTotalPloidy(2*nSamples);
|
||||
return humanPriors;
|
||||
default:
|
||||
throw new RuntimeException("Unexpected type " + priorType);
|
||||
|
|
|
|||
|
|
@ -99,12 +99,13 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest {
|
|||
final List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
//Note that this copies code from GenotypingEngine::estimateLog10ReferenceConfidenceForOneSample to provide expected values
|
||||
final double p = Math.log10(0.5);
|
||||
for ( final double theta : Arrays.asList(0.1, 0.01, 0.001) ) {
|
||||
for ( final double log10ofTheta : Arrays.asList(0.0, -1.0, -2.0, -3.0) ) {
|
||||
for ( final int depth : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) {
|
||||
final double log10PofNonRef = Math.log10(theta / 2.0) + MathUtils.log10BinomialProbability(depth, 0, p);
|
||||
final double log10PofNonRef = log10ofTheta + MathUtils.log10BinomialProbability(depth, 0, p);
|
||||
final double log10POfRef = MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef));
|
||||
tests.add(new Object[]{depth, theta, log10POfRef});
|
||||
tests.add(new Object[]{depth, log10ofTheta, log10POfRef});
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -112,8 +113,8 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
@Test(dataProvider = "ReferenceQualityCalculation")
|
||||
public void testReferenceQualityCalculation(final int depth, final double theta, final double expected) {
|
||||
final double ref = getEngine().estimateLog10ReferenceConfidenceForOneSample(depth, theta);
|
||||
public void testReferenceQualityCalculation(final int depth, final double log10ofTheta, final double expected) {
|
||||
final double ref = getEngine().estimateLog10ReferenceConfidenceForOneSample(depth, log10ofTheta);
|
||||
Assert.assertTrue(MathUtils.goodLog10Probability(ref), "Reference calculation wasn't a well formed log10 prob " + ref);
|
||||
Assert.assertEquals(ref, expected, TOLERANCE, "Failed reference confidence for single sample");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
|
|||
|
||||
import htsjdk.variant.variantcontext.*;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider;
|
||||
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine;
|
||||
import org.broadinstitute.gatk.utils.BaseTest;
|
||||
import org.broadinstitute.gatk.utils.MathUtils;
|
||||
|
|
@ -197,10 +198,14 @@ public class AFCalculationUnitTest extends BaseTest {
|
|||
for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
|
||||
List<AFCalculator> calcs = createAFCalculators(Arrays.asList(AFCalculatorImplementation.values()), MAX_ALT_ALLELES, PLOIDY);
|
||||
|
||||
//number of entries in the priors array, one for AC=[0,2*nSamples]
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
//total number of chromosomes in our samples -- here we're assuming diploid
|
||||
final int totalPloidy = 2*nSamples;
|
||||
final double theta = 0.001;
|
||||
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotypingEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
|
||||
final AFPriorProvider log10priorProvider = UnifiedGenotypingEngine.composeAlleleFrequencyPriorProvider(totalPloidy, theta, new ArrayList<Double>());
|
||||
final double[] humanPriors = log10priorProvider.forTotalPloidy(totalPloidy);
|
||||
|
||||
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
|
||||
for ( AFCalculator model : calcs ) {
|
||||
|
|
@ -609,16 +614,16 @@ public class AFCalculationUnitTest extends BaseTest {
|
|||
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
|
||||
|
||||
final double[] flatPriors = new double[]{0.0,0.0,0.0};
|
||||
final double[] noPriors = new double[3];
|
||||
// test that function computeAlleleFrequency correctly operates when the flat prior option is set
|
||||
// computeAlleleFrequencyPriors takes linear priors
|
||||
final ArrayList<Double> inputPrior = new ArrayList<Double>();
|
||||
inputPrior.add(1.0/3);
|
||||
inputPrior.add(1.0/3);
|
||||
UnifiedGenotypingEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0, inputPrior);
|
||||
final AFPriorProvider log10priorProvider = UnifiedGenotypingEngine.composeAlleleFrequencyPriorProvider(2, 0.0, inputPrior);
|
||||
final double[] noPriors = log10priorProvider.forTotalPloidy(2);
|
||||
|
||||
GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
|
||||
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
|
||||
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), noPriors, "noPrior");
|
||||
final AFCalculationResult resultTrackerFlat = cfgFlatPrior.execute();
|
||||
final AFCalculationResult resultTrackerNoPrior = cfgNoPrior.execute();
|
||||
|
||||
|
|
|
|||
|
|
@ -149,7 +149,7 @@ public class GATKVariantContextUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* Calculates the total ploidy of a variant context as the sum of all plodies across genotypes.
|
||||
* Calculates the total ploidy of a variant context as the sum of all ploidies across genotypes.
|
||||
* @param vc the target variant context.
|
||||
* @param defaultPloidy the default ploidy to be assume when there is no ploidy information for a genotype.
|
||||
* @return never {@code null}.
|
||||
|
|
|
|||
Loading…
Reference in New Issue