From 4168aaf2806f5b6925eb6eea312c65b985663cde Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 26 Apr 2013 15:49:31 -0400 Subject: [PATCH] Add feature to specify Allele frequency priors by command line when calling variants. Use case: The default AF priors used (infinite sites model, neutral variation) is appropriate in the case where the reference allele is ancestral, and the called allele is a derived allele. Most of the times this is true but in several population studies and in ancient DNA analyses this might introduce reference biases, and in some other cases it's hard to ascertain what the ancestral allele is (normally requiring to look up homologous chimp sequence). Specifying no prior is one solution, but this may introduce a lot of artifactual het calls in shallower coverage regions. With this option, users can specify what the prior for each AC should be according to their needs, subject to the restrictions documented in the code and in GATK docs. -- Updated ancient DNA single sample calling script with filtering options and other cleanups. -- Added integration test. Removed old -noPrior syntax. --- .../StandardCallerArgumentCollection.java | 27 +++++++++--- .../genotyper/UnifiedGenotyperEngine.java | 42 ++++++++++++------- .../genotyper/afcalc/AFCalcTestBuilder.java | 3 +- .../UnifiedGenotyperIntegrationTest.java | 10 ++++- .../genotyper/afcalc/AFCalcUnitTest.java | 10 +++-- 5 files changed, 67 insertions(+), 25 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 63b8717c0..5016526c0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -54,7 +54,10 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; import java.util.Collections; +import java.util.List; import java.util.Map; /** @@ -118,13 +121,27 @@ public class StandardCallerArgumentCollection { public int MAX_ALTERNATE_ALLELES = 6; /** - * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus. - * If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate, + * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model, + * see e.g. Waterson (1975) or Tajima (1996). + * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N + * + * There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate, * as for example when the ancestral status of the reference allele is not known. + * By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns: + * a) User must specify 2N values, where N is the number of samples. + * b) Only diploid calls supported. + * c) Probability values are specified in double format, in linear space. + * d) No negative values allowed. + * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one. + * f) If user-defined values add to more than one, an error will be produced. + * + * If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g. + * -inputPrior 0.33 -inputPrior 0.33 + * for the single-sample diploid case. */ @Advanced - @Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false) - public boolean ignoreHeterozygosityPrior = false; + @Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false) + public List inputPrior = Collections.emptyList(); /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. @@ -190,6 +207,6 @@ public class StandardCallerArgumentCollection { this.exactCallsLog = SCAC.exactCallsLog; this.sampleContamination=SCAC.sampleContamination; this.AFmodel = SCAC.AFmodel; - this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior; + this.inputPrior = SCAC.inputPrior; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 55db44052..3380efcc9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine { this.N = samples.size() * ploidy; log10AlleleFrequencyPriorsSNPs = new double[N+1]; log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior); filter.add(LOW_QUAL_FILTER_NAME); @@ -744,27 +744,39 @@ public class UnifiedGenotyperEngine { * where Pr(AC=i) = theta/i where theta is heterozygosity * @param N Number of chromosomes * @param priors (output) array to be filled with priors - * @param theta Heterozygosity - * @param ignorePriors If true, priors are ignored and zeros returned + * @param heterozygosity default heterozygosity to use, if inputPriors is empty + * @param inputPriors Input priors to use (in which case heterozygosity is ignored) */ - public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) { + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { + - if (ignorePriors) { - Arrays.fill(priors, 0,N,0.0); - return; - } double sum = 0.0; - // for each i - for (int i = 1; i <= N; i++) { - final double value = theta / (double)i; - priors[i] = Math.log10(value); - sum += value; + if (!inputPriors.isEmpty()) { + // user-specified priors + if (inputPriors.size() != N) + throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); + + int idx = 1; + for (final double prior: inputPriors) { + if (prior < 0.0) + throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); + priors[idx++] = Math.log10(prior); + sum += prior; + } + } + else { + // for each i + for (int i = 1; i <= N; i++) { + final double value = heterozygosity / (double)i; + priors[i] = Math.log10(value); + sum += value; + } } // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions) if (sum > 1.0) { - throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument"); + throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); } // null frequency for AF=0 is (1 - sum(all other frequencies)) priors[0] = Math.log10(1.0 - sum); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index a4224bf6c..042e04767 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; @@ -111,7 +112,7 @@ public class AFCalcTestBuilder { return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors case human: final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); return humanPriors; default: throw new RuntimeException("Unexpected type " + priorType); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 605be3b2d..253467a76 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -142,10 +142,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testNoPrior() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1, + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1, Arrays.asList("7ac60bdc355d97c0939e644b58de47d7")); executeTest("test no prior 1", spec1); + } + @Test + public void testUserPrior() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1, + Arrays.asList("04d05900849d5a3f6f3f98bd0f262369")); + executeTest("test user prior 1", spec1); + } // -------------------------------------------------------------------------------------------------------------- diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 5eebe9670..2bdf5078d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest { final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { for ( AFCalc model : calcs ) { @@ -583,8 +583,12 @@ public class AFCalcUnitTest extends BaseTest { final double[] flatPriors = new double[]{0.0,0.0,0.0}; final double[] noPriors = new double[3]; - // test that function computeAlleleFrequency correctly operates when the -noPrior option is set - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true); + // test that function computeAlleleFrequency correctly operates when the flat prior option is set + // computeAlleleFrequencyPriors takes linear priors + final ArrayList inputPrior = new ArrayList(); + inputPrior.add(1.0/3); + inputPrior.add(1.0/3); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0,inputPrior); GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior"); GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");