diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 63b8717c0..5016526c0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -54,7 +54,10 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; import java.util.Collections; +import java.util.List; import java.util.Map; /** @@ -118,13 +121,27 @@ public class StandardCallerArgumentCollection { public int MAX_ALTERNATE_ALLELES = 6; /** - * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus. - * If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate, + * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model, + * see e.g. Waterson (1975) or Tajima (1996). + * This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N + * + * There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate, * as for example when the ancestral status of the reference allele is not known. + * By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns: + * a) User must specify 2N values, where N is the number of samples. + * b) Only diploid calls supported. + * c) Probability values are specified in double format, in linear space. + * d) No negative values allowed. + * e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one. + * f) If user-defined values add to more than one, an error will be produced. + * + * If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g. + * -inputPrior 0.33 -inputPrior 0.33 + * for the single-sample diploid case. */ @Advanced - @Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false) - public boolean ignoreHeterozygosityPrior = false; + @Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false) + public List inputPrior = Collections.emptyList(); /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. @@ -190,6 +207,6 @@ public class StandardCallerArgumentCollection { this.exactCallsLog = SCAC.exactCallsLog; this.sampleContamination=SCAC.sampleContamination; this.AFmodel = SCAC.AFmodel; - this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior; + this.inputPrior = SCAC.inputPrior; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 55db44052..3380efcc9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine { this.N = samples.size() * ploidy; log10AlleleFrequencyPriorsSNPs = new double[N+1]; log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior); filter.add(LOW_QUAL_FILTER_NAME); @@ -744,27 +744,39 @@ public class UnifiedGenotyperEngine { * where Pr(AC=i) = theta/i where theta is heterozygosity * @param N Number of chromosomes * @param priors (output) array to be filled with priors - * @param theta Heterozygosity - * @param ignorePriors If true, priors are ignored and zeros returned + * @param heterozygosity default heterozygosity to use, if inputPriors is empty + * @param inputPriors Input priors to use (in which case heterozygosity is ignored) */ - public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) { + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) { + - if (ignorePriors) { - Arrays.fill(priors, 0,N,0.0); - return; - } double sum = 0.0; - // for each i - for (int i = 1; i <= N; i++) { - final double value = theta / (double)i; - priors[i] = Math.log10(value); - sum += value; + if (!inputPriors.isEmpty()) { + // user-specified priors + if (inputPriors.size() != N) + throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 "); + + int idx = 1; + for (final double prior: inputPriors) { + if (prior < 0.0) + throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior"); + priors[idx++] = Math.log10(prior); + sum += prior; + } + } + else { + // for each i + for (int i = 1; i <= N; i++) { + final double value = heterozygosity / (double)i; + priors[i] = Math.log10(value); + sum += value; + } } // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions) if (sum > 1.0) { - throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument"); + throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors."); } // null frequency for AF=0 is (1 - sum(all other frequencies)) priors[0] = Math.log10(1.0 - sum); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index a4224bf6c..042e04767 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; @@ -111,7 +112,7 @@ public class AFCalcTestBuilder { return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors case human: final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); return humanPriors; default: throw new RuntimeException("Unexpected type " + priorType); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 605be3b2d..253467a76 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -142,10 +142,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testNoPrior() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1, + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1, Arrays.asList("7ac60bdc355d97c0939e644b58de47d7")); executeTest("test no prior 1", spec1); + } + @Test + public void testUserPrior() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1, + Arrays.asList("04d05900849d5a3f6f3f98bd0f262369")); + executeTest("test user prior 1", spec1); + } // -------------------------------------------------------------------------------------------------------------- diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 5eebe9670..2bdf5078d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest { final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList()); for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { for ( AFCalc model : calcs ) { @@ -583,8 +583,12 @@ public class AFCalcUnitTest extends BaseTest { final double[] flatPriors = new double[]{0.0,0.0,0.0}; final double[] noPriors = new double[3]; - // test that function computeAlleleFrequency correctly operates when the -noPrior option is set - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true); + // test that function computeAlleleFrequency correctly operates when the flat prior option is set + // computeAlleleFrequencyPriors takes linear priors + final ArrayList inputPrior = new ArrayList(); + inputPrior.add(1.0/3); + inputPrior.add(1.0/3); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0,inputPrior); GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior"); GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");