Add feature to specify Allele frequency priors by command line when calling variants.

Use case:
The default AF priors used (infinite sites model, neutral variation) is appropriate in the case where the reference allele is ancestral, and the called allele is a derived allele.
Most of the times this is true but in several population studies and in ancient DNA analyses this might introduce reference biases, and in some other cases it's hard to ascertain what the ancestral allele is (normally requiring to look up homologous chimp sequence).
Specifying no prior is one solution, but this may introduce a lot of artifactual het calls in shallower coverage regions.
With this option, users can specify what the prior for each AC should be according to their needs, subject to the restrictions documented in the code and in GATK docs.
-- Updated ancient DNA single sample calling script with filtering options and other cleanups.
-- Added integration test. Removed old -noPrior syntax.
This commit is contained in:
Guillermo del Angel 2013-04-26 15:49:31 -04:00
parent 759c531d1b
commit 4168aaf280
5 changed files with 67 additions and 25 deletions

View File

@ -54,7 +54,10 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
@ -118,13 +121,27 @@ public class StandardCallerArgumentCollection {
public int MAX_ALTERNATE_ALLELES = 6;
/**
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus.
* If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate,
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model,
* see e.g. Waterson (1975) or Tajima (1996).
* This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N
*
* There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate,
* as for example when the ancestral status of the reference allele is not known.
* By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns:
* a) User must specify 2N values, where N is the number of samples.
* b) Only diploid calls supported.
* c) Probability values are specified in double format, in linear space.
* d) No negative values allowed.
* e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
* f) If user-defined values add to more than one, an error will be produced.
*
* If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g.
* -inputPrior 0.33 -inputPrior 0.33
* for the single-sample diploid case.
*/
@Advanced
@Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false)
public boolean ignoreHeterozygosityPrior = false;
@Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false)
public List<Double> inputPrior = Collections.emptyList();
/**
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
@ -190,6 +207,6 @@ public class StandardCallerArgumentCollection {
this.exactCallsLog = SCAC.exactCallsLog;
this.sampleContamination=SCAC.sampleContamination;
this.AFmodel = SCAC.AFmodel;
this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior;
this.inputPrior = SCAC.inputPrior;
}
}

View File

@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine {
this.N = samples.size() * ploidy;
log10AlleleFrequencyPriorsSNPs = new double[N+1];
log10AlleleFrequencyPriorsIndels = new double[N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior);
filter.add(LOW_QUAL_FILTER_NAME);
@ -744,27 +744,39 @@ public class UnifiedGenotyperEngine {
* where Pr(AC=i) = theta/i where theta is heterozygosity
* @param N Number of chromosomes
* @param priors (output) array to be filled with priors
* @param theta Heterozygosity
* @param ignorePriors If true, priors are ignored and zeros returned
* @param heterozygosity default heterozygosity to use, if inputPriors is empty
* @param inputPriors Input priors to use (in which case heterozygosity is ignored)
*/
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) {
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List<Double> inputPriors) {
if (ignorePriors) {
Arrays.fill(priors, 0,N,0.0);
return;
}
double sum = 0.0;
// for each i
for (int i = 1; i <= N; i++) {
final double value = theta / (double)i;
priors[i] = Math.log10(value);
sum += value;
if (!inputPriors.isEmpty()) {
// user-specified priors
if (inputPriors.size() != N)
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
int idx = 1;
for (final double prior: inputPriors) {
if (prior < 0.0)
throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior");
priors[idx++] = Math.log10(prior);
sum += prior;
}
}
else {
// for each i
for (int i = 1; i <= N; i++) {
final double value = heterozygosity / (double)i;
priors[i] = Math.log10(value);
sum += value;
}
}
// protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
if (sum > 1.0) {
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument");
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors.");
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);

View File

@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
@ -111,7 +112,7 @@ public class AFCalcTestBuilder {
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
case human:
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
return humanPriors;
default:
throw new RuntimeException("Unexpected type " + priorType);

View File

@ -142,10 +142,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testNoPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1,
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1,
Arrays.asList("7ac60bdc355d97c0939e644b58de47d7"));
executeTest("test no prior 1", spec1);
}
@Test
public void testUserPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1,
Arrays.asList("04d05900849d5a3f6f3f98bd0f262369"));
executeTest("test user prior 1", spec1);
}
// --------------------------------------------------------------------------------------------------------------

View File

@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest {
final int nPriorValues = 2*nSamples+1;
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
for ( AFCalc model : calcs ) {
@ -583,8 +583,12 @@ public class AFCalcUnitTest extends BaseTest {
final double[] flatPriors = new double[]{0.0,0.0,0.0};
final double[] noPriors = new double[3];
// test that function computeAlleleFrequency correctly operates when the -noPrior option is set
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true);
// test that function computeAlleleFrequency correctly operates when the flat prior option is set
// computeAlleleFrequencyPriors takes linear priors
final ArrayList<Double> inputPrior = new ArrayList<Double>();
inputPrior.add(1.0/3);
inputPrior.add(1.0/3);
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0,inputPrior);
GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");