Merge pull request #194 from broadinstitute/gda_ancient_dna_newPipeline

Add feature to specify Allele frequency priors by command line when call...
This commit is contained in:
delangel 2013-04-27 04:59:09 -07:00
commit 651e1f23b1
5 changed files with 67 additions and 25 deletions

View File

@ -54,7 +54,10 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
@ -118,13 +121,27 @@ public class StandardCallerArgumentCollection {
public int MAX_ALTERNATE_ALLELES = 6;
/**
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus.
* If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate,
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model,
* see e.g. Waterson (1975) or Tajima (1996).
* This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N
*
* There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate,
* as for example when the ancestral status of the reference allele is not known.
* By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns:
* a) User must specify 2N values, where N is the number of samples.
* b) Only diploid calls supported.
* c) Probability values are specified in double format, in linear space.
* d) No negative values allowed.
* e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
* f) If user-defined values add to more than one, an error will be produced.
*
* If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g.
* -inputPrior 0.33 -inputPrior 0.33
* for the single-sample diploid case.
*/
@Advanced
@Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false)
public boolean ignoreHeterozygosityPrior = false;
@Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false)
public List<Double> inputPrior = Collections.emptyList();
/**
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
@ -190,6 +207,6 @@ public class StandardCallerArgumentCollection {
this.exactCallsLog = SCAC.exactCallsLog;
this.sampleContamination=SCAC.sampleContamination;
this.AFmodel = SCAC.AFmodel;
this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior;
this.inputPrior = SCAC.inputPrior;
}
}

View File

@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine {
this.N = samples.size() * ploidy;
log10AlleleFrequencyPriorsSNPs = new double[N+1];
log10AlleleFrequencyPriorsIndels = new double[N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior);
filter.add(LOW_QUAL_FILTER_NAME);
@ -744,27 +744,39 @@ public class UnifiedGenotyperEngine {
* where Pr(AC=i) = theta/i where theta is heterozygosity
* @param N Number of chromosomes
* @param priors (output) array to be filled with priors
* @param theta Heterozygosity
* @param ignorePriors If true, priors are ignored and zeros returned
* @param heterozygosity default heterozygosity to use, if inputPriors is empty
* @param inputPriors Input priors to use (in which case heterozygosity is ignored)
*/
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) {
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List<Double> inputPriors) {
if (ignorePriors) {
Arrays.fill(priors, 0,N,0.0);
return;
}
double sum = 0.0;
// for each i
for (int i = 1; i <= N; i++) {
final double value = theta / (double)i;
priors[i] = Math.log10(value);
sum += value;
if (!inputPriors.isEmpty()) {
// user-specified priors
if (inputPriors.size() != N)
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
int idx = 1;
for (final double prior: inputPriors) {
if (prior < 0.0)
throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior");
priors[idx++] = Math.log10(prior);
sum += prior;
}
}
else {
// for each i
for (int i = 1; i <= N; i++) {
final double value = heterozygosity / (double)i;
priors[i] = Math.log10(value);
sum += value;
}
}
// protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
if (sum > 1.0) {
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument");
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors.");
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);

View File

@ -47,6 +47,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
@ -111,7 +112,7 @@ public class AFCalcTestBuilder {
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
case human:
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
return humanPriors;
default:
throw new RuntimeException("Unexpected type " + priorType);

View File

@ -142,10 +142,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testNoPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1,
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1,
Arrays.asList("7ac60bdc355d97c0939e644b58de47d7"));
executeTest("test no prior 1", spec1);
}
@Test
public void testUserPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1,
Arrays.asList("04d05900849d5a3f6f3f98bd0f262369"));
executeTest("test user prior 1", spec1);
}
// --------------------------------------------------------------------------------------------------------------

View File

@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest {
final int nPriorValues = 2*nSamples+1;
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
for ( AFCalc model : calcs ) {
@ -583,8 +583,12 @@ public class AFCalcUnitTest extends BaseTest {
final double[] flatPriors = new double[]{0.0,0.0,0.0};
final double[] noPriors = new double[3];
// test that function computeAlleleFrequency correctly operates when the -noPrior option is set
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true);
// test that function computeAlleleFrequency correctly operates when the flat prior option is set
// computeAlleleFrequencyPriors takes linear priors
final ArrayList<Double> inputPrior = new ArrayList<Double>();
inputPrior.add(1.0/3);
inputPrior.add(1.0/3);
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0,inputPrior);
GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");