Merge pull request #194 from broadinstitute/gda_ancient_dna_newPipeline
Add feature to specify Allele frequency priors by command line when call...
This commit is contained in:
commit
651e1f23b1
|
|
@ -54,7 +54,10 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
|
|||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
|
|
@ -118,13 +121,27 @@ public class StandardCallerArgumentCollection {
|
|||
public int MAX_ALTERNATE_ALLELES = 6;
|
||||
|
||||
/**
|
||||
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus.
|
||||
* If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate,
|
||||
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model,
|
||||
* see e.g. Waterson (1975) or Tajima (1996).
|
||||
* This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for 1=1:N
|
||||
*
|
||||
* There are instances where using this prior might not be desireable, e.g. for population studies where prior might not be appropriate,
|
||||
* as for example when the ancestral status of the reference allele is not known.
|
||||
* By using this argument, user can manually specify priors to be used for calling as a vector for doubles, with the following restriciotns:
|
||||
* a) User must specify 2N values, where N is the number of samples.
|
||||
* b) Only diploid calls supported.
|
||||
* c) Probability values are specified in double format, in linear space.
|
||||
* d) No negative values allowed.
|
||||
* e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
|
||||
* f) If user-defined values add to more than one, an error will be produced.
|
||||
*
|
||||
* If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times,e.g.
|
||||
* -inputPrior 0.33 -inputPrior 0.33
|
||||
* for the single-sample diploid case.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false)
|
||||
public boolean ignoreHeterozygosityPrior = false;
|
||||
@Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false)
|
||||
public List<Double> inputPrior = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
|
||||
|
|
@ -190,6 +207,6 @@ public class StandardCallerArgumentCollection {
|
|||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
this.sampleContamination=SCAC.sampleContamination;
|
||||
this.AFmodel = SCAC.AFmodel;
|
||||
this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior;
|
||||
this.inputPrior = SCAC.inputPrior;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine {
|
|||
this.N = samples.size() * ploidy;
|
||||
log10AlleleFrequencyPriorsSNPs = new double[N+1];
|
||||
log10AlleleFrequencyPriorsIndels = new double[N+1];
|
||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior);
|
||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior);
|
||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior);
|
||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior);
|
||||
|
||||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
|
||||
|
|
@ -744,27 +744,39 @@ public class UnifiedGenotyperEngine {
|
|||
* where Pr(AC=i) = theta/i where theta is heterozygosity
|
||||
* @param N Number of chromosomes
|
||||
* @param priors (output) array to be filled with priors
|
||||
* @param theta Heterozygosity
|
||||
* @param ignorePriors If true, priors are ignored and zeros returned
|
||||
* @param heterozygosity default heterozygosity to use, if inputPriors is empty
|
||||
* @param inputPriors Input priors to use (in which case heterozygosity is ignored)
|
||||
*/
|
||||
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) {
|
||||
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List<Double> inputPriors) {
|
||||
|
||||
|
||||
if (ignorePriors) {
|
||||
Arrays.fill(priors, 0,N,0.0);
|
||||
return;
|
||||
}
|
||||
double sum = 0.0;
|
||||
|
||||
// for each i
|
||||
for (int i = 1; i <= N; i++) {
|
||||
final double value = theta / (double)i;
|
||||
priors[i] = Math.log10(value);
|
||||
sum += value;
|
||||
if (!inputPriors.isEmpty()) {
|
||||
// user-specified priors
|
||||
if (inputPriors.size() != N)
|
||||
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
|
||||
|
||||
int idx = 1;
|
||||
for (final double prior: inputPriors) {
|
||||
if (prior < 0.0)
|
||||
throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior");
|
||||
priors[idx++] = Math.log10(prior);
|
||||
sum += prior;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// for each i
|
||||
for (int i = 1; i <= N; i++) {
|
||||
final double value = heterozygosity / (double)i;
|
||||
priors[i] = Math.log10(value);
|
||||
sum += value;
|
||||
}
|
||||
}
|
||||
|
||||
// protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
|
||||
if (sum > 1.0) {
|
||||
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument");
|
||||
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors.");
|
||||
}
|
||||
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
||||
priors[0] = Math.log10(1.0 - sum);
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -111,7 +112,7 @@ public class AFCalcTestBuilder {
|
|||
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
case human:
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
|
||||
return humanPriors;
|
||||
default:
|
||||
throw new RuntimeException("Unexpected type " + priorType);
|
||||
|
|
|
|||
|
|
@ -142,10 +142,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testNoPrior() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1,
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1,
|
||||
Arrays.asList("7ac60bdc355d97c0939e644b58de47d7"));
|
||||
executeTest("test no prior 1", spec1);
|
||||
|
||||
}
|
||||
@Test
|
||||
public void testUserPrior() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1,
|
||||
Arrays.asList("04d05900849d5a3f6f3f98bd0f262369"));
|
||||
executeTest("test user prior 1", spec1);
|
||||
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
final int nPriorValues = 2*nSamples+1;
|
||||
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
|
||||
|
||||
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
|
||||
for ( AFCalc model : calcs ) {
|
||||
|
|
@ -583,8 +583,12 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
|
||||
final double[] flatPriors = new double[]{0.0,0.0,0.0};
|
||||
final double[] noPriors = new double[3];
|
||||
// test that function computeAlleleFrequency correctly operates when the -noPrior option is set
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true);
|
||||
// test that function computeAlleleFrequency correctly operates when the flat prior option is set
|
||||
// computeAlleleFrequencyPriors takes linear priors
|
||||
final ArrayList<Double> inputPrior = new ArrayList<Double>();
|
||||
inputPrior.add(1.0/3);
|
||||
inputPrior.add(1.0/3);
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0,inputPrior);
|
||||
|
||||
GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
|
||||
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
|
||||
|
|
|
|||
Loading…
Reference in New Issue