Merge pull request #194 from broadinstitute/gda_ancient_dna_newPipeline
Add feature to specify Allele frequency priors by command line when call...
This commit is contained in:
commit
651e1f23b1
|
|
@ -54,7 +54,10 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
|
||||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -118,13 +121,27 @@ public class StandardCallerArgumentCollection {
|
||||||
public int MAX_ALTERNATE_ALLELES = 6;
|
public int MAX_ALTERNATE_ALLELES = 6;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus.
|
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model,
|
||||||
* If This argument is true, the heterozygosity prior will not be used - main application is for population studies where prior might not be appropriate,
|
* see e.g. Watterson (1975) or Tajima (1996).
|
||||||
|
* This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for k=1:N
|
||||||
|
*
|
||||||
|
* There are instances where using this prior might not be desirable, e.g. for population studies where prior might not be appropriate,
|
||||||
* as for example when the ancestral status of the reference allele is not known.
|
* as for example when the ancestral status of the reference allele is not known.
|
||||||
|
* By using this argument, the user can manually specify priors to be used for calling as a vector of doubles, with the following restrictions:
|
||||||
|
* a) User must specify 2N values, where N is the number of samples.
|
||||||
|
* b) Only diploid calls supported.
|
||||||
|
* c) Probability values are specified in double format, in linear space.
|
||||||
|
* d) No negative values allowed.
|
||||||
|
* e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
|
||||||
|
* f) If user-defined values add to more than one, an error will be produced.
|
||||||
|
*
|
||||||
|
* If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times, e.g.
|
||||||
|
* -inputPrior 0.33 -inputPrior 0.33
|
||||||
|
* for the single-sample diploid case.
|
||||||
*/
|
*/
|
||||||
@Advanced
|
@Advanced
|
||||||
@Argument(fullName = "dont_use_site_prior", shortName = "noPrior", doc = "If true, skip prior for variant discovery", required = false)
|
@Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false)
|
||||||
public boolean ignoreHeterozygosityPrior = false;
|
public List<Double> inputPrior = Collections.emptyList();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* If this fraction is greater than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
|
* If this fraction is greater than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
|
||||||
|
|
@ -190,6 +207,6 @@ public class StandardCallerArgumentCollection {
|
||||||
this.exactCallsLog = SCAC.exactCallsLog;
|
this.exactCallsLog = SCAC.exactCallsLog;
|
||||||
this.sampleContamination=SCAC.sampleContamination;
|
this.sampleContamination=SCAC.sampleContamination;
|
||||||
this.AFmodel = SCAC.AFmodel;
|
this.AFmodel = SCAC.AFmodel;
|
||||||
this.ignoreHeterozygosityPrior = SCAC.ignoreHeterozygosityPrior;
|
this.inputPrior = SCAC.inputPrior;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine {
|
||||||
this.N = samples.size() * ploidy;
|
this.N = samples.size() * ploidy;
|
||||||
log10AlleleFrequencyPriorsSNPs = new double[N+1];
|
log10AlleleFrequencyPriorsSNPs = new double[N+1];
|
||||||
log10AlleleFrequencyPriorsIndels = new double[N+1];
|
log10AlleleFrequencyPriorsIndels = new double[N+1];
|
||||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity, UAC.ignoreHeterozygosityPrior);
|
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior);
|
||||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.ignoreHeterozygosityPrior);
|
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior);
|
||||||
|
|
||||||
filter.add(LOW_QUAL_FILTER_NAME);
|
filter.add(LOW_QUAL_FILTER_NAME);
|
||||||
|
|
||||||
|
|
@ -744,27 +744,39 @@ public class UnifiedGenotyperEngine {
|
||||||
* where Pr(AC=i) = theta/i where theta is heterozygosity
|
* where Pr(AC=i) = theta/i where theta is heterozygosity
|
||||||
* @param N Number of chromosomes
|
* @param N Number of chromosomes
|
||||||
* @param priors (output) array to be filled with priors
|
* @param priors (output) array to be filled with priors
|
||||||
* @param theta Heterozygosity
|
* @param heterozygosity default heterozygosity to use, if inputPriors is empty
|
||||||
* @param ignorePriors If true, priors are ignored and zeros returned
|
* @param inputPriors Input priors to use (in which case heterozygosity is ignored)
|
||||||
*/
|
*/
|
||||||
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta, final boolean ignorePriors) {
|
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List<Double> inputPriors) {
|
||||||
|
|
||||||
|
|
||||||
if (ignorePriors) {
|
|
||||||
Arrays.fill(priors, 0,N,0.0);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
|
|
||||||
// for each i
|
if (!inputPriors.isEmpty()) {
|
||||||
for (int i = 1; i <= N; i++) {
|
// user-specified priors
|
||||||
final double value = theta / (double)i;
|
if (inputPriors.size() != N)
|
||||||
priors[i] = Math.log10(value);
|
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
|
||||||
sum += value;
|
|
||||||
|
int idx = 1;
|
||||||
|
for (final double prior: inputPriors) {
|
||||||
|
if (prior < 0.0)
|
||||||
|
throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior");
|
||||||
|
priors[idx++] = Math.log10(prior);
|
||||||
|
sum += prior;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// for each i
|
||||||
|
for (int i = 1; i <= N; i++) {
|
||||||
|
final double value = heterozygosity / (double)i;
|
||||||
|
priors[i] = Math.log10(value);
|
||||||
|
sum += value;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
|
// protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
|
||||||
if (sum > 1.0) {
|
if (sum > 1.0) {
|
||||||
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed - try reducing heterozygosity value or using the -noPrior argument");
|
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors.");
|
||||||
}
|
}
|
||||||
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
||||||
priors[0] = Math.log10(1.0 - sum);
|
priors[0] = Math.log10(1.0 - sum);
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,7 @@
|
||||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||||
|
|
||||||
import org.apache.commons.lang.ArrayUtils;
|
import org.apache.commons.lang.ArrayUtils;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||||
import org.broadinstitute.sting.utils.MathUtils;
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
@ -111,7 +112,7 @@ public class AFCalcTestBuilder {
|
||||||
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||||
case human:
|
case human:
|
||||||
final double[] humanPriors = new double[nPriorValues];
|
final double[] humanPriors = new double[nPriorValues];
|
||||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
|
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
|
||||||
return humanPriors;
|
return humanPriors;
|
||||||
default:
|
default:
|
||||||
throw new RuntimeException("Unexpected type " + priorType);
|
throw new RuntimeException("Unexpected type " + priorType);
|
||||||
|
|
|
||||||
|
|
@ -142,10 +142,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
||||||
@Test
|
@Test
|
||||||
public void testNoPrior() {
|
public void testNoPrior() {
|
||||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1,
|
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.33333 -inputPrior 0.33333", 1,
|
||||||
Arrays.asList("7ac60bdc355d97c0939e644b58de47d7"));
|
Arrays.asList("7ac60bdc355d97c0939e644b58de47d7"));
|
||||||
executeTest("test no prior 1", spec1);
|
executeTest("test no prior 1", spec1);
|
||||||
|
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testUserPrior() {
|
||||||
|
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||||
|
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -inputPrior 0.001 -inputPrior 0.495", 1,
|
||||||
|
Arrays.asList("04d05900849d5a3f6f3f98bd0f262369"));
|
||||||
|
executeTest("test user prior 1", spec1);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -176,7 +176,7 @@ public class AFCalcUnitTest extends BaseTest {
|
||||||
final int nPriorValues = 2*nSamples+1;
|
final int nPriorValues = 2*nSamples+1;
|
||||||
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||||
final double[] humanPriors = new double[nPriorValues];
|
final double[] humanPriors = new double[nPriorValues];
|
||||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, false);
|
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
|
||||||
|
|
||||||
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
|
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
|
||||||
for ( AFCalc model : calcs ) {
|
for ( AFCalc model : calcs ) {
|
||||||
|
|
@ -583,8 +583,12 @@ public class AFCalcUnitTest extends BaseTest {
|
||||||
|
|
||||||
final double[] flatPriors = new double[]{0.0,0.0,0.0};
|
final double[] flatPriors = new double[]{0.0,0.0,0.0};
|
||||||
final double[] noPriors = new double[3];
|
final double[] noPriors = new double[3];
|
||||||
// test that function computeAlleleFrequency correctly operates when the -noPrior option is set
|
// test that function computeAlleleFrequencyPriors correctly operates when the flat prior option is set
|
||||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.001, true);
|
// computeAlleleFrequencyPriors takes linear priors
|
||||||
|
final ArrayList<Double> inputPrior = new ArrayList<Double>();
|
||||||
|
inputPrior.add(1.0/3);
|
||||||
|
inputPrior.add(1.0/3);
|
||||||
|
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(2, noPriors, 0.0,inputPrior);
|
||||||
|
|
||||||
GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
|
GetGLsTest cfgFlatPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "flatPrior");
|
||||||
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
|
GetGLsTest cfgNoPrior = new GetGLsTest(model, 1, Arrays.asList(AB), flatPriors, "noPrior");
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue