Removed binomialProb() method. Set better values for qHom, qHet, and qHomNonRef and allowed those to be set from the command-line.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@546 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-04-27 15:09:02 +00:00
parent b9c9dbb1d7
commit 444bc18183
1 changed files with 54 additions and 45 deletions

View File

@ -8,8 +8,8 @@ import org.broadinstitute.sting.playground.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import net.sf.samtools.SAMRecord;
@ -19,93 +19,102 @@ import java.util.*;
// j.maguire 3-7-2009
public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, Integer> {
@Argument(fullName="metrics",required=false,defaultValue="/dev/null")
public String metricsFileName;
@Argument(fullName="lodThreshold",shortName="lod",required=false,defaultValue="5.0")
public Double lodThreshold;
@Argument(fullName="fourBaseMode",required=false,defaultValue="false")
public Boolean fourBaseMode;
@Argument(fullName="retest",required=false,defaultValue="false")
public Boolean retest;
@Argument(fullName="printMetrics",required=false,defaultValue="true")
public Boolean printMetrics;
@Argument(fullName="metrics", shortName="met", required=false, defaultValue="/dev/null") public String metricsFileName;
@Argument(fullName="metInterval", shortName="mi", required=false, defaultValue="50000") public Integer metricsInterval;
@Argument(fullName="printMetrics", shortName="printMetrics", required=false, defaultValue="true") public Boolean printMetrics;
@Argument(fullName="lodThreshold", shortName="lod", required=false, defaultValue="5.0") public Double lodThreshold;
@Argument(fullName="fourBaseMode", shortName="fb", required=false, defaultValue="false") public Boolean fourBaseMode;
@Argument(fullName="retest", shortName="re", required=false, defaultValue="false") public Boolean retest;
@Argument(fullName="qHom", shortName="qHom", required=false, defaultValue="0.04") public Double qHom;
@Argument(fullName="qHet", shortName="qHet", required=false, defaultValue="0.49") public Double qHet;
@Argument(fullName="qHomNonRef", shortName="qHomNonRef", required=false, defaultValue="0.97") public Double qHomNonRef;
public AlleleMetrics metrics;
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; }
public boolean requiresReads() { return true; }
public void initialize() {
metrics = new AlleleMetrics(metricsFileName, lodThreshold);
}
public void initialize() { metrics = new AlleleMetrics(metricsFileName, lodThreshold); }
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) {
String rodString = getRodString(tracker);
AlleleFrequencyEstimate freq = null;
if (fourBaseMode)
{
AlleleFrequencyEstimate freq;
if (fourBaseMode) {
if (retest) {
// Compute single quality score genotype likelihoods
freq = getOneProbAlleleFrequency(ref, context, rodString);
if (MathUtils.compareDoubles(freq.qhat, 0.5) == 0) {
// If we found a putative het, recompute four-base prob genotype likelihoods
if (freq.qhat > 0.1) {
// If we found a putative variant, recompute four-base prob genotype likelihoods
freq = getFourProbAlleleFrequency(ref, context, rodString, tracker);
}
} else {
freq = getFourProbAlleleFrequency(ref, context, rodString, tracker);
}
}
else
{
} else {
// Compute single quality score genotype likelihoods
freq = getOneProbAlleleFrequency(ref, context, rodString);
}
if (printMetrics) {
if (freq != null) { metrics.nextPosition(freq, tracker); }
metrics.printMetricsAtLocusIntervals(1000);
metrics.printMetricsAtLocusIntervals(metricsInterval);
}
return freq;
}
private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString, RefMetaDataTracker tracker) {
/*
P(D)*P(G,q|D) = P(D|G,q)*P(G,q)
= P(D|q)*P(q|G)*P(G)
P(G) = { 0.999 (hom-ref), 1e-3 (het), 1e-5 (hom-nonref) }
n
P(D|q)P(q|G) = product [ P_i(A)*B(i, n, G) ]
i=1
*/
double[][] probs = ReadBackedPileup.probDistPileup(context.getReads(), context.getOffsets());
int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
int altBaseIndex = getMostFrequentNonRefBase(getFractionalCounts(probs), refBaseIndex);
if (refBaseIndex >= 0 && altBaseIndex >= 0) {
double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
// Set the priors for the hom-ref, het, and non-hom ref (we don't evaluate the
// possibility of a non-ref het).
double[] genotypePriors = { 0.999, 1e-3, 1e-5 };
double[] genotypePriors = { 0.999, 1e-3, 1e-5 };
double[] genotypeBalances = { 0.02, 0.5, 0.98 };
double[] reportingBalances = { 0.0, 0.5, 1.0 }; // the metrics class doesn't like it if you don't tell it these exact values
// I'm not sure what the difference between qhat and qstar is in AlleleMetrics, but
// to make my life simpler, I evaluate the non-ref balances in genotypeBalances and
// report the balances in reportingBalances.
double[] genotypeBalances = { qHom, qHet, qHomNonRef };
double[] reportingBalances = { 0.0, 0.5, 1.0 };
// Compute the distribution of second-best bases (should be random)
/*
double[] secondaryBaseProbs = { 0.333, 0.333, 0.333 };
int[] allBaseCounts = new int[3];
for (int baseIndex = 0; baseIndex < 4; baseIndex++) {
}
*/
// Determine the probability distribution of non-ref alleles
//double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
// Convolve them with the binomial sampling probability distribution in order to
// marginalize the non-ref allele balance.
double[] posteriors = new double[3];
double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, pBest = Double.MIN_VALUE;
for (int hypothesis = 0; hypothesis < 3; hypothesis++) {
int base1;
int base2;
switch (hypothesis) {
case 0: base1 = refBaseIndex; base2 = refBaseIndex; break;
case 1: base1 = refBaseIndex; base2 = altBaseIndex; break;
case 2: base1 = altBaseIndex; base2 = altBaseIndex; break;
default: base1 = refBaseIndex; base2 = refBaseIndex; break;
}
double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
posteriors[hypothesis] = 0.0;
for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) {
double binomialWeight = MathUtils.binomialProbability(weightIndex, probs.length, genotypeBalances[hypothesis]);
// The hom-ref posterior probability distribution may be too weak to
// effectively account for noise. Just set it to equal weighting for
// the lower quantile of the non-ref balance range.
//if (hypothesis == 0 && weightIndex < obsWeights.length/4) {
// binomialWeight = 1.0;
//}
@ -138,7 +147,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
2,
qhat,
qstar,
(MathUtils.compareDoubles(qhat, reportingBalances[0]) == 0 ? -lodVsNextBest : lodVsRef),
(MathUtils.compareDoubles(qhat, reportingBalances[0]) == 0 ? lodVsNextBest : lodVsRef),
lodVsNextBest,
pBest,
pRef,