Removed binomialProb() method. Set better values for qHom, qHet, and qHomNonRef and allowed those to be set from the command-line.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@546 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-04-27 15:09:02 +00:00
parent b9c9dbb1d7
commit 444bc18183
1 changed files with 54 additions and 45 deletions

View File

@ -8,8 +8,8 @@ import org.broadinstitute.sting.playground.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
@ -19,93 +19,102 @@ import java.util.*;
// j.maguire 3-7-2009 // j.maguire 3-7-2009
public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, Integer> { public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, Integer> {
@Argument(fullName="metrics",required=false,defaultValue="/dev/null") @Argument(fullName="metrics", shortName="met", required=false, defaultValue="/dev/null") public String metricsFileName;
public String metricsFileName; @Argument(fullName="metInterval", shortName="mi", required=false, defaultValue="50000") public Integer metricsInterval;
@Argument(fullName="printMetrics", shortName="printMetrics", required=false, defaultValue="true") public Boolean printMetrics;
@Argument(fullName="lodThreshold",shortName="lod",required=false,defaultValue="5.0") @Argument(fullName="lodThreshold", shortName="lod", required=false, defaultValue="5.0") public Double lodThreshold;
public Double lodThreshold; @Argument(fullName="fourBaseMode", shortName="fb", required=false, defaultValue="false") public Boolean fourBaseMode;
@Argument(fullName="retest", shortName="re", required=false, defaultValue="false") public Boolean retest;
@Argument(fullName="fourBaseMode",required=false,defaultValue="false") @Argument(fullName="qHom", shortName="qHom", required=false, defaultValue="0.04") public Double qHom;
public Boolean fourBaseMode; @Argument(fullName="qHet", shortName="qHet", required=false, defaultValue="0.49") public Double qHet;
@Argument(fullName="qHomNonRef", shortName="qHomNonRef", required=false, defaultValue="0.97") public Double qHomNonRef;
@Argument(fullName="retest",required=false,defaultValue="false")
public Boolean retest;
@Argument(fullName="printMetrics",required=false,defaultValue="true")
public Boolean printMetrics;
public AlleleMetrics metrics; public AlleleMetrics metrics;
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; } public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; }
public boolean requiresReads() { return true; } public boolean requiresReads() { return true; }
public void initialize() { public void initialize() { metrics = new AlleleMetrics(metricsFileName, lodThreshold); }
metrics = new AlleleMetrics(metricsFileName, lodThreshold);
}
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) { public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) {
String rodString = getRodString(tracker); String rodString = getRodString(tracker);
AlleleFrequencyEstimate freq = null; AlleleFrequencyEstimate freq;
if (fourBaseMode) if (fourBaseMode) {
{
if (retest) { if (retest) {
// Compute single quality score genotype likelihoods // Compute single quality score genotype likelihoods
freq = getOneProbAlleleFrequency(ref, context, rodString); freq = getOneProbAlleleFrequency(ref, context, rodString);
if (MathUtils.compareDoubles(freq.qhat, 0.5) == 0) { if (freq.qhat > 0.1) {
// If we found a putative het, recompute four-base prob genotype likelihoods // If we found a putative variant, recompute four-base prob genotype likelihoods
freq = getFourProbAlleleFrequency(ref, context, rodString, tracker); freq = getFourProbAlleleFrequency(ref, context, rodString, tracker);
} }
} else { } else {
freq = getFourProbAlleleFrequency(ref, context, rodString, tracker); freq = getFourProbAlleleFrequency(ref, context, rodString, tracker);
} }
} } else {
else
{
// Compute single quality score genotype likelihoods // Compute single quality score genotype likelihoods
freq = getOneProbAlleleFrequency(ref, context, rodString); freq = getOneProbAlleleFrequency(ref, context, rodString);
} }
if (printMetrics) { if (printMetrics) {
if (freq != null) { metrics.nextPosition(freq, tracker); } if (freq != null) { metrics.nextPosition(freq, tracker); }
metrics.printMetricsAtLocusIntervals(1000); metrics.printMetricsAtLocusIntervals(metricsInterval);
} }
return freq; return freq;
} }
private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString, RefMetaDataTracker tracker) { private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString, RefMetaDataTracker tracker) {
/*
P(D)*P(G,q|D) = P(D|G,q)*P(G,q)
= P(D|q)*P(q|G)*P(G)
P(G) = { 0.999 (hom-ref), 1e-3 (het), 1e-5 (hom-nonref) }
n
P(D|q)P(q|G) = product [ P_i(A)*B(i, n, G) ]
i=1
*/
double[][] probs = ReadBackedPileup.probDistPileup(context.getReads(), context.getOffsets()); double[][] probs = ReadBackedPileup.probDistPileup(context.getReads(), context.getOffsets());
int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref); int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
int altBaseIndex = getMostFrequentNonRefBase(getFractionalCounts(probs), refBaseIndex); int altBaseIndex = getMostFrequentNonRefBase(getFractionalCounts(probs), refBaseIndex);
if (refBaseIndex >= 0 && altBaseIndex >= 0) { if (refBaseIndex >= 0 && altBaseIndex >= 0) {
double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex); // Set the priors for the hom-ref, het, and non-hom ref (we don't evaluate the
// possibility of a non-ref het).
double[] genotypePriors = { 0.999, 1e-3, 1e-5 };
double[] genotypePriors = { 0.999, 1e-3, 1e-5 }; // I'm not sure what the difference between qhat and qstar is in AlleleMetrics, but
double[] genotypeBalances = { 0.02, 0.5, 0.98 }; // to make my life simpler, I evaluate the non-ref balances in genotypeBalances and
double[] reportingBalances = { 0.0, 0.5, 1.0 }; // the metrics class doesn't like it if you don't tell it these exact values // report the balances in reportingBalances.
double[] genotypeBalances = { qHom, qHet, qHomNonRef };
double[] reportingBalances = { 0.0, 0.5, 1.0 };
// Compute the distribution of second-best bases (should be random)
/*
double[] secondaryBaseProbs = { 0.333, 0.333, 0.333 };
int[] allBaseCounts = new int[3];
for (int baseIndex = 0; baseIndex < 4; baseIndex++) {
}
*/
// Determine the probability distribution of non-ref alleles
//double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
// Convolve them with the binomial sampling probability distribution in order to
// marginalize the non-ref allele balance.
double[] posteriors = new double[3]; double[] posteriors = new double[3];
double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, pBest = Double.MIN_VALUE; double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, pBest = Double.MIN_VALUE;
for (int hypothesis = 0; hypothesis < 3; hypothesis++) { for (int hypothesis = 0; hypothesis < 3; hypothesis++) {
int base1;
int base2;
switch (hypothesis) {
case 0: base1 = refBaseIndex; base2 = refBaseIndex; break;
case 1: base1 = refBaseIndex; base2 = altBaseIndex; break;
case 2: base1 = altBaseIndex; base2 = altBaseIndex; break;
default: base1 = refBaseIndex; base2 = refBaseIndex; break;
}
double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
posteriors[hypothesis] = 0.0; posteriors[hypothesis] = 0.0;
for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) { for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) {
double binomialWeight = MathUtils.binomialProbability(weightIndex, probs.length, genotypeBalances[hypothesis]); double binomialWeight = MathUtils.binomialProbability(weightIndex, probs.length, genotypeBalances[hypothesis]);
// The hom-ref posterior probability distribution may be too weak to
// effectively account for noise. Just set it to equal weighting for
// the lower quantile of the non-ref balance range.
//if (hypothesis == 0 && weightIndex < obsWeights.length/4) { //if (hypothesis == 0 && weightIndex < obsWeights.length/4) {
// binomialWeight = 1.0; // binomialWeight = 1.0;
//} //}
@ -138,7 +147,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
2, 2,
qhat, qhat,
qstar, qstar,
(MathUtils.compareDoubles(qhat, reportingBalances[0]) == 0 ? -lodVsNextBest : lodVsRef), (MathUtils.compareDoubles(qhat, reportingBalances[0]) == 0 ? lodVsNextBest : lodVsRef),
lodVsNextBest, lodVsNextBest,
pBest, pBest,
pRef, pRef,