Four-base mode now estimates the genotype using the one-base method and retests the site if the one-base method suggests the site is a het.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@503 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-04-23 17:23:24 +00:00
parent bd719f9c06
commit 11e85f1969
1 changed files with 101 additions and 49 deletions

View File

@ -18,17 +18,22 @@ import java.util.*;
// j.maguire 3-7-2009
public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, Integer> {
@Argument(fullName="metrics",required=true)
public String metricsFileName;
@Argument(fullName="lodThreshold",shortName="lod",required=false,defaultValue="0.0")
public Double lodThreshold;
@Argument(fullName="fourBaseMode",required=false,defaultValue="false")
public Boolean fourBaseMode;
@Argument(fullName="decideOnBase",required=false,defaultValue="false")
public Boolean decideOnBase;
private AlleleMetrics metrics;
public AlleleMetrics metrics;
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; } // We are keeping all the reads
public boolean requiresReads() { return true; }
public void initialize() { metrics = new AlleleMetrics("metrics.out"); }
public void initialize() {
metrics = new AlleleMetrics(metricsFileName, lodThreshold);
}
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) {
String rodString = getRodString(tracker);
@ -36,11 +41,13 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
AlleleFrequencyEstimate freq = null;
if (fourBaseMode)
{
// Compute four-base prob genotype likelihoods
freq = getFourProbAlleleFrequency(ref, context, rodString);
}
else if (decideOnBase)
{
// Compute single quality score genotype likelihoods
freq = getOneProbAlleleFrequency(ref, context, rodString);
if (Math.abs(freq.qhat - 0.5) < 0.01) {
// If we found a putative het, recompute four-base prob genotype likelihoods
freq = getFourProbAlleleFrequency(ref, context, rodString, tracker);
}
}
else
{
@ -54,7 +61,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
return freq;
}
private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString) {
private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString, RefMetaDataTracker tracker) {
/*
P(D)*P(G,q|D) = P(D|G,q)*P(G,q)
= P(D|q)*P(q|G)*P(G)
@ -71,62 +78,97 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
int altBaseIndex = getMostFrequentNonRefBase(getFractionalCounts(probs), refBaseIndex);
if (refBaseIndex >= 0 && altBaseIndex >= 0) {
System.out.println(context.getLocation().toString() + " " + refBaseIndex + " " + altBaseIndex + " " + probs.length + " " + rodString);
/*
for (int i = 0; i < probs.length; i++) {
System.out.printf(" [ %4.4f %4.4f %4.4f %4.4f ]\n", probs[i][0], probs[i][1], probs[i][2], probs[i][3]);
}
*/
double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
/*
System.out.print(" Weights: ");
for (int i = 0; i < obsWeights.length; i++) {
System.out.printf("%4.4f ", obsWeights[i]);
System.out.print(obsWeights[i] + " ");
}
System.out.println();
*/
double[] genotypePriors = { 0.999, 1e-3, 1e-5 };
double[] genotypeBalances = { 0.02, 0.5, 0.98 };
//double[] genotypeBalances = { 0.02, 0.5, 0.98 };
double[] genotypeBalances = { 0.002, 0.5, 1.0 };
double[] posteriors = new double[3];
double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, lodVsNextBest = 0.0, pBest = Double.MIN_VALUE;
double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, pBest = Double.MIN_VALUE;
for (int hypothesis = 0; hypothesis < 3; hypothesis++) {
posteriors[hypothesis] = 0.0;
for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) {
posteriors[hypothesis] += obsWeights[weightIndex]*binomialProb(weightIndex, probs.length, genotypeBalances[hypothesis]);
//if (hypothesis == 0) {
// System.out.println("DEBUG1: " + hypothesis + " " + weightIndex + " " + obsWeights[weightIndex] + " " + binomialProb(weightIndex, probs.length, genotypeBalances[hypothesis]));
//}
if (hypothesis == 0) {
//System.out.println(weightIndex + " " + probs.length + " " + genotypeBalances[hypothesis] + " " + binomialProb(weightIndex, probs.length, genotypeBalances[hypothesis]));
}
double binomialWeight = binomialProb(weightIndex, probs.length, genotypeBalances[hypothesis]);;
if (hypothesis == 0 && weightIndex < obsWeights.length/4) {
binomialWeight = 1.0;
}
posteriors[hypothesis] += obsWeights[weightIndex]*binomialWeight;
}
posteriors[hypothesis] *= genotypePriors[hypothesis];
System.out.printf(" Hypothesis %d %f %f %f\n", hypothesis, genotypeBalances[hypothesis], posteriors[hypothesis], Math.log10(posteriors[hypothesis]/posteriors[0]));
double refLikelihood = Double.isInfinite(Math.log10(posteriors[0])) ? -10000.0 : Math.log10(posteriors[0]);
//System.out.println(" Hypothesis " + hypothesis + " " + genotypeBalances[hypothesis] + " " + posteriors[hypothesis] + " " + (Math.log10(posteriors[hypothesis]) - refLikelihood));
if (posteriors[hypothesis] > pBest) {
qhat = genotypeBalances[hypothesis];
qstar = genotypeBalances[hypothesis];
lodVsRef = Math.log10(posteriors[hypothesis]/posteriors[0]);
lodVsRef = Math.log10(posteriors[hypothesis]) - refLikelihood;
pBest = posteriors[hypothesis];
}
}
double pRef = posteriors[0];
System.out.println("\n");
return new AlleleFrequencyEstimate(context.getLocation(),
ref,
BaseUtils.baseIndexToSimpleBase(altBaseIndex),
2,
qhat,
qstar,
lodVsRef,
lodVsNextBest,
pBest,
pRef,
probs.length,
ReadBackedPileup.basePileupAsString(context.getReads(), context.getOffsets()),
probs,
posteriors);
double[] sposteriors = Arrays.copyOf(posteriors, posteriors.length);
Arrays.sort(sposteriors);
double bestLogPosterior = (Double.isInfinite(Math.log10(sposteriors[2]))) ? -10000.0 : Math.log10(sposteriors[2]);
double nextBestLogPosterior = (Double.isInfinite(Math.log10(sposteriors[1]))) ? -10000.0 : Math.log10(sposteriors[1]);
double lodVsNextBest = bestLogPosterior - nextBestLogPosterior;
//System.out.println("\n");
AlleleFrequencyEstimate freq = new AlleleFrequencyEstimate(context.getLocation(),
ref,
BaseUtils.baseIndexToSimpleBase(altBaseIndex),
2,
qhat,
qstar,
(Math.abs(qhat - genotypeBalances[0]) < 0.001 ? -lodVsNextBest : lodVsRef),
lodVsNextBest,
pBest,
pRef,
probs.length,
ReadBackedPileup.basePileupAsString(context.getReads(), context.getOffsets()),
probs,
posteriors);
for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) {
if ( datum != null ) {
if ( datum instanceof rodGFF ) {
rodGFF hapmap_chip_genotype = (rodGFF) datum;
//System.out.println(hapmap_chip_genotype.toString());
//System.out.println(freq.asTabularString());
}
}
}
return freq;
}
return null;
@ -136,11 +178,13 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
// k - number of successes
// n - number of Bernoulli trials
// p - probability of success
double dbinom = 0.0;
if ((n*p < 5) && (n*(1-p) < 5))
{
// For small n and the edges, compute it directly.
return (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
//return (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
dbinom = (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
}
else
{
@ -151,8 +195,10 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
double check = (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
double residual = ans - check;
return check;
dbinom = check;
}
return (dbinom < 0.0) ? 0.0 : dbinom;
}
long nchoosek(long n, long k) {
@ -169,11 +215,11 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
private double[] getObservationWeights(double[][] probs, int refBaseIndex, int altBaseIndex) {
//if (probs.length <= 10)
//{
// return getWeightTableTraces(getWeightTable(probs, refBaseIndex, altBaseIndex, probs.length));
return getWeightTableTraces(getWeightTable(probs, refBaseIndex, altBaseIndex, probs.length));
//}
//else
//{
return FastObservationWeights(probs, refBaseIndex, altBaseIndex);
// return FastObservationWeights(probs, refBaseIndex, altBaseIndex);
//}
}
@ -235,7 +281,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
paths = Utils.PermuteList(paths, permutation);
likelihoods = Utils.PermuteList(likelihoods, permutation);
while ((output_paths.size() < 10) && (paths.size() > 0))
while (/*(output_paths.size() < 10) &&*/ (paths.size() > 0))
{
// 4. Choose the next best path
int[] next_best_path = paths.get(paths.size()-1);
@ -291,22 +337,28 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
likelihoods = Utils.PermuteList(likelihoods, permutation);
}
double[] ans = new double[probs.length+1];
for (int i = 0; i < output_paths.size(); i++)
{
int[] path = output_paths.get(i);
double likelihood = output_likelihoods.get(i);
int altCount = 0;
System.out.printf("DBG %d ", i);
//System.out.printf("DBG %d ", i);
for (int j = 0; j < path.length; j++)
{
System.out.printf("%c", BaseUtils.baseIndexToSimpleBase(path[j]));
}
System.out.printf(" %f\n", likelihood);
}
System.out.printf("\n");
//System.out.printf("%c", BaseUtils.baseIndexToSimpleBase(path[j]));
if (path[j] != ref) { altCount++; }
}
//System.out.printf(" %f\n", likelihood);
ans[altCount] += Math.pow(10.0, likelihood);
//System.out.println(altCount + " " + likelihood + " " + Math.pow(10.0, likelihood));
}
//System.out.printf("\n");
double[] ans = new double[probs.length+1];
for (int i = 0; i < ans.length; i++) { ans[i] = 1.0; }
return ans;
}
@ -389,7 +441,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
}
G.ApplyPrior(ref, Double.NaN);
System.out.printf("%s %s %s %s\n", context.getLocation(), ref, bases, G.toString(ref), rodString);
//System.out.printf("%s %s %s %s\n", context.getLocation(), ref, bases, G.toString(ref), rodString);
return G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods);
}