Intermediate commit while working on getting four-base probs to work in the single sample genotyper. Has infrastructure for the new combinatorial approach and just choosing the best base more intelligently given a probability distribution over bases and the reference base.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@492 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4cafb95be8
commit
ffcd672c1c
|
|
@ -5,6 +5,10 @@ import org.broadinstitute.sting.gatk.refdata.*;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
|
||||
import org.broadinstitute.sting.playground.utils.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
|
|
@ -13,59 +17,232 @@ import java.util.List;
|
|||
// Draft single sample genotyper
|
||||
// j.maguire 3-7-2009
|
||||
|
||||
public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, Integer>
|
||||
{
|
||||
AlleleMetrics metrics;
|
||||
public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, Integer> {
|
||||
@Argument(fullName="fourBaseMode",required=false,defaultValue="false")
|
||||
public Boolean fourBaseMode;
|
||||
|
||||
@Argument(fullName="decideOnBase",required=false,defaultValue="false")
|
||||
public Boolean decideOnBase;
|
||||
|
||||
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context)
|
||||
{
|
||||
return true; // We are keeping all the reads
|
||||
private AlleleMetrics metrics;
|
||||
|
||||
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; } // We are keeping all the reads
|
||||
public boolean requiresReads() { return true; }
|
||||
public void initialize() { metrics = new AlleleMetrics("metrics.out"); }
|
||||
|
||||
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
String rodString = getRodString(tracker);
|
||||
|
||||
AlleleFrequencyEstimate freq = null;
|
||||
if (fourBaseMode) {
|
||||
// Compute four-base prob genotype likelihoods
|
||||
freq = getFourProbAlleleFrequency(ref, context, rodString);
|
||||
} else if (decideOnBase) {
|
||||
} else {
|
||||
// Compute single quality score genotype likelihoods
|
||||
freq = getOneProbAlleleFrequency(ref, context, rodString);
|
||||
}
|
||||
|
||||
if (freq != null) { metrics.nextPosition(freq, tracker); }
|
||||
metrics.printMetricsAtLocusIntervals(1000);
|
||||
|
||||
return freq;
|
||||
}
|
||||
|
||||
public boolean requiresReads() { return true; }
|
||||
private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString) {
|
||||
/*
|
||||
P(D)*P(G,q|D) = P(D|G,q)*P(G,q)
|
||||
= P(D|q)*P(q|G)*P(G)
|
||||
|
||||
public void initialize()
|
||||
{
|
||||
metrics = new AlleleMetrics("metrics.out");
|
||||
}
|
||||
P(G) = { 0.999 (hom-ref), 1e-3 (het), 1e-5 (hom-nonref) }
|
||||
|
||||
n
|
||||
P(D|q)P(q|G) = product [ P_i(A)*B(i, n, G) ]
|
||||
i=1
|
||||
*/
|
||||
|
||||
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
||||
{
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
String bases = "";
|
||||
String quals = "";
|
||||
double[][] probs = ReadBackedPileup.probDistPileup(context.getReads(), context.getOffsets());
|
||||
int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
|
||||
int altBaseIndex = getMostFrequentNonRefBase(getFractionalCounts(probs), refBaseIndex);
|
||||
|
||||
ref = Character.toUpperCase(ref);
|
||||
if (refBaseIndex >= 0 && altBaseIndex >= 0) {
|
||||
System.out.println(context.getLocation().toString() + " " + refBaseIndex + " " + altBaseIndex + " " + probs.length + " " + rodString);
|
||||
for (int i = 0; i < probs.length; i++) {
|
||||
System.out.printf(" [ %4.4f %4.4f %4.4f %4.4f ]\n", probs[i][0], probs[i][1], probs[i][2], probs[i][3]);
|
||||
}
|
||||
|
||||
double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
|
||||
|
||||
String rodString = "";
|
||||
// Look up dbsnp priors
|
||||
for ( ReferenceOrderedDatum datum : tracker.getAllRods() )
|
||||
{
|
||||
if ( datum != null )
|
||||
{
|
||||
if ( datum instanceof rodDbSNP)
|
||||
{
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
rodString += dbsnp.toString();
|
||||
System.out.print(" Weights: ");
|
||||
for (int i = 0; i < obsWeights.length; i++) {
|
||||
System.out.printf("%4.4f ", obsWeights[i]);
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
double[] genotypePriors = { 0.999, 1e-3, 1e-5 };
|
||||
double[] genotypeBalances = { 0.02, 0.5, 0.98 };
|
||||
|
||||
double[] posteriors = new double[3];
|
||||
double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, lodVsNextBest = 0.0, pBest = Double.MIN_VALUE;
|
||||
|
||||
for (int hypothesis = 0; hypothesis < 3; hypothesis++) {
|
||||
posteriors[hypothesis] = 0.0;
|
||||
|
||||
for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) {
|
||||
posteriors[hypothesis] += obsWeights[weightIndex]*binomialProb(weightIndex, probs.length, genotypeBalances[hypothesis]);
|
||||
}
|
||||
else
|
||||
{
|
||||
rodString += datum.toSimpleString();
|
||||
posteriors[hypothesis] *= genotypePriors[hypothesis];
|
||||
|
||||
System.out.printf(" Hypothesis %d %f %f %f\n", hypothesis, genotypeBalances[hypothesis], posteriors[hypothesis], Math.log10(posteriors[hypothesis]/posteriors[0]));
|
||||
|
||||
if (posteriors[hypothesis] > pBest) {
|
||||
qhat = genotypeBalances[hypothesis];
|
||||
qstar = genotypeBalances[hypothesis];
|
||||
lodVsRef = Math.log10(posteriors[hypothesis]/posteriors[0]);
|
||||
pBest = posteriors[hypothesis];
|
||||
}
|
||||
}
|
||||
|
||||
double pRef = posteriors[0];
|
||||
|
||||
System.out.println("\n");
|
||||
|
||||
return new AlleleFrequencyEstimate(context.getLocation(),
|
||||
ref,
|
||||
BaseUtils.baseIndexToSimpleBase(altBaseIndex),
|
||||
2,
|
||||
qhat,
|
||||
qstar,
|
||||
lodVsRef,
|
||||
lodVsNextBest,
|
||||
pBest,
|
||||
pRef,
|
||||
probs.length,
|
||||
ReadBackedPileup.basePileupAsString(context.getReads(), context.getOffsets()),
|
||||
probs,
|
||||
posteriors);
|
||||
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
double binomialProb(long k, long n, double p) {
|
||||
// k - number of successes
|
||||
// n - number of Bernoulli trials
|
||||
// p - probability of success
|
||||
|
||||
if ((n*p < 5) && (n*(1-p) < 5))
|
||||
{
|
||||
// For small n and the edges, compute it directly.
|
||||
return (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For large n, approximate with a gaussian.
|
||||
double mean = (double)(n*p);
|
||||
double var = Math.sqrt((double)(n*p)*(1.0-p));
|
||||
double ans = (double)(1.0 / (var*Math.sqrt(2*Math.PI)))*Math.exp(-1.0 * Math.pow((double)k-mean,2)/(2.0*var*var));
|
||||
double check = (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
|
||||
double residual = ans - check;
|
||||
|
||||
return check;
|
||||
}
|
||||
}
|
||||
|
||||
long nchoosek(long n, long k) {
|
||||
long m = n - k;
|
||||
if (k < m)
|
||||
k = m;
|
||||
|
||||
long t = 1;
|
||||
for (long i = n, j = 1; i > k; i--, j++)
|
||||
t = t * i / j;
|
||||
return t;
|
||||
}
|
||||
|
||||
private double[] getObservationWeights(double[][] probs, int refBaseIndex, int altBaseIndex) {
|
||||
if (probs.length <= 10) {
|
||||
return getWeightTableTraces(getWeightTable(probs, refBaseIndex, altBaseIndex, probs.length));
|
||||
}
|
||||
|
||||
return new double[probs.length + 1];
|
||||
}
|
||||
|
||||
private double[][] getWeightTable(double[][] probs, int refBaseIndex, int altBaseIndex, int numReadsToConsider) {
|
||||
if (numReadsToConsider == 1) {
|
||||
double[][] partialProbTable = new double[1][2];
|
||||
partialProbTable[0][0] = probs[0][refBaseIndex];
|
||||
partialProbTable[0][1] = probs[0][altBaseIndex];
|
||||
|
||||
return partialProbTable;
|
||||
}
|
||||
|
||||
double[][] oldPartialProbTable = getWeightTable(probs, refBaseIndex, altBaseIndex, numReadsToConsider - 1);
|
||||
double[] traces = getWeightTableTraces(oldPartialProbTable);
|
||||
|
||||
double[][] newPartialProbTable = new double[numReadsToConsider][2];
|
||||
for (int row = 0, traceElement = traces.length - 1; row < newPartialProbTable.length; row++, traceElement--) {
|
||||
newPartialProbTable[row][0] = traces[traceElement]*probs[numReadsToConsider - 1][refBaseIndex];
|
||||
newPartialProbTable[row][1] = traces[traceElement]*probs[numReadsToConsider - 1][altBaseIndex];
|
||||
}
|
||||
|
||||
return newPartialProbTable;
|
||||
}
|
||||
|
||||
private double[] getWeightTableTraces(double[][] partialProbTable) {
|
||||
double[] traces = new double[partialProbTable.length + 1];
|
||||
|
||||
traces[0] = partialProbTable[partialProbTable.length - 1][0];
|
||||
traces[partialProbTable.length] = partialProbTable[0][1];
|
||||
|
||||
for (int element = 1; element < traces.length - 1; element++) {
|
||||
traces[element] = partialProbTable[partialProbTable.length - element - 1][0] +
|
||||
partialProbTable[partialProbTable.length - element][1];
|
||||
}
|
||||
|
||||
return traces;
|
||||
}
|
||||
|
||||
private double[] getFractionalCounts(double[][] probs) {
|
||||
double[] fractionalCounts = new double[4];
|
||||
|
||||
for (int i = 0; i < probs.length; i++) {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
fractionalCounts[j] += probs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
return fractionalCounts;
|
||||
}
|
||||
|
||||
private int getMostFrequentNonRefBase(double[] fractionalCounts, int refBaseIndex) {
|
||||
double maxFractionalCounts = -1.0;
|
||||
int bestAltBaseIndex = -1;
|
||||
for (int altBaseIndex = 0; altBaseIndex < 4; altBaseIndex++) {
|
||||
if (altBaseIndex != refBaseIndex) {
|
||||
if (fractionalCounts[altBaseIndex] > maxFractionalCounts) {
|
||||
maxFractionalCounts = fractionalCounts[altBaseIndex];
|
||||
bestAltBaseIndex = altBaseIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( rodString != "" )
|
||||
rodString = "[ROD: " + rodString + "]";
|
||||
|
||||
// Accumulate genotype likelihoods
|
||||
GenotypeLikelihoods G = new GenotypeLikelihoods();
|
||||
for ( int i = 0; i < reads.size(); i++ )
|
||||
{
|
||||
return bestAltBaseIndex;
|
||||
}
|
||||
|
||||
private AlleleFrequencyEstimate getOneProbAlleleFrequency(char ref, LocusContext context, String rodString) {
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
String bases = pileup.getBases();
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
ref = Character.toUpperCase(ref);
|
||||
|
||||
GenotypeLikelihoods G = new GenotypeLikelihoods();
|
||||
for ( int i = 0; i < reads.size(); i++ ) {
|
||||
SAMRecord read = reads.get(i);
|
||||
int offset = offsets.get(i);
|
||||
bases += read.getReadString().charAt(offset);
|
||||
quals += read.getBaseQualityString().charAt(offset);
|
||||
|
||||
G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
|
||||
}
|
||||
|
|
@ -73,18 +250,28 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
|
|||
|
||||
System.out.printf("%s %s %s %s\n", context.getLocation(), ref, bases, G.toString(ref), rodString);
|
||||
|
||||
AlleleFrequencyEstimate freq = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods);
|
||||
return G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods);
|
||||
}
|
||||
|
||||
metrics.nextPosition(freq, tracker);
|
||||
metrics.printMetricsAtLocusIntervals(1000);
|
||||
private String getRodString(RefMetaDataTracker tracker) {
|
||||
String rodString = "";
|
||||
|
||||
return freq;
|
||||
for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) {
|
||||
if ( datum != null ) {
|
||||
if ( datum instanceof rodDbSNP) {
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
rodString += dbsnp.toString();
|
||||
} else {
|
||||
rodString += datum.toSimpleString();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( rodString != "" ) { rodString = "[ROD: " + rodString + "]"; }
|
||||
return rodString;
|
||||
}
|
||||
|
||||
// Given result of map function
|
||||
public Integer reduceInit() { return 0; }
|
||||
public Integer reduce(AlleleFrequencyEstimate value, Integer sum)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
public Integer reduce(AlleleFrequencyEstimate value, Integer sum) { return 0; }
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue