Intermediate commit while working on getting four-base probabilities to work in the single-sample genotyper. Adds infrastructure for the new combinatorial approach and for choosing the best base more intelligently, given a probability distribution over bases and the reference base.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@492 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4cafb95be8
commit
ffcd672c1c
|
|
@ -5,6 +5,10 @@ import org.broadinstitute.sting.gatk.refdata.*;
|
||||||
import org.broadinstitute.sting.gatk.walkers.*;
|
import org.broadinstitute.sting.gatk.walkers.*;
|
||||||
|
|
||||||
import org.broadinstitute.sting.playground.utils.*;
|
import org.broadinstitute.sting.playground.utils.*;
|
||||||
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
|
@ -13,59 +17,232 @@ import java.util.List;
|
||||||
// Draft single sample genotyper
|
// Draft single sample genotyper
|
||||||
// j.maguire 3-7-2009
|
// j.maguire 3-7-2009
|
||||||
|
|
||||||
public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, Integer>
|
public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, Integer> {
|
||||||
{
|
@Argument(fullName="fourBaseMode",required=false,defaultValue="false")
|
||||||
AlleleMetrics metrics;
|
public Boolean fourBaseMode;
|
||||||
|
|
||||||
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context)
|
@Argument(fullName="decideOnBase",required=false,defaultValue="false")
|
||||||
{
|
public Boolean decideOnBase;
|
||||||
return true; // We are keeping all the reads
|
|
||||||
|
private AlleleMetrics metrics;
|
||||||
|
|
||||||
|
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; } // We are keeping all the reads
|
||||||
|
public boolean requiresReads() { return true; }
|
||||||
|
public void initialize() { metrics = new AlleleMetrics("metrics.out"); }
|
||||||
|
|
||||||
|
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
|
String rodString = getRodString(tracker);
|
||||||
|
|
||||||
|
AlleleFrequencyEstimate freq = null;
|
||||||
|
if (fourBaseMode) {
|
||||||
|
// Compute four-base prob genotype likelihoods
|
||||||
|
freq = getFourProbAlleleFrequency(ref, context, rodString);
|
||||||
|
} else if (decideOnBase) {
|
||||||
|
} else {
|
||||||
|
// Compute single quality score genotype likelihoods
|
||||||
|
freq = getOneProbAlleleFrequency(ref, context, rodString);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (freq != null) { metrics.nextPosition(freq, tracker); }
|
||||||
|
metrics.printMetricsAtLocusIntervals(1000);
|
||||||
|
|
||||||
|
return freq;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean requiresReads() { return true; }
|
private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString) {
|
||||||
|
/*
|
||||||
|
P(D)*P(G,q|D) = P(D|G,q)*P(G,q)
|
||||||
|
= P(D|q)*P(q|G)*P(G)
|
||||||
|
|
||||||
public void initialize()
|
P(G) = { 0.999 (hom-ref), 1e-3 (het), 1e-5 (hom-nonref) }
|
||||||
{
|
|
||||||
metrics = new AlleleMetrics("metrics.out");
|
|
||||||
}
|
|
||||||
|
|
||||||
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
|
n
|
||||||
{
|
P(D|q)P(q|G) = product [ P_i(A)*B(i, n, G) ]
|
||||||
List<SAMRecord> reads = context.getReads();
|
i=1
|
||||||
List<Integer> offsets = context.getOffsets();
|
*/
|
||||||
String bases = "";
|
|
||||||
String quals = "";
|
|
||||||
|
|
||||||
ref = Character.toUpperCase(ref);
|
double[][] probs = ReadBackedPileup.probDistPileup(context.getReads(), context.getOffsets());
|
||||||
|
int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
|
||||||
|
int altBaseIndex = getMostFrequentNonRefBase(getFractionalCounts(probs), refBaseIndex);
|
||||||
|
|
||||||
String rodString = "";
|
if (refBaseIndex >= 0 && altBaseIndex >= 0) {
|
||||||
// Look up dbsnp priors
|
System.out.println(context.getLocation().toString() + " " + refBaseIndex + " " + altBaseIndex + " " + probs.length + " " + rodString);
|
||||||
for ( ReferenceOrderedDatum datum : tracker.getAllRods() )
|
for (int i = 0; i < probs.length; i++) {
|
||||||
{
|
System.out.printf(" [ %4.4f %4.4f %4.4f %4.4f ]\n", probs[i][0], probs[i][1], probs[i][2], probs[i][3]);
|
||||||
if ( datum != null )
|
}
|
||||||
{
|
|
||||||
if ( datum instanceof rodDbSNP)
|
double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
|
||||||
{
|
|
||||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
System.out.print(" Weights: ");
|
||||||
rodString += dbsnp.toString();
|
for (int i = 0; i < obsWeights.length; i++) {
|
||||||
|
System.out.printf("%4.4f ", obsWeights[i]);
|
||||||
|
}
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
double[] genotypePriors = { 0.999, 1e-3, 1e-5 };
|
||||||
|
double[] genotypeBalances = { 0.02, 0.5, 0.98 };
|
||||||
|
|
||||||
|
double[] posteriors = new double[3];
|
||||||
|
double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, lodVsNextBest = 0.0, pBest = Double.MIN_VALUE;
|
||||||
|
|
||||||
|
for (int hypothesis = 0; hypothesis < 3; hypothesis++) {
|
||||||
|
posteriors[hypothesis] = 0.0;
|
||||||
|
|
||||||
|
for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) {
|
||||||
|
posteriors[hypothesis] += obsWeights[weightIndex]*binomialProb(weightIndex, probs.length, genotypeBalances[hypothesis]);
|
||||||
}
|
}
|
||||||
else
|
posteriors[hypothesis] *= genotypePriors[hypothesis];
|
||||||
{
|
|
||||||
rodString += datum.toSimpleString();
|
System.out.printf(" Hypothesis %d %f %f %f\n", hypothesis, genotypeBalances[hypothesis], posteriors[hypothesis], Math.log10(posteriors[hypothesis]/posteriors[0]));
|
||||||
|
|
||||||
|
if (posteriors[hypothesis] > pBest) {
|
||||||
|
qhat = genotypeBalances[hypothesis];
|
||||||
|
qstar = genotypeBalances[hypothesis];
|
||||||
|
lodVsRef = Math.log10(posteriors[hypothesis]/posteriors[0]);
|
||||||
|
pBest = posteriors[hypothesis];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double pRef = posteriors[0];
|
||||||
|
|
||||||
|
System.out.println("\n");
|
||||||
|
|
||||||
|
return new AlleleFrequencyEstimate(context.getLocation(),
|
||||||
|
ref,
|
||||||
|
BaseUtils.baseIndexToSimpleBase(altBaseIndex),
|
||||||
|
2,
|
||||||
|
qhat,
|
||||||
|
qstar,
|
||||||
|
lodVsRef,
|
||||||
|
lodVsNextBest,
|
||||||
|
pBest,
|
||||||
|
pRef,
|
||||||
|
probs.length,
|
||||||
|
ReadBackedPileup.basePileupAsString(context.getReads(), context.getOffsets()),
|
||||||
|
probs,
|
||||||
|
posteriors);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
double binomialProb(long k, long n, double p) {
|
||||||
|
// k - number of successes
|
||||||
|
// n - number of Bernoulli trials
|
||||||
|
// p - probability of success
|
||||||
|
|
||||||
|
if ((n*p < 5) && (n*(1-p) < 5))
|
||||||
|
{
|
||||||
|
// For small n and the edges, compute it directly.
|
||||||
|
return (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// For large n, approximate with a gaussian.
|
||||||
|
double mean = (double)(n*p);
|
||||||
|
double var = Math.sqrt((double)(n*p)*(1.0-p));
|
||||||
|
double ans = (double)(1.0 / (var*Math.sqrt(2*Math.PI)))*Math.exp(-1.0 * Math.pow((double)k-mean,2)/(2.0*var*var));
|
||||||
|
double check = (double)nchoosek(n, k) * Math.pow(p, k) * Math.pow(1-p, n-k);
|
||||||
|
double residual = ans - check;
|
||||||
|
|
||||||
|
return check;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
long nchoosek(long n, long k) {
|
||||||
|
long m = n - k;
|
||||||
|
if (k < m)
|
||||||
|
k = m;
|
||||||
|
|
||||||
|
long t = 1;
|
||||||
|
for (long i = n, j = 1; i > k; i--, j++)
|
||||||
|
t = t * i / j;
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
|
||||||
|
private double[] getObservationWeights(double[][] probs, int refBaseIndex, int altBaseIndex) {
|
||||||
|
if (probs.length <= 10) {
|
||||||
|
return getWeightTableTraces(getWeightTable(probs, refBaseIndex, altBaseIndex, probs.length));
|
||||||
|
}
|
||||||
|
|
||||||
|
return new double[probs.length + 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
private double[][] getWeightTable(double[][] probs, int refBaseIndex, int altBaseIndex, int numReadsToConsider) {
|
||||||
|
if (numReadsToConsider == 1) {
|
||||||
|
double[][] partialProbTable = new double[1][2];
|
||||||
|
partialProbTable[0][0] = probs[0][refBaseIndex];
|
||||||
|
partialProbTable[0][1] = probs[0][altBaseIndex];
|
||||||
|
|
||||||
|
return partialProbTable;
|
||||||
|
}
|
||||||
|
|
||||||
|
double[][] oldPartialProbTable = getWeightTable(probs, refBaseIndex, altBaseIndex, numReadsToConsider - 1);
|
||||||
|
double[] traces = getWeightTableTraces(oldPartialProbTable);
|
||||||
|
|
||||||
|
double[][] newPartialProbTable = new double[numReadsToConsider][2];
|
||||||
|
for (int row = 0, traceElement = traces.length - 1; row < newPartialProbTable.length; row++, traceElement--) {
|
||||||
|
newPartialProbTable[row][0] = traces[traceElement]*probs[numReadsToConsider - 1][refBaseIndex];
|
||||||
|
newPartialProbTable[row][1] = traces[traceElement]*probs[numReadsToConsider - 1][altBaseIndex];
|
||||||
|
}
|
||||||
|
|
||||||
|
return newPartialProbTable;
|
||||||
|
}
|
||||||
|
|
||||||
|
private double[] getWeightTableTraces(double[][] partialProbTable) {
|
||||||
|
double[] traces = new double[partialProbTable.length + 1];
|
||||||
|
|
||||||
|
traces[0] = partialProbTable[partialProbTable.length - 1][0];
|
||||||
|
traces[partialProbTable.length] = partialProbTable[0][1];
|
||||||
|
|
||||||
|
for (int element = 1; element < traces.length - 1; element++) {
|
||||||
|
traces[element] = partialProbTable[partialProbTable.length - element - 1][0] +
|
||||||
|
partialProbTable[partialProbTable.length - element][1];
|
||||||
|
}
|
||||||
|
|
||||||
|
return traces;
|
||||||
|
}
|
||||||
|
|
||||||
|
private double[] getFractionalCounts(double[][] probs) {
|
||||||
|
double[] fractionalCounts = new double[4];
|
||||||
|
|
||||||
|
for (int i = 0; i < probs.length; i++) {
|
||||||
|
for (int j = 0; j < 4; j++) {
|
||||||
|
fractionalCounts[j] += probs[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return fractionalCounts;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int getMostFrequentNonRefBase(double[] fractionalCounts, int refBaseIndex) {
|
||||||
|
double maxFractionalCounts = -1.0;
|
||||||
|
int bestAltBaseIndex = -1;
|
||||||
|
for (int altBaseIndex = 0; altBaseIndex < 4; altBaseIndex++) {
|
||||||
|
if (altBaseIndex != refBaseIndex) {
|
||||||
|
if (fractionalCounts[altBaseIndex] > maxFractionalCounts) {
|
||||||
|
maxFractionalCounts = fractionalCounts[altBaseIndex];
|
||||||
|
bestAltBaseIndex = altBaseIndex;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ( rodString != "" )
|
|
||||||
rodString = "[ROD: " + rodString + "]";
|
|
||||||
|
|
||||||
// Accumulate genotype likelihoods
|
return bestAltBaseIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
private AlleleFrequencyEstimate getOneProbAlleleFrequency(char ref, LocusContext context, String rodString) {
|
||||||
|
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||||
|
String bases = pileup.getBases();
|
||||||
|
|
||||||
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
List<Integer> offsets = context.getOffsets();
|
||||||
|
ref = Character.toUpperCase(ref);
|
||||||
|
|
||||||
GenotypeLikelihoods G = new GenotypeLikelihoods();
|
GenotypeLikelihoods G = new GenotypeLikelihoods();
|
||||||
for ( int i = 0; i < reads.size(); i++ )
|
for ( int i = 0; i < reads.size(); i++ ) {
|
||||||
{
|
|
||||||
SAMRecord read = reads.get(i);
|
SAMRecord read = reads.get(i);
|
||||||
int offset = offsets.get(i);
|
int offset = offsets.get(i);
|
||||||
bases += read.getReadString().charAt(offset);
|
|
||||||
quals += read.getBaseQualityString().charAt(offset);
|
|
||||||
|
|
||||||
G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
|
G.add(ref, read.getReadString().charAt(offset), read.getBaseQualities()[offset]);
|
||||||
}
|
}
|
||||||
|
|
@ -73,18 +250,28 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
|
||||||
|
|
||||||
System.out.printf("%s %s %s %s\n", context.getLocation(), ref, bases, G.toString(ref), rodString);
|
// Five arguments need five specifiers; with only four, printf silently dropped rodString.
System.out.printf("%s %s %s %s %s\n", context.getLocation(), ref, bases, G.toString(ref), rodString);
|
||||||
|
|
||||||
AlleleFrequencyEstimate freq = G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods);
|
return G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods);
|
||||||
|
}
|
||||||
|
|
||||||
metrics.nextPosition(freq, tracker);
|
private String getRodString(RefMetaDataTracker tracker) {
|
||||||
metrics.printMetricsAtLocusIntervals(1000);
|
String rodString = "";
|
||||||
|
|
||||||
return freq;
|
for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) {
|
||||||
|
if ( datum != null ) {
|
||||||
|
if ( datum instanceof rodDbSNP) {
|
||||||
|
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||||
|
rodString += dbsnp.toString();
|
||||||
|
} else {
|
||||||
|
rodString += datum.toSimpleString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test the content, not the reference: != on Strings compares object identity,
// so an empty string produced by concatenation would still be wrapped.
if ( rodString.length() > 0 ) { rodString = "[ROD: " + rodString + "]"; }
|
||||||
|
return rodString;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Given result of map function
|
// Given result of map function
|
||||||
public Integer reduceInit() { return 0; }
|
public Integer reduceInit() { return 0; }
|
||||||
public Integer reduce(AlleleFrequencyEstimate value, Integer sum)
|
public Integer reduce(AlleleFrequencyEstimate value, Integer sum) { return 0; }
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue