Removed some unused code. Don't apply the second-base test if the coverage is too high, since the binomial probabilities explode and return NaN or Infinite values.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@961 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-06-10 08:27:06 +00:00
parent a12ed404ce
commit 87ba8b3451
1 changed file with 19 additions and 215 deletions

View File

@ -23,38 +23,27 @@ import java.util.*;
public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, String>
{
@Argument(fullName="calls", shortName="calls", doc="File to write SNP calls to", required=true) public String callsFileName;
@Argument(fullName="metrics", shortName="met", doc="metrics", required=false) public String metricsFileName = "/dev/null";
@Argument(fullName="metInterval", shortName="mi", doc="metInterval", required=false) public Integer metricsInterval = 50000;
@Argument(fullName="printMetrics", shortName="printMetrics", doc="printMetrics", required=false) public Boolean printMetrics = true;
@Argument(fullName="lodThreshold", shortName="lod", doc="lodThreshold", required=false) public Double lodThreshold = 5.0;
@Argument(fullName="fourBaseMode", shortName="fb", doc="fourBaseMode", required=false) public Boolean fourBaseMode = false;
@Argument(fullName="retest", shortName="re", doc="retest", required=false) public Boolean retest = false;
@Argument(fullName="call_indels", shortName="call_indels", doc="Call Indels", required=false) public Boolean call_indels = false;
@Argument(fullName="qHom", shortName="qHom", doc="qHom", required=false) public Double qHom = 0.04;
@Argument(fullName="qHet", shortName="qHet", doc="qHet", required=false) public Double qHet = 0.49;
@Argument(fullName="qHomNonRef", shortName="qHomNonRef", doc="qHomNonRef", required=false) public Double qHomNonRef = 0.97;
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null;
@Argument(fullName="calls", shortName="calls", doc="File to write SNP calls to", required=true) public String callsFileName;
@Argument(fullName="metrics", shortName="met", doc="metrics", required=false) public String metricsFileName = "/dev/null";
@Argument(fullName="metInterval", shortName="mi", doc="metInterval", required=false) public Integer metricsInterval = 50000;
@Argument(fullName="printMetrics", shortName="printMetrics", doc="printMetrics", required=false) public Boolean printMetrics = true;
@Argument(fullName="lodThreshold", shortName="lod", doc="lodThreshold", required=false) public Double lodThreshold = 5.0;
@Argument(fullName="fourBaseMode", shortName="fb", doc="fourBaseMode", required=false) public Boolean fourBaseMode = false;
@Argument(fullName="call_indels", shortName="call_indels", doc="Call Indels", required=false) public Boolean call_indels = false;
@Argument(fullName="refPrior", shortName="refPrior", required=false, doc="Prior likelihood of the reference theory") public double REF_PRIOR = 0.999;
@Argument(fullName="hetVarPrior", shortName="hetVarPrior", required=false, doc="Prior likelihood of a heterozygous variant theory") public double HETVAR_PRIOR = 1e-3;
@Argument(fullName="homVarPrior", shortName="homVarPrior", required=false, doc="Prior likelihood of the homozygous variant theory") public double HOMVAR_PRIOR = 1e-5;
@Argument(fullName="geli", shortName="geli", doc="If true, output will be in Geli/Picard format", required=false) public boolean GeliOutputFormat = false;
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", doc="sample_name_regex", required=false) public String SAMPLE_NAME_REGEX = null;
@Argument(fullName="geli", shortName="geli", required=false, doc="If true, output will be in Geli/Picard format")
public boolean GeliOutputFormat = false;
// todo: enable argument after fixing all of the hard-codings in GenotypeLikelihoods and other places
//@Argument(fullName="refPrior", shortName="refPrior", required=false, doc="Prior likelihood of the reference theory")
public double REF_PRIOR = 0.999;
//@Argument(fullName="hetVarPrior", shortName="hetVarPrior", required=false, doc="Prior likelihood of a heterozygous variant theory")
public double HETVAR_PRIOR = 1e-3;
//@Argument(fullName="homVarPrior", shortName="homVarPrior", required=false, doc="Prior likelihood of the homozygous variant theory")
public double HOMVAR_PRIOR = 1e-5;
public AlleleMetrics metrics;
public PrintStream calls_file;
public PrintStream calls_file;
public String sample_name;
// Locus filter hook: accept every site — no pre-filtering is done by this walker.
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; }
// This walker needs the aligned reads at each locus, not just the reference context.
public boolean requiresReads() { return true; }
public void initialize()
{
try
@ -99,7 +88,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
}
}
AlleleFrequencyEstimate freq = getOneProbAlleleFrequency(ref, context, rodString, sample_name);
AlleleFrequencyEstimate freq = getAlleleFrequency(ref, context, rodString, sample_name);
if (printMetrics) {
if (freq != null) { metrics.nextPosition(freq, tracker); }
@ -109,192 +98,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
return freq;
}
// Four-base-probability genotyping: estimates the allele frequency at this locus from
// each read's full probability distribution over the four bases, rather than from a
// single best base call per read. Returns null when either the reference base or a
// candidate alternate base cannot be resolved to a valid base index.
// NOTE(review): the tracker and rodString parameters are not used in this body.
private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString, RefMetaDataTracker tracker) {
// Per-read base probability distributions (rows = reads, columns = the 4 bases).
double[][] probs = ReadBackedPileup.probDistPileup(context.getReads(), context.getOffsets());
int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
// Alternate allele candidate: the most frequent non-reference base by fractional counts.
int altBaseIndex = getMostFrequentNonRefBase(getFractionalCounts(probs), refBaseIndex);
if (refBaseIndex >= 0 && altBaseIndex >= 0) {
// Set the priors for the hom-ref, het, and non-hom ref (we don't evaluate the
// possibility of a non-ref het).
//double[] genotypePriors = { 0.999, 1e-3, 1e-5 };
double[] genotypePriors = { REF_PRIOR, HETVAR_PRIOR, HOMVAR_PRIOR };
// I'm not sure what the difference between qhat and qstar is in AlleleMetrics, but
// to make my life simpler, I evaluate the non-ref balances in genotypeBalances and
// report the balances in reportingBalances.
double[] genotypeBalances = { qHom, qHet, qHomNonRef };
double[] reportingBalances = { 0.0, 0.5, 1.0 };
// Compute the probability of the distribution of second-best bases (should be random)
int[] secondaryBaseCounts = getSecondaryBaseCounts(probs);
double[] secondaryBaseProbs = new double[4];
for (int baseIndex = 0; baseIndex < 4; baseIndex++) {
// Uniform expectation: under the null, second-best bases are evenly distributed.
secondaryBaseProbs[baseIndex] = 0.25;
}
// NOTE(review): secondaryBaseDistProb is computed but never used below — dead value?
double secondaryBaseDistProb = MathUtils.multinomialProbability(secondaryBaseCounts, secondaryBaseProbs);
//System.out.printf("%d %d %d %d ", secondaryBaseCounts[0], secondaryBaseCounts[1], secondaryBaseCounts[2], secondaryBaseCounts[3]);
//System.out.println(secondaryBaseDistProb);
// Determine the probability distribution of non-ref alleles
double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
// Convolve them with the binomial sampling probability distribution in order to
// marginalize the non-ref allele balance.
double[] posteriors = new double[3];
double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, pBest = Double.MIN_VALUE;
// Evaluate the three genotype hypotheses: 0 = hom-ref, 1 = het, 2 = hom-nonref.
for (int hypothesis = 0; hypothesis < 3; hypothesis++) {
posteriors[hypothesis] = 0.0;
// Marginalize over all possible alt-observation counts (0..reads) with a
// binomial weight at this hypothesis's expected allele balance.
for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) {
double binomialWeight = MathUtils.binomialProbability(weightIndex, probs.length, genotypeBalances[hypothesis]);
// The hom-ref posterior probability distribution may be too weak to
// effectively account for noise. Just set it to equal weighting for
// the lower quantile of the non-ref balance range.
//if (hypothesis == 0 && weightIndex < obsWeights.length/4) {
// binomialWeight = 1.0;
//}
posteriors[hypothesis] += obsWeights[weightIndex]*binomialWeight;
}
posteriors[hypothesis] *= genotypePriors[hypothesis];
// Reference log-likelihood, clamped to -10000 when posteriors[0] underflows to 0.
// NOTE(review): posteriors[0] is finalized in iteration 0, so this is loop-invariant
// afterwards and could be hoisted out of the loop.
double refLikelihood = Double.isInfinite(Math.log10(posteriors[0])) ? -10000.0 : Math.log10(posteriors[0]);
// Track the best-scoring hypothesis and its LOD versus the reference theory.
if (posteriors[hypothesis] > pBest) {
qhat = reportingBalances[hypothesis];
qstar = reportingBalances[hypothesis];
lodVsRef = Math.log10(posteriors[hypothesis]) - refLikelihood;
pBest = posteriors[hypothesis];
}
}
double pRef = posteriors[0];
// Sort a copy of the posteriors to compute best-vs-next-best LOD; same -10000
// clamping protects against log10(0) = -Infinity.
double[] sposteriors = Arrays.copyOf(posteriors, posteriors.length);
Arrays.sort(sposteriors);
double bestLogPosterior = (Double.isInfinite(Math.log10(sposteriors[2]))) ? -10000.0 : Math.log10(sposteriors[2]);
double nextBestLogPosterior = (Double.isInfinite(Math.log10(sposteriors[1]))) ? -10000.0 : Math.log10(sposteriors[1]);
double lodVsNextBest = bestLogPosterior - nextBestLogPosterior;
// Package the call: for hom-ref calls (qhat == 0.0) report the LOD against the
// next-best theory instead of against the reference itself.
AlleleFrequencyEstimate freq = new AlleleFrequencyEstimate(context.getLocation(),
ref,
BaseUtils.baseIndexToSimpleBase(altBaseIndex),
2,
qhat,
qstar,
(MathUtils.compareDoubles(qhat, reportingBalances[0]) == 0 ? lodVsNextBest : lodVsRef),
lodVsNextBest,
pBest,
pRef,
probs.length,
ReadBackedPileup.basePileupAsString(context.getReads(), context.getOffsets()),
probs,
posteriors,
"unknown_sample");
return freq;
}
// No usable ref/alt base pair at this locus.
return null;
}
// Collapses the per-read ref/alt probability table over all reads into marginal
// weights indexed by the possible number of alt-allele observations (0..numReads).
private double[] getObservationWeights(double[][] probs, int refBaseIndex, int altBaseIndex) {
    double[][] fullTable = getWeightTable(probs, refBaseIndex, altBaseIndex, probs.length);
    return getWeightTableTraces(fullTable);
}
// Dynamic-programming construction over the first numReadsToConsider reads of a
// two-column table (column 0 = ref contribution, column 1 = alt contribution)
// whose anti-diagonal traces weight each possible alt-observation count.
// Iterative formulation of the original tail-growing recursion.
private double[][] getWeightTable(double[][] probs, int refBaseIndex, int altBaseIndex, int numReadsToConsider) {
    // Base case: a single read contributes its own ref/alt probabilities.
    double[][] table = new double[1][2];
    table[0][0] = probs[0][refBaseIndex];
    table[0][1] = probs[0][altBaseIndex];
    // Fold in one additional read per iteration.
    for (int readCount = 2; readCount <= numReadsToConsider; readCount++) {
        double[] traces = getWeightTableTraces(table);
        double[][] grown = new double[readCount][2];
        int traceElement = traces.length - 1;
        for (int row = 0; row < grown.length; row++, traceElement--) {
            grown[row][0] = traces[traceElement] * probs[readCount - 1][refBaseIndex];
            grown[row][1] = traces[traceElement] * probs[readCount - 1][altBaseIndex];
        }
        table = grown;
    }
    return table;
}
// Sums the anti-diagonals of the two-column weight table: the endpoints come from
// a single cell each (all-ref and all-alt), interior entries combine the ref cell
// and alt cell that lie on the same anti-diagonal.
private double[] getWeightTableTraces(double[][] partialProbTable) {
    int rows = partialProbTable.length;
    double[] traces = new double[rows + 1];
    traces[0] = partialProbTable[rows - 1][0];
    traces[rows] = partialProbTable[0][1];
    for (int element = 1; element <= rows - 1; element++) {
        double refPart = partialProbTable[rows - element - 1][0];
        double altPart = partialProbTable[rows - element][1];
        traces[element] = refPart + altPart;
    }
    return traces;
}
// Soft base counts: accumulates each read's probability mass per base, yielding
// fractional (expected) counts for the four bases across the whole pileup.
private double[] getFractionalCounts(double[][] probs) {
    double[] fractionalCounts = new double[4];
    for (double[] readProbs : probs) {
        for (int base = 0; base < 4; base++) {
            fractionalCounts[base] += readProbs[base];
        }
    }
    return fractionalCounts;
}
// Returns the base index (0..3) with the highest fractional count, skipping the
// reference base; ties keep the earliest index, matching the strict '>' comparison.
private int getMostFrequentNonRefBase(double[] fractionalCounts, int refBaseIndex) {
    int bestAltBaseIndex = -1;
    double bestCount = -1.0;
    for (int base = 0; base < 4; base++) {
        if (base == refBaseIndex) {
            continue; // never propose the reference base as the alternate
        }
        if (fractionalCounts[base] > bestCount) {
            bestCount = fractionalCounts[base];
            bestAltBaseIndex = base;
        }
    }
    return bestAltBaseIndex;
}
// For each read, identifies the base whose probability is second-highest and
// tallies how often each base occupies that second-best slot across the pileup.
private int[] getSecondaryBaseCounts(double[][] probs) {
    int[] secondaryBaseCounts = new int[4];
    for (double[] readProbs : probs) {
        // Highest probability in this read's row (scanned over the row's full
        // width, as in the original).
        double bestProb = 0.0;
        for (int baseIndex = 0; baseIndex < readProbs.length; baseIndex++) {
            bestProb = Math.max(bestProb, readProbs[baseIndex]);
        }
        // Second-best: largest value strictly below the best. Defaults to base 0
        // when no such value exists (e.g. all probabilities equal).
        double secondBestProb = 0.0;
        int secondBestBaseIndex = 0;
        for (int baseIndex = 0; baseIndex < 4; baseIndex++) {
            double p = readProbs[baseIndex];
            if (p > secondBestProb && p < bestProb) {
                secondBestProb = p;
                secondBestBaseIndex = baseIndex;
            }
        }
        secondaryBaseCounts[secondBestBaseIndex]++;
    }
    return secondaryBaseCounts;
}
private AlleleFrequencyEstimate getOneProbAlleleFrequency(char ref, LocusContext context, String rodString, String sample_name) {
private AlleleFrequencyEstimate getAlleleFrequency(char ref, LocusContext context, String rodString, String sample_name) {
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
String bases = pileup.getBases();
@ -333,8 +137,8 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
}
G.ApplyPrior(ref, this.alt_allele, this.allele_frequency_prior);
if (fourBaseMode) {
G.applyFourBaseDistributionPrior(pileup.getBases(), pileup.getSecondaryBasePileup());
if (fourBaseMode && pileup.getBases().length() < 750) {
G.applySecondBaseDistributionPrior(pileup.getBases(), pileup.getSecondaryBasePileup());
}
return G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, sample_name);