Removed some useless code. Don't apply second-base test if the coverage is too high, since the binomial probs explode and return NaN or Infinite values.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@961 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a12ed404ce
commit
87ba8b3451
|
|
@ -23,38 +23,27 @@ import java.util.*;
|
|||
|
||||
public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate, String>
|
||||
{
|
||||
@Argument(fullName="calls", shortName="calls", doc="File to write SNP calls to", required=true) public String callsFileName;
|
||||
@Argument(fullName="metrics", shortName="met", doc="metrics", required=false) public String metricsFileName = "/dev/null";
|
||||
@Argument(fullName="metInterval", shortName="mi", doc="metInterval", required=false) public Integer metricsInterval = 50000;
|
||||
@Argument(fullName="printMetrics", shortName="printMetrics", doc="printMetrics", required=false) public Boolean printMetrics = true;
|
||||
@Argument(fullName="lodThreshold", shortName="lod", doc="lodThreshold", required=false) public Double lodThreshold = 5.0;
|
||||
@Argument(fullName="fourBaseMode", shortName="fb", doc="fourBaseMode", required=false) public Boolean fourBaseMode = false;
|
||||
@Argument(fullName="retest", shortName="re", doc="retest", required=false) public Boolean retest = false;
|
||||
@Argument(fullName="call_indels", shortName="call_indels", doc="Call Indels", required=false) public Boolean call_indels = false;
|
||||
@Argument(fullName="qHom", shortName="qHom", doc="qHom", required=false) public Double qHom = 0.04;
|
||||
@Argument(fullName="qHet", shortName="qHet", doc="qHet", required=false) public Double qHet = 0.49;
|
||||
@Argument(fullName="qHomNonRef", shortName="qHomNonRef", doc="qHomNonRef", required=false) public Double qHomNonRef = 0.97;
|
||||
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null;
|
||||
@Argument(fullName="calls", shortName="calls", doc="File to write SNP calls to", required=true) public String callsFileName;
|
||||
@Argument(fullName="metrics", shortName="met", doc="metrics", required=false) public String metricsFileName = "/dev/null";
|
||||
@Argument(fullName="metInterval", shortName="mi", doc="metInterval", required=false) public Integer metricsInterval = 50000;
|
||||
@Argument(fullName="printMetrics", shortName="printMetrics", doc="printMetrics", required=false) public Boolean printMetrics = true;
|
||||
@Argument(fullName="lodThreshold", shortName="lod", doc="lodThreshold", required=false) public Double lodThreshold = 5.0;
|
||||
@Argument(fullName="fourBaseMode", shortName="fb", doc="fourBaseMode", required=false) public Boolean fourBaseMode = false;
|
||||
@Argument(fullName="call_indels", shortName="call_indels", doc="Call Indels", required=false) public Boolean call_indels = false;
|
||||
@Argument(fullName="refPrior", shortName="refPrior", required=false, doc="Prior likelihood of the reference theory") public double REF_PRIOR = 0.999;
|
||||
@Argument(fullName="hetVarPrior", shortName="hetVarPrior", required=false, doc="Prior likelihood of a heterozygous variant theory") public double HETVAR_PRIOR = 1e-3;
|
||||
@Argument(fullName="homVarPrior", shortName="homVarPrior", required=false, doc="Prior likelihood of the homozygous variant theory") public double HOMVAR_PRIOR = 1e-5;
|
||||
@Argument(fullName="geli", shortName="geli", doc="If true, output will be in Geli/Picard format", required=false) public boolean GeliOutputFormat = false;
|
||||
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", doc="sample_name_regex", required=false) public String SAMPLE_NAME_REGEX = null;
|
||||
|
||||
@Argument(fullName="geli", shortName="geli", required=false, doc="If true, output will be in Geli/Picard format")
|
||||
public boolean GeliOutputFormat = false;
|
||||
|
||||
// todo: enable argument after fixing all of the hard-codings in GenotypeLikelihoods and other places
|
||||
|
||||
//@Argument(fullName="refPrior", shortName="refPrior", required=false, doc="Prior likelihood of the reference theory")
|
||||
public double REF_PRIOR = 0.999;
|
||||
//@Argument(fullName="hetVarPrior", shortName="hetVarPrior", required=false, doc="Prior likelihood of a heterozygous variant theory")
|
||||
public double HETVAR_PRIOR = 1e-3;
|
||||
//@Argument(fullName="homVarPrior", shortName="homVarPrior", required=false, doc="Prior likelihood of the homozygous variant theory")
|
||||
public double HOMVAR_PRIOR = 1e-5;
|
||||
|
||||
public AlleleMetrics metrics;
|
||||
public PrintStream calls_file;
|
||||
|
||||
public PrintStream calls_file;
|
||||
public String sample_name;
|
||||
|
||||
public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; }
|
||||
|
||||
public boolean requiresReads() { return true; }
|
||||
|
||||
public void initialize()
|
||||
{
|
||||
try
|
||||
|
|
@ -99,7 +88,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
|
|||
}
|
||||
}
|
||||
|
||||
AlleleFrequencyEstimate freq = getOneProbAlleleFrequency(ref, context, rodString, sample_name);
|
||||
AlleleFrequencyEstimate freq = getAlleleFrequency(ref, context, rodString, sample_name);
|
||||
|
||||
if (printMetrics) {
|
||||
if (freq != null) { metrics.nextPosition(freq, tracker); }
|
||||
|
|
@ -109,192 +98,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
|
|||
return freq;
|
||||
}
|
||||
|
||||
private AlleleFrequencyEstimate getFourProbAlleleFrequency(char ref, LocusContext context, String rodString, RefMetaDataTracker tracker) {
|
||||
double[][] probs = ReadBackedPileup.probDistPileup(context.getReads(), context.getOffsets());
|
||||
int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
|
||||
int altBaseIndex = getMostFrequentNonRefBase(getFractionalCounts(probs), refBaseIndex);
|
||||
|
||||
if (refBaseIndex >= 0 && altBaseIndex >= 0) {
|
||||
// Set the priors for the hom-ref, het, and non-hom ref (we don't evaluate the
|
||||
// possibility of a non-ref het).
|
||||
//double[] genotypePriors = { 0.999, 1e-3, 1e-5 };
|
||||
double[] genotypePriors = { REF_PRIOR, HETVAR_PRIOR, HOMVAR_PRIOR };
|
||||
|
||||
// I'm not sure what the difference between qhat and qstar is in AlleleMetrics, but
|
||||
// to make my life simpler, I evaluate the non-ref balances in genotypeBalances and
|
||||
// report the balances in reportingBalances.
|
||||
double[] genotypeBalances = { qHom, qHet, qHomNonRef };
|
||||
double[] reportingBalances = { 0.0, 0.5, 1.0 };
|
||||
|
||||
// Compute the probability of the distribution of second-best bases (should be random)
|
||||
int[] secondaryBaseCounts = getSecondaryBaseCounts(probs);
|
||||
double[] secondaryBaseProbs = new double[4];
|
||||
for (int baseIndex = 0; baseIndex < 4; baseIndex++) {
|
||||
secondaryBaseProbs[baseIndex] = 0.25;
|
||||
}
|
||||
double secondaryBaseDistProb = MathUtils.multinomialProbability(secondaryBaseCounts, secondaryBaseProbs);
|
||||
|
||||
//System.out.printf("%d %d %d %d ", secondaryBaseCounts[0], secondaryBaseCounts[1], secondaryBaseCounts[2], secondaryBaseCounts[3]);
|
||||
//System.out.println(secondaryBaseDistProb);
|
||||
|
||||
// Determine the probability distribution of non-ref alleles
|
||||
double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex);
|
||||
|
||||
// Convolve them with the binomial sampling probability distribution in order to
|
||||
// marginalize the non-ref allele balance.
|
||||
double[] posteriors = new double[3];
|
||||
double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, pBest = Double.MIN_VALUE;
|
||||
for (int hypothesis = 0; hypothesis < 3; hypothesis++) {
|
||||
posteriors[hypothesis] = 0.0;
|
||||
|
||||
for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) {
|
||||
double binomialWeight = MathUtils.binomialProbability(weightIndex, probs.length, genotypeBalances[hypothesis]);
|
||||
|
||||
// The hom-ref posterior probability distribution may be too weak to
|
||||
// effectively account for noise. Just set it to equal weighting for
|
||||
// the lower quantile of the non-ref balance range.
|
||||
//if (hypothesis == 0 && weightIndex < obsWeights.length/4) {
|
||||
// binomialWeight = 1.0;
|
||||
//}
|
||||
|
||||
posteriors[hypothesis] += obsWeights[weightIndex]*binomialWeight;
|
||||
}
|
||||
|
||||
posteriors[hypothesis] *= genotypePriors[hypothesis];
|
||||
|
||||
double refLikelihood = Double.isInfinite(Math.log10(posteriors[0])) ? -10000.0 : Math.log10(posteriors[0]);
|
||||
|
||||
if (posteriors[hypothesis] > pBest) {
|
||||
qhat = reportingBalances[hypothesis];
|
||||
qstar = reportingBalances[hypothesis];
|
||||
lodVsRef = Math.log10(posteriors[hypothesis]) - refLikelihood;
|
||||
pBest = posteriors[hypothesis];
|
||||
}
|
||||
}
|
||||
|
||||
double pRef = posteriors[0];
|
||||
|
||||
double[] sposteriors = Arrays.copyOf(posteriors, posteriors.length);
|
||||
Arrays.sort(sposteriors);
|
||||
double bestLogPosterior = (Double.isInfinite(Math.log10(sposteriors[2]))) ? -10000.0 : Math.log10(sposteriors[2]);
|
||||
double nextBestLogPosterior = (Double.isInfinite(Math.log10(sposteriors[1]))) ? -10000.0 : Math.log10(sposteriors[1]);
|
||||
double lodVsNextBest = bestLogPosterior - nextBestLogPosterior;
|
||||
|
||||
AlleleFrequencyEstimate freq = new AlleleFrequencyEstimate(context.getLocation(),
|
||||
ref,
|
||||
BaseUtils.baseIndexToSimpleBase(altBaseIndex),
|
||||
2,
|
||||
qhat,
|
||||
qstar,
|
||||
(MathUtils.compareDoubles(qhat, reportingBalances[0]) == 0 ? lodVsNextBest : lodVsRef),
|
||||
lodVsNextBest,
|
||||
pBest,
|
||||
pRef,
|
||||
probs.length,
|
||||
ReadBackedPileup.basePileupAsString(context.getReads(), context.getOffsets()),
|
||||
probs,
|
||||
posteriors,
|
||||
"unknown_sample");
|
||||
|
||||
return freq;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private double[] getObservationWeights(double[][] probs, int refBaseIndex, int altBaseIndex) {
|
||||
return getWeightTableTraces(getWeightTable(probs, refBaseIndex, altBaseIndex, probs.length));
|
||||
}
|
||||
|
||||
private double[][] getWeightTable(double[][] probs, int refBaseIndex, int altBaseIndex, int numReadsToConsider) {
|
||||
if (numReadsToConsider == 1) {
|
||||
double[][] partialProbTable = new double[1][2];
|
||||
partialProbTable[0][0] = probs[0][refBaseIndex];
|
||||
partialProbTable[0][1] = probs[0][altBaseIndex];
|
||||
|
||||
return partialProbTable;
|
||||
}
|
||||
|
||||
double[][] oldPartialProbTable = getWeightTable(probs, refBaseIndex, altBaseIndex, numReadsToConsider - 1);
|
||||
double[] traces = getWeightTableTraces(oldPartialProbTable);
|
||||
|
||||
double[][] newPartialProbTable = new double[numReadsToConsider][2];
|
||||
for (int row = 0, traceElement = traces.length - 1; row < newPartialProbTable.length; row++, traceElement--) {
|
||||
newPartialProbTable[row][0] = traces[traceElement]*probs[numReadsToConsider - 1][refBaseIndex];
|
||||
newPartialProbTable[row][1] = traces[traceElement]*probs[numReadsToConsider - 1][altBaseIndex];
|
||||
}
|
||||
|
||||
return newPartialProbTable;
|
||||
}
|
||||
|
||||
private double[] getWeightTableTraces(double[][] partialProbTable) {
|
||||
double[] traces = new double[partialProbTable.length + 1];
|
||||
|
||||
traces[0] = partialProbTable[partialProbTable.length - 1][0];
|
||||
traces[partialProbTable.length] = partialProbTable[0][1];
|
||||
|
||||
for (int element = 1; element < traces.length - 1; element++) {
|
||||
traces[element] = partialProbTable[partialProbTable.length - element - 1][0] +
|
||||
partialProbTable[partialProbTable.length - element][1];
|
||||
}
|
||||
|
||||
return traces;
|
||||
}
|
||||
|
||||
private double[] getFractionalCounts(double[][] probs) {
|
||||
double[] fractionalCounts = new double[4];
|
||||
|
||||
for (int i = 0; i < probs.length; i++) {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
fractionalCounts[j] += probs[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
return fractionalCounts;
|
||||
}
|
||||
|
||||
private int getMostFrequentNonRefBase(double[] fractionalCounts, int refBaseIndex) {
|
||||
double maxFractionalCounts = -1.0;
|
||||
int bestAltBaseIndex = -1;
|
||||
for (int altBaseIndex = 0; altBaseIndex < 4; altBaseIndex++) {
|
||||
if (altBaseIndex != refBaseIndex) {
|
||||
if (fractionalCounts[altBaseIndex] > maxFractionalCounts) {
|
||||
maxFractionalCounts = fractionalCounts[altBaseIndex];
|
||||
bestAltBaseIndex = altBaseIndex;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return bestAltBaseIndex;
|
||||
}
|
||||
|
||||
private int[] getSecondaryBaseCounts(double[][] probs) {
|
||||
int[] secondaryBaseCounts = new int[4];
|
||||
|
||||
for (int readIndex = 0; readIndex < probs.length; readIndex++) {
|
||||
double bestProb = 0.0;
|
||||
for (int baseIndex = 0; baseIndex < probs[readIndex].length; baseIndex++) {
|
||||
if (probs[readIndex][baseIndex] > bestProb) {
|
||||
bestProb = probs[readIndex][baseIndex];
|
||||
}
|
||||
}
|
||||
|
||||
double secondBestProb = 0.0;
|
||||
int secondBestBaseIndex = 0;
|
||||
for (int baseIndex = 0; baseIndex < 4; baseIndex++) {
|
||||
if (probs[readIndex][baseIndex] > secondBestProb && probs[readIndex][baseIndex] < bestProb) {
|
||||
secondBestProb = probs[readIndex][baseIndex];
|
||||
secondBestBaseIndex = baseIndex;
|
||||
}
|
||||
}
|
||||
|
||||
secondaryBaseCounts[secondBestBaseIndex]++;
|
||||
}
|
||||
|
||||
return secondaryBaseCounts;
|
||||
}
|
||||
|
||||
private AlleleFrequencyEstimate getOneProbAlleleFrequency(char ref, LocusContext context, String rodString, String sample_name) {
|
||||
private AlleleFrequencyEstimate getAlleleFrequency(char ref, LocusContext context, String rodString, String sample_name) {
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
String bases = pileup.getBases();
|
||||
|
||||
|
|
@ -333,8 +137,8 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
|
|||
}
|
||||
G.ApplyPrior(ref, this.alt_allele, this.allele_frequency_prior);
|
||||
|
||||
if (fourBaseMode) {
|
||||
G.applyFourBaseDistributionPrior(pileup.getBases(), pileup.getSecondaryBasePileup());
|
||||
if (fourBaseMode && pileup.getBases().length() < 750) {
|
||||
G.applySecondBaseDistributionPrior(pileup.getBases(), pileup.getSecondaryBasePileup());
|
||||
}
|
||||
|
||||
return G.toAlleleFrequencyEstimate(context.getLocation(), ref, bases.length(), bases, G.likelihoods, sample_name);
|
||||
|
|
|
|||
Loading…
Reference in New Issue