From 87ba8b3451455fa0d1fb1f17658e272bca6862db Mon Sep 17 00:00:00 2001 From: kiran Date: Wed, 10 Jun 2009 08:27:06 +0000 Subject: [PATCH] Removed some useless code. Don't apply second-base test if the coverage is too high, since the binomial probs explode and return NaN or Infinite values. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@961 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/SingleSampleGenotyper.java | 234 ++---------------- 1 file changed, 19 insertions(+), 215 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java index dccc7712e..5e49d38e4 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java @@ -23,38 +23,27 @@ import java.util.*; public class SingleSampleGenotyper extends LocusWalker { - @Argument(fullName="calls", shortName="calls", doc="File to write SNP calls to", required=true) public String callsFileName; - @Argument(fullName="metrics", shortName="met", doc="metrics", required=false) public String metricsFileName = "/dev/null"; - @Argument(fullName="metInterval", shortName="mi", doc="metInterval", required=false) public Integer metricsInterval = 50000; - @Argument(fullName="printMetrics", shortName="printMetrics", doc="printMetrics", required=false) public Boolean printMetrics = true; - @Argument(fullName="lodThreshold", shortName="lod", doc="lodThreshold", required=false) public Double lodThreshold = 5.0; - @Argument(fullName="fourBaseMode", shortName="fb", doc="fourBaseMode", required=false) public Boolean fourBaseMode = false; - @Argument(fullName="retest", shortName="re", doc="retest", required=false) public Boolean retest = false; - @Argument(fullName="call_indels", shortName="call_indels", doc="Call Indels", required=false) public Boolean call_indels = false; - @Argument(fullName="qHom", shortName="qHom", doc="qHom", required=false) public Double qHom = 0.04; - @Argument(fullName="qHet", shortName="qHet", doc="qHet", required=false) public Double qHet = 0.49; - @Argument(fullName="qHomNonRef", shortName="qHomNonRef", doc="qHomNonRef", required=false) public Double qHomNonRef = 0.97; - @Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null; + @Argument(fullName="calls", shortName="calls", doc="File to write SNP calls to", required=true) public String callsFileName; + @Argument(fullName="metrics", shortName="met", doc="metrics", required=false) public String metricsFileName = "/dev/null"; + @Argument(fullName="metInterval", shortName="mi", doc="metInterval", required=false) public Integer metricsInterval = 50000; + @Argument(fullName="printMetrics", shortName="printMetrics", doc="printMetrics", required=false) public Boolean printMetrics = true; + @Argument(fullName="lodThreshold", shortName="lod", doc="lodThreshold", required=false) public Double lodThreshold = 5.0; + @Argument(fullName="fourBaseMode", shortName="fb", doc="fourBaseMode", required=false) public Boolean fourBaseMode = false; + @Argument(fullName="call_indels", shortName="call_indels", doc="Call Indels", required=false) public Boolean call_indels = false; + @Argument(fullName="refPrior", shortName="refPrior", required=false, doc="Prior likelihood of the reference theory") public double REF_PRIOR = 0.999; + @Argument(fullName="hetVarPrior", shortName="hetVarPrior", required=false, doc="Prior likelihood of a heterozygous variant theory") public double HETVAR_PRIOR = 1e-3; + @Argument(fullName="homVarPrior", shortName="homVarPrior", required=false, doc="Prior likelihood of the homozygous variant theory") public double HOMVAR_PRIOR = 1e-5; + @Argument(fullName="geli", shortName="geli", doc="If true, output will be in Geli/Picard format", required=false) public boolean GeliOutputFormat = false; + @Argument(fullName="sample_name_regex", shortName="sample_name_regex", doc="sample_name_regex", required=false) public String SAMPLE_NAME_REGEX = null; - @Argument(fullName="geli", shortName="geli", required=false, doc="If true, output will be in Geli/Picard format") - public boolean GeliOutputFormat = false; - - // todo: enable argument after fixing all of the hard-codings in GenotypeLikelihoods and other places - - //@Argument(fullName="refPrior", shortName="refPrior", required=false, doc="Prior likelihood of the reference theory") - public double REF_PRIOR = 0.999; - //@Argument(fullName="hetVarPrior", shortName="hetVarPrior", required=false, doc="Prior likelihood of a heterozygous variant theory") - public double HETVAR_PRIOR = 1e-3; - //@Argument(fullName="homVarPrior", shortName="homVarPrior", required=false, doc="Prior likelihood of the homozygous variant theory") - public double HOMVAR_PRIOR = 1e-5; - public AlleleMetrics metrics; - public PrintStream calls_file; - + public PrintStream calls_file; public String sample_name; public boolean filter(RefMetaDataTracker tracker, char ref, LocusContext context) { return true; } + public boolean requiresReads() { return true; } + public void initialize() { try @@ -99,7 +88,7 @@ public class SingleSampleGenotyper extends LocusWalker= 0 && altBaseIndex >= 0) { - // Set the priors for the hom-ref, het, and non-hom ref (we don't evaluate the - // possibility of a non-ref het). - //double[] genotypePriors = { 0.999, 1e-3, 1e-5 }; - double[] genotypePriors = { REF_PRIOR, HETVAR_PRIOR, HOMVAR_PRIOR }; - - // I'm not sure what the difference between qhat and qstar is in AlleleMetrics, but - // to make my life simpler, I evaluate the non-ref balances in genotypeBalances and - // report the balances in reportingBalances. - double[] genotypeBalances = { qHom, qHet, qHomNonRef }; - double[] reportingBalances = { 0.0, 0.5, 1.0 }; - - // Compute the probability of the distribution of second-best bases (should be random) - int[] secondaryBaseCounts = getSecondaryBaseCounts(probs); - double[] secondaryBaseProbs = new double[4]; - for (int baseIndex = 0; baseIndex < 4; baseIndex++) { - secondaryBaseProbs[baseIndex] = 0.25; - } - double secondaryBaseDistProb = MathUtils.multinomialProbability(secondaryBaseCounts, secondaryBaseProbs); - - //System.out.printf("%d %d %d %d ", secondaryBaseCounts[0], secondaryBaseCounts[1], secondaryBaseCounts[2], secondaryBaseCounts[3]); - //System.out.println(secondaryBaseDistProb); - - // Determine the probability distribution of non-ref alleles - double[] obsWeights = getObservationWeights(probs, refBaseIndex, altBaseIndex); - - // Convolve them with the binomial sampling probability distribution in order to - // marginalize the non-ref allele balance. - double[] posteriors = new double[3]; - double qhat = 0.0, qstar = 0.0, lodVsRef = 0.0, pBest = Double.MIN_VALUE; - for (int hypothesis = 0; hypothesis < 3; hypothesis++) { - posteriors[hypothesis] = 0.0; - - for (int weightIndex = 0; weightIndex < obsWeights.length; weightIndex++) { - double binomialWeight = MathUtils.binomialProbability(weightIndex, probs.length, genotypeBalances[hypothesis]); - - // The hom-ref posterior probability distribution may be too weak to - // effectively account for noise. Just set it to equal weighting for - // the lower quantile of the non-ref balance range. - //if (hypothesis == 0 && weightIndex < obsWeights.length/4) { - // binomialWeight = 1.0; - //} - - posteriors[hypothesis] += obsWeights[weightIndex]*binomialWeight; - } - - posteriors[hypothesis] *= genotypePriors[hypothesis]; - - double refLikelihood = Double.isInfinite(Math.log10(posteriors[0])) ? -10000.0 : Math.log10(posteriors[0]); - - if (posteriors[hypothesis] > pBest) { - qhat = reportingBalances[hypothesis]; - qstar = reportingBalances[hypothesis]; - lodVsRef = Math.log10(posteriors[hypothesis]) - refLikelihood; - pBest = posteriors[hypothesis]; - } - } - - double pRef = posteriors[0]; - - double[] sposteriors = Arrays.copyOf(posteriors, posteriors.length); - Arrays.sort(sposteriors); - double bestLogPosterior = (Double.isInfinite(Math.log10(sposteriors[2]))) ? -10000.0 : Math.log10(sposteriors[2]); - double nextBestLogPosterior = (Double.isInfinite(Math.log10(sposteriors[1]))) ? -10000.0 : Math.log10(sposteriors[1]); - double lodVsNextBest = bestLogPosterior - nextBestLogPosterior; - - AlleleFrequencyEstimate freq = new AlleleFrequencyEstimate(context.getLocation(), - ref, - BaseUtils.baseIndexToSimpleBase(altBaseIndex), - 2, - qhat, - qstar, - (MathUtils.compareDoubles(qhat, reportingBalances[0]) == 0 ? lodVsNextBest : lodVsRef), - lodVsNextBest, - pBest, - pRef, - probs.length, - ReadBackedPileup.basePileupAsString(context.getReads(), context.getOffsets()), - probs, - posteriors, - "unknown_sample"); - - return freq; - } - - return null; - } - - private double[] getObservationWeights(double[][] probs, int refBaseIndex, int altBaseIndex) { - return getWeightTableTraces(getWeightTable(probs, refBaseIndex, altBaseIndex, probs.length)); - } - - private double[][] getWeightTable(double[][] probs, int refBaseIndex, int altBaseIndex, int numReadsToConsider) { - if (numReadsToConsider == 1) { - double[][] partialProbTable = new double[1][2]; - partialProbTable[0][0] = probs[0][refBaseIndex]; - partialProbTable[0][1] = probs[0][altBaseIndex]; - - return partialProbTable; - } - - double[][] oldPartialProbTable = getWeightTable(probs, refBaseIndex, altBaseIndex, numReadsToConsider - 1); - double[] traces = getWeightTableTraces(oldPartialProbTable); - - double[][] newPartialProbTable = new double[numReadsToConsider][2]; - for (int row = 0, traceElement = traces.length - 1; row < newPartialProbTable.length; row++, traceElement--) { - newPartialProbTable[row][0] = traces[traceElement]*probs[numReadsToConsider - 1][refBaseIndex]; - newPartialProbTable[row][1] = traces[traceElement]*probs[numReadsToConsider - 1][altBaseIndex]; - } - - return newPartialProbTable; - } - - private double[] getWeightTableTraces(double[][] partialProbTable) { - double[] traces = new double[partialProbTable.length + 1]; - - traces[0] = partialProbTable[partialProbTable.length - 1][0]; - traces[partialProbTable.length] = partialProbTable[0][1]; - - for (int element = 1; element < traces.length - 1; element++) { - traces[element] = partialProbTable[partialProbTable.length - element - 1][0] + - partialProbTable[partialProbTable.length - element][1]; - } - - return traces; - } - - private double[] getFractionalCounts(double[][] probs) { - double[] fractionalCounts = new double[4]; - - for (int i = 0; i < probs.length; i++) { - for (int j = 0; j < 4; j++) { - fractionalCounts[j] += probs[i][j]; - } - } - - return fractionalCounts; - } - - private int getMostFrequentNonRefBase(double[] fractionalCounts, int refBaseIndex) { - double maxFractionalCounts = -1.0; - int bestAltBaseIndex = -1; - for (int altBaseIndex = 0; altBaseIndex < 4; altBaseIndex++) { - if (altBaseIndex != refBaseIndex) { - if (fractionalCounts[altBaseIndex] > maxFractionalCounts) { - maxFractionalCounts = fractionalCounts[altBaseIndex]; - bestAltBaseIndex = altBaseIndex; - } - } - } - - return bestAltBaseIndex; - } - - private int[] getSecondaryBaseCounts(double[][] probs) { - int[] secondaryBaseCounts = new int[4]; - - for (int readIndex = 0; readIndex < probs.length; readIndex++) { - double bestProb = 0.0; - for (int baseIndex = 0; baseIndex < probs[readIndex].length; baseIndex++) { - if (probs[readIndex][baseIndex] > bestProb) { - bestProb = probs[readIndex][baseIndex]; - } - } - - double secondBestProb = 0.0; - int secondBestBaseIndex = 0; - for (int baseIndex = 0; baseIndex < 4; baseIndex++) { - if (probs[readIndex][baseIndex] > secondBestProb && probs[readIndex][baseIndex] < bestProb) { - secondBestProb = probs[readIndex][baseIndex]; - secondBestBaseIndex = baseIndex; - } - } - - secondaryBaseCounts[secondBestBaseIndex]++; - } - - return secondaryBaseCounts; - } - - private AlleleFrequencyEstimate getOneProbAlleleFrequency(char ref, LocusContext context, String rodString, String sample_name) { + private AlleleFrequencyEstimate getAlleleFrequency(char ref, LocusContext context, String rodString, String sample_name) { ReadBackedPileup pileup = new ReadBackedPileup(ref, context); String bases = pileup.getBases(); @@ -333,8 +137,8 @@ public class SingleSampleGenotyper extends LocusWalker