From c7039a9b713703d0252ea073ea482e5416413442 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 2 Jan 2013 15:21:44 -0500 Subject: [PATCH] Pushing in implementation of the Bayesian estimate of Qemp for the BQSR. This isn't hooked up yet with BQSR; it's just a static method used in my testing walker. I'll hook this into BQSR after more testing and the addition of unit tests. Most of the changes in this commit are actually documentation-related. --- .../sting/utils/QualityUtils.java | 5 +- .../recalibration/BaseRecalibration.java | 8 +-- .../utils/recalibration/QualQuantizer.java | 15 ++-- .../sting/utils/recalibration/RecalDatum.java | 71 ++++++++++++++++++- .../recalibration/RecalibrationReport.java | 5 +- 5 files changed, 82 insertions(+), 22 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 861f172d9..844fa6b90 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -14,7 +14,8 @@ public class QualityUtils { public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); public final static double MIN_REASONABLE_ERROR = 0.0001; - public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious + public final static byte MAX_REASONABLE_Q_SCORE = 60; // bams containing quals above this value are extremely suspicious and we should warn the user + public final static byte MAX_GATK_USABLE_Q_SCORE = 40; // quals above this value should be capped down to this value (because they are too high) public final static byte MIN_USABLE_Q_SCORE = 6; public final static int MAPPING_QUALITY_UNAVAILABLE = 255; @@ -73,7 +74,7 @@ public class QualityUtils { } public static double qualToErrorProb(final double qual) { - return Math.pow(10.0, ((double) qual)/-10.0); + return Math.pow(10.0, qual/-10.0); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 567514f8c..3e0f36799 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -76,7 +76,7 @@ public class BaseRecalibration { quantizationInfo = recalibrationReport.getQuantizationInfo(); if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores quantizationInfo.noQuantization(); - else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report. quantizationInfo.quantizeQualityScores(quantizationLevels); this.disableIndelQuals = disableIndelQuals; @@ -233,9 +233,9 @@ public class BaseRecalibration { * Note that this calculation is a constant for each rgKey and errorModel. We need only * compute this value once for all data. * - * @param rgKey - * @param errorModel - * @return + * @param rgKey read group key + * @param errorModel the event type + * @return global delta Q */ private double calculateGlobalDeltaQ(final int rgKey, final EventType errorModel) { double result = 0.0; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java index a5a3104a0..b5370e268 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java @@ -175,8 +175,7 @@ public class QualQuantizer { } /** - * Human readable name of this interval: e.g., 10-12 - * @return + * @return Human readable name of this interval: e.g., 10-12 */ public String getName() { return qStart + "-" + qEnd; @@ -188,8 +187,7 @@ public class QualQuantizer { } /** - * Returns the error rate (in real space) of this interval, or 0 if there are no obserations - * @return + * @return the error rate (in real space) of this interval, or 0 if there are no observations */ @Ensures("result >= 0.0") public double getErrorRate() { @@ -202,9 +200,7 @@ public class QualQuantizer { } /** - * Returns the QUAL of the error rate of this interval, or the fixed - * qual if this interval was created with a fixed qual. - * @return + * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. */ @Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE") public byte getQual() { @@ -254,7 +250,7 @@ public class QualQuantizer { final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this; if ( left.qEnd + 1 != right.qStart ) - throw new ReviewedStingException("Attempting to merge non-continguous intervals: left = " + left + " right = " + right); + throw new ReviewedStingException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); final long nCombinedObs = left.nObservations + right.nObservations; final long nCombinedErr = left.nErrors + right.nErrors; @@ -343,8 +339,7 @@ public class QualQuantizer { } /** - * Helper function that finds and mergest together the lowest penalty pair - * of intervals + * Helper function that finds and merges together the lowest penalty pair of intervals * @param intervals */ @Requires("! intervals.isEmpty()") diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 1ab3b10c4..9b58a5900 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -28,9 +28,10 @@ package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; import com.google.java.contract.Requires; +import org.apache.commons.math.optimization.fitting.GaussianFunction; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import java.util.Random; /** * An individual piece of recalibration data. Each bin counts up the number of observations and the number @@ -149,7 +150,7 @@ public class RecalDatum { //--------------------------------------------------------------------------------------------------------------- /** - * Returns the error rate (in real space) of this interval, or 0 if there are no obserations + * Returns the error rate (in real space) of this interval, or 0 if there are no observations * @return the empirical error rate ~= N errors / N obs */ @Ensures("result >= 0.0") @@ -262,6 +263,9 @@ public class RecalDatum { @Requires("empiricalQuality == UNINITIALIZED") @Ensures("empiricalQuality != UNINITIALIZED") private synchronized void calcEmpiricalQuality() { + + // TODO -- add code for Bayesian estimate of Qemp here + final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } @@ -276,4 +280,65 @@ public class RecalDatum { private double calcExpectedErrors() { return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); } -} + + static final boolean DEBUG = false; + static final double RESOLUTION_BINS_PER_QUAL = 1.0; + + static public double bayesianEstimateOfEmpiricalQuality(final double nObservations, final double nErrors, final double QReported) { + + final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; + + final double[] log10Posteriors = new double[numBins]; + + for ( int bin = 0; bin < numBins; bin++ ) { + + final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; + + log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10Likelihood(QEmpOfBin, nObservations, nErrors); + + if ( DEBUG ) + System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); + } + + if ( DEBUG ) + System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); + + final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); + final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); + + final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL; + return Qemp; + } + + static final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1]; + static { + // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) + // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". + final double GF_a = 0.0; + final double GF_b = 0.9; + final double GF_c = 0.0; + final double GF_d = 0.5; + + final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); + for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) + log10QempPriorCache[i] = Math.log10(gaussian.value((double) i)); + } + + static public double log10QempPrior(final double Qempirical, final double Qreported) { + final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), QualityUtils.MAX_GATK_USABLE_Q_SCORE); + if ( DEBUG ) + System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); + return log10QempPriorCache[difference]; + } + + static public double log10Likelihood(final double Qempirical, final double nObservations, final double nErrors) { + // this is just a straight binomial PDF + double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); + if ( log10Prob == Double.NEGATIVE_INFINITY ) + log10Prob = -Double.MAX_VALUE; + + if ( DEBUG ) + System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + return log10Prob; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 081221da4..6ecac1394 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -9,8 +9,7 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import java.io.File; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** @@ -199,7 +198,7 @@ public class RecalibrationReport { private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); - final double empiricalQuality = (Double) reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME); + final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); // the estimatedQreported column only exists in the ReadGroup table final double estimatedQReported = hasEstimatedQReportedColumn ?