diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 861f172d9..844fa6b90 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -14,7 +14,8 @@ public class QualityUtils { public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); public final static double MIN_REASONABLE_ERROR = 0.0001; - public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious + public final static byte MAX_REASONABLE_Q_SCORE = 60; // bams containing quals above this value are extremely suspicious and we should warn the user + public final static byte MAX_GATK_USABLE_Q_SCORE = 40; // quals above this value should be capped down to this value (because they are too high) public final static byte MIN_USABLE_Q_SCORE = 6; public final static int MAPPING_QUALITY_UNAVAILABLE = 255; @@ -73,7 +74,7 @@ public class QualityUtils { } public static double qualToErrorProb(final double qual) { - return Math.pow(10.0, ((double) qual)/-10.0); + return Math.pow(10.0, qual/-10.0); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 567514f8c..3e0f36799 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -76,7 +76,7 @@ public class BaseRecalibration { quantizationInfo = recalibrationReport.getQuantizationInfo(); if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores quantizationInfo.noQuantization(); - else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report. quantizationInfo.quantizeQualityScores(quantizationLevels); this.disableIndelQuals = disableIndelQuals; @@ -233,9 +233,9 @@ public class BaseRecalibration { * Note that this calculation is a constant for each rgKey and errorModel. We need only * compute this value once for all data. * - * @param rgKey - * @param errorModel - * @return + * @param rgKey read group key + * @param errorModel the event type + * @return global delta Q */ private double calculateGlobalDeltaQ(final int rgKey, final EventType errorModel) { double result = 0.0; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java index a5a3104a0..b5370e268 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QualQuantizer.java @@ -175,8 +175,7 @@ public class QualQuantizer { } /** - * Human readable name of this interval: e.g., 10-12 - * @return + * @return Human readable name of this interval: e.g., 10-12 */ public String getName() { return qStart + "-" + qEnd; @@ -188,8 +187,7 @@ public class QualQuantizer { } /** - * Returns the error rate (in real space) of this interval, or 0 if there are no obserations - * @return + * @return the error rate (in real space) of this interval, or 0 if there are no observations */ @Ensures("result >= 0.0") public double getErrorRate() { @@ -202,9 +200,7 @@ public class QualQuantizer { } /** - * Returns the QUAL of the error rate of this interval, or the fixed - * qual if this interval was created with a fixed qual. - * @return + * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. */ @Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE") public byte getQual() { @@ -254,7 +250,7 @@ public class QualQuantizer { final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this; if ( left.qEnd + 1 != right.qStart ) - throw new ReviewedStingException("Attempting to merge non-continguous intervals: left = " + left + " right = " + right); + throw new ReviewedStingException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); final long nCombinedObs = left.nObservations + right.nObservations; final long nCombinedErr = left.nErrors + right.nErrors; @@ -343,8 +339,7 @@ public class QualQuantizer { } /** - * Helper function that finds and mergest together the lowest penalty pair - * of intervals + * Helper function that finds and merges together the lowest penalty pair of intervals * @param intervals */ @Requires("! intervals.isEmpty()") diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 1ab3b10c4..9b58a5900 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -28,9 +28,10 @@ package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; import com.google.java.contract.Requires; +import org.apache.commons.math.optimization.fitting.GaussianFunction; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import java.util.Random; /** * An individual piece of recalibration data. Each bin counts up the number of observations and the number @@ -149,7 +150,7 @@ public class RecalDatum { //--------------------------------------------------------------------------------------------------------------- /** - * Returns the error rate (in real space) of this interval, or 0 if there are no obserations + * Returns the error rate (in real space) of this interval, or 0 if there are no observations * @return the empirical error rate ~= N errors / N obs */ @Ensures("result >= 0.0") @@ -262,6 +263,9 @@ public class RecalDatum { @Requires("empiricalQuality == UNINITIALIZED") @Ensures("empiricalQuality != UNINITIALIZED") private synchronized void calcEmpiricalQuality() { + + // TODO -- add code for Bayesian estimate of Qemp here + final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } @@ -276,4 +280,65 @@ public class RecalDatum { private double calcExpectedErrors() { return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); } -} + + static final boolean DEBUG = false; + static final double RESOLUTION_BINS_PER_QUAL = 1.0; + + static public double bayesianEstimateOfEmpiricalQuality(final double nObservations, final double nErrors, final double QReported) { + + final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; + + final double[] log10Posteriors = new double[numBins]; + + for ( int bin = 0; bin < numBins; bin++ ) { + + final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; + + log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10Likelihood(QEmpOfBin, nObservations, nErrors); + + if ( DEBUG ) + System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); + } + + if ( DEBUG ) + System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); + + final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); + final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); + + final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL; + return Qemp; + } + + static final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1]; + static { + // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) + // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". + final double GF_a = 0.0; + final double GF_b = 0.9; + final double GF_c = 0.0; + final double GF_d = 0.5; + + final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); + for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) + log10QempPriorCache[i] = Math.log10(gaussian.value((double) i)); + } + + static public double log10QempPrior(final double Qempirical, final double Qreported) { + final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), QualityUtils.MAX_GATK_USABLE_Q_SCORE); + if ( DEBUG ) + System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); + return log10QempPriorCache[difference]; + } + + static public double log10Likelihood(final double Qempirical, final double nObservations, final double nErrors) { + // this is just a straight binomial PDF + double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); + if ( log10Prob == Double.NEGATIVE_INFINITY ) + log10Prob = -Double.MAX_VALUE; + + if ( DEBUG ) + System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + return log10Prob; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 081221da4..6ecac1394 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -9,8 +9,7 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import java.io.File; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** @@ -199,7 +198,7 @@ public class RecalibrationReport { private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); - final double empiricalQuality = (Double) reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME); + final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); // the estimatedQreported column only exists in the ReadGroup table final double estimatedQReported = hasEstimatedQReportedColumn ?