Pushing in implementation of the Bayesian estimate of Qemp for the BQSR.

This isn't hooked up yet with BQSR; it's just a static method used in my testing walker.  I'll hook this into BQSR after more testing and the addition of unit tests.
Most of the changes in this commit are actually documentation-related.
This commit is contained in:
Eric Banks 2013-01-02 15:21:44 -05:00
parent 03294ae1c8
commit c7039a9b71
5 changed files with 82 additions and 22 deletions

View File

@ -14,7 +14,8 @@ public class QualityUtils {
public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE);
public final static double MIN_REASONABLE_ERROR = 0.0001;
public final static byte MAX_REASONABLE_Q_SCORE = 60; // quals above this value are extremely suspicious
public final static byte MAX_REASONABLE_Q_SCORE = 60; // bams containing quals above this value are extremely suspicious and we should warn the user
public final static byte MAX_GATK_USABLE_Q_SCORE = 40; // quals above this value should be capped down to this value (because they are too high)
public final static byte MIN_USABLE_Q_SCORE = 6;
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
@ -73,7 +74,7 @@ public class QualityUtils {
}
public static double qualToErrorProb(final double qual) {
return Math.pow(10.0, ((double) qual)/-10.0);
return Math.pow(10.0, qual/-10.0);
}

View File

@ -76,7 +76,7 @@ public class BaseRecalibration {
quantizationInfo = recalibrationReport.getQuantizationInfo();
if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores
quantizationInfo.noQuantization();
else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report.
else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report.
quantizationInfo.quantizeQualityScores(quantizationLevels);
this.disableIndelQuals = disableIndelQuals;
@ -233,9 +233,9 @@ public class BaseRecalibration {
* Note that this calculation is a constant for each rgKey and errorModel. We need only
* compute this value once for all data.
*
* @param rgKey
* @param errorModel
* @return
* @param rgKey read group key
* @param errorModel the event type
* @return global delta Q
*/
private double calculateGlobalDeltaQ(final int rgKey, final EventType errorModel) {
double result = 0.0;

View File

@ -175,8 +175,7 @@ public class QualQuantizer {
}
/**
* Human readable name of this interval: e.g., 10-12
* @return
* @return Human readable name of this interval: e.g., 10-12
*/
public String getName() {
return qStart + "-" + qEnd;
@ -188,8 +187,7 @@ public class QualQuantizer {
}
/**
* Returns the error rate (in real space) of this interval, or 0 if there are no obserations
* @return
* @return the error rate (in real space) of this interval, or 0 if there are no observations
*/
@Ensures("result >= 0.0")
public double getErrorRate() {
@ -202,9 +200,7 @@ public class QualQuantizer {
}
/**
* Returns the QUAL of the error rate of this interval, or the fixed
* qual if this interval was created with a fixed qual.
* @return
* @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual.
*/
@Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE")
public byte getQual() {
@ -254,7 +250,7 @@ public class QualQuantizer {
final QualInterval right = this.compareTo(toMerge) < 0 ? toMerge : this;
if ( left.qEnd + 1 != right.qStart )
throw new ReviewedStingException("Attempting to merge non-continguous intervals: left = " + left + " right = " + right);
throw new ReviewedStingException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right);
final long nCombinedObs = left.nObservations + right.nObservations;
final long nCombinedErr = left.nErrors + right.nErrors;
@ -343,8 +339,7 @@ public class QualQuantizer {
}
/**
* Helper function that finds and mergest together the lowest penalty pair
* of intervals
* Helper function that finds and merges together the lowest penalty pair of intervals
* @param intervals
*/
@Requires("! intervals.isEmpty()")

View File

@ -28,9 +28,10 @@ package org.broadinstitute.sting.utils.recalibration;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.apache.commons.math.optimization.fitting.GaussianFunction;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Random;
/**
* An individual piece of recalibration data. Each bin counts up the number of observations and the number
@ -149,7 +150,7 @@ public class RecalDatum {
//---------------------------------------------------------------------------------------------------------------
/**
* Returns the error rate (in real space) of this interval, or 0 if there are no obserations
* Returns the error rate (in real space) of this interval, or 0 if there are no observations
* @return the empirical error rate ~= N errors / N obs
*/
@Ensures("result >= 0.0")
@ -262,6 +263,9 @@ public class RecalDatum {
@Requires("empiricalQuality == UNINITIALIZED")
@Ensures("empiricalQuality != UNINITIALIZED")
private synchronized void calcEmpiricalQuality() {
// TODO -- add code for Bayesian estimate of Qemp here
final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate());
empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE);
}
@ -276,4 +280,65 @@ public class RecalDatum {
private double calcExpectedErrors() {
return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported);
}
}
static final boolean DEBUG = false;
static final double RESOLUTION_BINS_PER_QUAL = 1.0;
static public double bayesianEstimateOfEmpiricalQuality(final double nObservations, final double nErrors, final double QReported) {
final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL;
final double[] log10Posteriors = new double[numBins];
for ( int bin = 0; bin < numBins; bin++ ) {
final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL;
log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10Likelihood(QEmpOfBin, nObservations, nErrors);
if ( DEBUG )
System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin]));
}
if ( DEBUG )
System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors));
final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors);
final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors);
final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL;
return Qemp;
}
static final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1];
static {
// f(x) = a + b*exp(-((x - c)^2 / (2*d^2)))
// Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell".
final double GF_a = 0.0;
final double GF_b = 0.9;
final double GF_c = 0.0;
final double GF_d = 0.5;
final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d);
for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ )
log10QempPriorCache[i] = Math.log10(gaussian.value((double) i));
}
static public double log10QempPrior(final double Qempirical, final double Qreported) {
final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), QualityUtils.MAX_GATK_USABLE_Q_SCORE);
if ( DEBUG )
System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference]));
return log10QempPriorCache[difference];
}
static public double log10Likelihood(final double Qempirical, final double nObservations, final double nErrors) {
// this is just a straight binomial PDF
double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical));
if ( log10Prob == Double.NEGATIVE_INFINITY )
log10Prob = -Double.MAX_VALUE;
if ( DEBUG )
System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob));
return log10Prob;
}
}

View File

@ -9,8 +9,7 @@ import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import java.io.File;
import java.io.PrintStream;
import java.io.*;
import java.util.*;
/**
@ -199,7 +198,7 @@ public class RecalibrationReport {
private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) {
final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME));
final double empiricalQuality = (Double) reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME);
final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME));
// the estimatedQreported column only exists in the ReadGroup table
final double estimatedQReported = hasEstimatedQReportedColumn ?