Hooking up the Bayesian estimate code for calculating Qemp in BQSR; various fixes after adding unit tests.

2013-01-04 14:43:11 -05:00 · 2013-01-04 14:43:11 -05:00 · dd7f5e2be7
parent c7039a9b71
commit dd7f5e2be7
9 changed files with 199 additions and 95 deletions
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
@ -53,21 +53,21 @@ public class BQSRIntegrationTest extends WalkerTest {
        String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
        String HiSeqInterval = "chr1:10,000,000-10,100,000";
        return new Object[][]{
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "2f250fecb930e0dfe0f63fe0fed3960b")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "26c8d7226139a040557b1d3b1c8792f0")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "9b43a1839cb6ea03aec1d96f15ca8efb")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "3159a9d136c45e4a65d46a23dc8fd3b5")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "bb7262829effbbdbc8d88dd36f480368")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "fbb002fa2b9197c4b555852dccc11562")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "7392acb71131a60a527ca32715fc59be")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "49d4383896a90795d94138db1410a7df")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "427448eff98cf194cc7217c0b1401e79")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "50cd1a10b6ecb3d09f90f1e4a66da95d")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "1dc71561c9d0fb56f9876cb5043c5376")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "13e8f032e76340b114847c90af0a1f8a")},
-                {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "03f58ae4f9d203034e895a3636fc108f")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "49d4383896a90795d94138db1410a7df")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "2db2ef8c2d63e167663d70340182f49a")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "6b3f252718f59cf9fd3f7612f73a35bf")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "863576ac9ff0b0e02f2e84aef15923a7")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "03e28f48201a35c70d1cf48e9f45364f")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "6e3c5635d387a1c428a7c9c88ad26488")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "6507adcb94bacde4cdee9caa9f14f24b")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "399bbb4bf80764dfc644b2f95d824615")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "34d70899253c2b3343ca9ae944291c30")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "e61fa47bfc08433f0cd55558e2081548")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "5c2622c63225b8b04990baf0ae4de07c")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "ee7191d83d7d5bb957dc4595883c32f1")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "da92f4730356f479c2c2b71497cfac6d")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "8075595113b48c0c7ead08ce41bef9fe")},
+                {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "be05834841c5690c66910270521d5c32")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "e61fa47bfc08433f0cd55558e2081548")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "8ee0b498dbbc95ce76393a0f089fec92")},
        };
    }

@ -104,7 +104,7 @@ public class BQSRIntegrationTest extends WalkerTest {
                        " -sortAllCols" +
                        " --plot_pdf_file /dev/null" +
                        " --intermediate_csv_file %s",
-                Arrays.asList("d1c38a3418979400630e2bca1140689c"));
+                Arrays.asList("dd6e0e1e3f53f8ae0c8f5de21ded6ee9"));
        executeTest("testBQSR-CSVfile", spec);
    }

--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java
@ -79,7 +79,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
     * @return a new RecalDatum object with the observation and the error
     */
    protected RecalDatum createDatumObject(final byte reportedQual, final double isError) {
-        return new RecalDatum(1, isError, reportedQual);
+        return new RecalDatum(1L, isError, reportedQual);
    }

    /**
@ -133,12 +133,12 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
            if ( ! table.put(createDatumObject(qual, isError), keys) ) {
                // Failed to put a new item because another thread came along and put an item here first.
                // Get the newly-put item and increment it (item is guaranteed to exist at this point)
-                table.get(keys).increment(1.0, isError);
+                table.get(keys).increment(1, isError);
            }
        }
        else {
            // Easy case: already an item here, so increment it
-            existingDatum.increment(1.0, isError);
+            existingDatum.increment(1, isError);
        }
    }
 }
--- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java
+++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java
@ -68,7 +68,7 @@ public class RecalDatum {
    /**
     * number of bases seen in total
     */
-    private double numObservations;
+    private long numObservations;

    /**
     * number of bases seen that didn't match the reference
@ -89,13 +89,13 @@ public class RecalDatum {
    /**
     * Create a new RecalDatum with given observation and mismatch counts, and an reported quality
     *
-     * @param _numObservations
-     * @param _numMismatches
-     * @param reportedQuality
+     * @param _numObservations    observations
+     * @param _numMismatches      mismatches
+     * @param reportedQuality     Qreported
     */
-    public RecalDatum(final double _numObservations, final double _numMismatches, final byte reportedQuality) {
+    public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) {
        if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0");
-        if ( _numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0");
+        if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0");
        if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0");

        numObservations = _numObservations;
@ -106,7 +106,7 @@ public class RecalDatum {

    /**
     * Copy copy into this recal datum, overwriting all of this objects data
-     * @param copy
+     * @param copy  RecalDatum to copy
     */
    public RecalDatum(final RecalDatum copy) {
        this.numObservations = copy.getNumObservations();
@ -119,7 +119,7 @@ public class RecalDatum {
     * Add in all of the data from other into this object, updating the reported quality from the expected
     * error rate implied by the two reported qualities
     *
-     * @param other
+     * @param other  RecalDatum to combine
     */
    public synchronized void combine(final RecalDatum other) {
        final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
@ -192,7 +192,7 @@ public class RecalDatum {

    @Override
    public String toString() {
-        return String.format("%.2f,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
+        return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
    }

    public String stringForCSV() {
@ -205,11 +205,11 @@ public class RecalDatum {
    //
    //---------------------------------------------------------------------------------------------------------------

-    public final double getNumObservations() {
+    public final long getNumObservations() {
        return numObservations;
    }

-    public final synchronized void setNumObservations(final double numObservations) {
+    public final synchronized void setNumObservations(final long numObservations) {
        if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0");
        this.numObservations = numObservations;
        empiricalQuality = UNINITIALIZED;
@ -227,7 +227,7 @@ public class RecalDatum {
    }

    @Requires({"by >= 0"})
-    public final synchronized void incrementNumObservations(final double by) {
+    public final synchronized void incrementNumObservations(final long by) {
        numObservations += by;
        empiricalQuality = UNINITIALIZED;
    }
@ -240,7 +240,7 @@ public class RecalDatum {

    @Requires({"incObservations >= 0", "incMismatches >= 0"})
    @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"})
-    public final synchronized void increment(final double incObservations, final double incMismatches) {
+    public final synchronized void increment(final long incObservations, final double incMismatches) {
        numObservations += incObservations;
        numMismatches += incMismatches;
        empiricalQuality = UNINITIALIZED;
@ -248,7 +248,7 @@ public class RecalDatum {

    @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"})
    public final synchronized void increment(final boolean isError) {
-        increment(1, isError ? 1 : 0.0);
+        increment(1, isError ? 1.0 : 0.0);
    }

    // -------------------------------------------------------------------------------------
@ -257,19 +257,6 @@ public class RecalDatum {
    //
    // -------------------------------------------------------------------------------------

-    /**
-     * Calculate and cache the empirical quality score from mismatches and observations (expensive operation)
-     */
-    @Requires("empiricalQuality == UNINITIALIZED")
-    @Ensures("empiricalQuality != UNINITIALIZED")
-    private synchronized void calcEmpiricalQuality() {
-
-        // TODO -- add code for Bayesian estimate of Qemp here
-
-        final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate());
-        empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE);
-    }
-
    /**
     * calculate the expected number of errors given the estimated Q reported and the number of observations
     * in this datum.
@ -281,10 +268,29 @@ public class RecalDatum {
        return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported);
    }

-    static final boolean DEBUG = false;
-    static final double RESOLUTION_BINS_PER_QUAL = 1.0;
+    /**
+     * Calculate and cache the empirical quality score from mismatches and observations (expensive operation)
+     */
+    @Requires("empiricalQuality == UNINITIALIZED")
+    @Ensures("empiricalQuality != UNINITIALIZED")
+    private synchronized void calcEmpiricalQuality() {

-    static public double bayesianEstimateOfEmpiricalQuality(final double nObservations, final double nErrors, final double QReported) {
+        // smoothing is one error and one non-error observation
+        final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT;
+        final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT;
+
+        final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, getEstimatedQReported());
+
+        // This is the old and busted point estimate approach:
+        //final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate());
+
+        empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE);
+    }
+
+    //static final boolean DEBUG = false;
+    static private final double RESOLUTION_BINS_PER_QUAL = 1.0;
+
+    static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) {

        final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL;

@ -294,14 +300,14 @@ public class RecalDatum {

            final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL;

-            log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10Likelihood(QEmpOfBin, nObservations, nErrors);
+            log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors);

-            if ( DEBUG )
-                System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin]));
+            //if ( DEBUG )
+            //    System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin]));
        }

-        if ( DEBUG )
-            System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors));
+        //if ( DEBUG )
+        //    System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors));

        final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors);
        final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors);
@ -310,35 +316,47 @@ public class RecalDatum {
        return Qemp;
    }

-    static final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1];
+    static private final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1];
    static {
        // f(x) = a + b*exp(-((x - c)^2 / (2*d^2)))
        // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell".
        final double GF_a = 0.0;
        final double GF_b = 0.9;
        final double GF_c = 0.0;
-        final double GF_d = 0.5;
+        final double GF_d = 0.5;   // with these parameters, deltas can shift at most ~20 Q points

        final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d);
-        for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ )
-            log10QempPriorCache[i] = Math.log10(gaussian.value((double) i));
+        for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) {
+            double log10Prior = Math.log10(gaussian.value((double) i));
+            if ( Double.isInfinite(log10Prior) )
+                log10Prior = -Double.MAX_VALUE;
+            log10QempPriorCache[i] = log10Prior;
+        }
    }

-    static public double log10QempPrior(final double Qempirical, final double Qreported) {
+    static protected double log10QempPrior(final double Qempirical, final double Qreported) {
        final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), QualityUtils.MAX_GATK_USABLE_Q_SCORE);
-        if ( DEBUG )
-            System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference]));
+        //if ( DEBUG )
+        //    System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference]));
        return log10QempPriorCache[difference];
    }

-    static public double log10Likelihood(final double Qempirical, final double nObservations, final double nErrors) {
+    static protected double log10QempLikelihood(final double Qempirical, final long nObservations, final long nErrors) {
+        if ( nObservations == 0 )
+            return 0.0;
+
        // this is just a straight binomial PDF
-        double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical));
-        if ( log10Prob == Double.NEGATIVE_INFINITY )
+        double log10Prob = MathUtils.log10BinomialProbability(longToInt(nObservations), longToInt(nErrors), QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical));
+        if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) )
            log10Prob = -Double.MAX_VALUE;

-        if ( DEBUG )
-            System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob));
+        //if ( DEBUG )
+        //    System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob));
+
        return log10Prob;
    }
+
+    static protected int longToInt(final long l) {
+        return (l > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)l;
+    }
 }
--- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java
+++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java
@ -264,7 +264,7 @@ public class RecalDatumNode<T extends RecalDatum> {
            for ( final RecalDatumNode<T> subnode : subnodes ) {
                // use the yates correction to help avoid all zeros => NaN
                counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L;
-                counts[i][1] = Math.round(subnode.getRecalDatum().getNumObservations()) + 2L;
+                counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L;
                i++;
            }

@ -320,7 +320,7 @@ public class RecalDatumNode<T extends RecalDatum> {

        if ( isLeaf() ) {
            // this is leave node
-            return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * recalDatum.getNumObservations();
+            return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations();
            // TODO -- how we can generalize this calculation?
 //            if ( this.qEnd <= minInterestingQual )
 //                // It's free to merge up quality scores below the smallest interesting one
--- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java
@ -93,7 +93,7 @@ public class RecalUtils {
    private static final Pair<String, String> eventType          = new Pair<String, String>(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s");
    private static final Pair<String, String> empiricalQuality   = new Pair<String, String>(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f");
    private static final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f");
-    private static final Pair<String, String> nObservations      = new Pair<String, String>(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%.2f");
+    private static final Pair<String, String> nObservations      = new Pair<String, String>(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
    private static final Pair<String, String> nErrors            = new Pair<String, String>(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f");

    /**
--- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java
+++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java
@ -68,15 +68,6 @@ public class RecalibrationReport {

    }

-    protected RecalibrationReport(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) {
-        this.quantizationInfo = quantizationInfo;
-        this.recalibrationTables = recalibrationTables;
-        this.requestedCovariates = requestedCovariates;
-        this.argumentTable = argumentTable;
-        this.RAC = RAC;
-        this.optionalCovariateIndexes = null;
-    }
-
    /**
     * Counts the number of unique read groups in the table
     *
@ -192,11 +183,22 @@ public class RecalibrationReport {
        else if ( o instanceof Long )
            return (Long)o;
        else
-            throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass());
+            throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but it's not either: " + o.getClass());
+    }
+
+    private long asLong(final Object o) {
+        if ( o instanceof Long )
+            return (Long)o;
+        else if ( o instanceof Integer )
+            return ((Integer)o).longValue();
+        else if ( o instanceof Double )
+            return ((Double)o).longValue();
+        else
+            throw new ReviewedStingException("Object " + o + " is expected to be a long but it's not: " + o.getClass());
    }

    private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) {
-        final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
+        final long nObservations = asLong(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
        final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME));
        final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME));

--- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java
@ -30,16 +30,13 @@ package org.broadinstitute.sting.utils.recalibration;


 import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.QualityUtils;
-import org.broadinstitute.sting.utils.Utils;
 import org.testng.Assert;
-import org.testng.annotations.BeforeSuite;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;

-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.List;


 public class RecalDatumUnitTest extends BaseTest {
@ -74,7 +71,7 @@ public class RecalDatumUnitTest extends BaseTest {
        }

        public RecalDatum makeRecalDatum() {
-            return new RecalDatum(exTotal, exError, (byte)getReportedQual());
+            return new RecalDatum((long)exTotal, (double)exError, (byte)getReportedQual());
        }

        @Override
@ -83,13 +80,19 @@ public class RecalDatumUnitTest extends BaseTest {
        }
    }

+    private static boolean createdDatumTestProviders = false;
+
    @DataProvider(name = "RecalDatumTestProvider")
    public Object[][] makeRecalDatumTestProvider() {
-        for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) )
-            for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) )
-                for ( int reportedQual : Arrays.asList(10, 20) )
-                    if ( E <= N )
-                        new RecalDatumTestProvider(E, N, reportedQual);
+        if ( !createdDatumTestProviders ) {
+            for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) )
+                for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) )
+                    for ( int reportedQual : Arrays.asList(10, 20) )
+                        if ( E <= N )
+                            new RecalDatumTestProvider(E, N, reportedQual);
+            createdDatumTestProviders = true;
+        }
+
        return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class);
    }

@ -104,7 +107,6 @@ public class RecalDatumUnitTest extends BaseTest {
        Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6);
        if ( cfg.getReportedQual() != -1 )
            Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual());
-        BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalQuality(), cfg.getErrorRatePhredScaled());
        BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate());

        final double e = datum.getEmpiricalQuality();
@ -175,7 +177,87 @@ public class RecalDatumUnitTest extends BaseTest {

    @Test
    public void testNoObs() {
-        final RecalDatum rd = new RecalDatum(0, 0, (byte)10);
+        final RecalDatum rd = new RecalDatum(0L, 0.0, (byte)10);
        Assert.assertEquals(rd.getEmpiricalErrorRate(), 0.0);
    }
+
+    @Test
+    public void testlog10QempPrior() {
+        for ( int Qemp = 0; Qemp <= QualityUtils.MAX_QUAL_SCORE; Qemp++ ) {
+            for ( int Qrep = 0; Qrep <= QualityUtils.MAX_QUAL_SCORE; Qrep++ ) {
+                final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep);
+                Assert.assertTrue(log10prior < 0.0);
+                Assert.assertFalse(Double.isInfinite(log10prior));
+                Assert.assertFalse(Double.isNaN(log10prior));
+            }
+        }
+
+        final int Qrep = 20;
+        int maxQemp = -1;
+        double maxQempValue = -Double.MAX_VALUE;
+        for ( int Qemp = 0; Qemp <= QualityUtils.MAX_QUAL_SCORE; Qemp++ ) {
+            final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep);
+            if ( log10prior > maxQempValue ) {
+                maxQemp = Qemp;
+                maxQempValue = log10prior;
+            }
+        }
+        Assert.assertEquals(maxQemp, Qrep);
+    }
+
+    @Test
+    public void testlog10QempLikelihood() {
+
+        final int Qrep = 20;
+
+        // test no shift
+        Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(0, 0, Qrep), (double)Qrep);
+        Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 0, Qrep), (double)Qrep);
+        Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 10, Qrep), (double)Qrep);
+
+        // test small shift
+        Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 10, Qrep), Qrep - 1.0);
+        Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 0, Qrep), Qrep + 1.0);
+
+        // test medium shift
+        Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 0, Qrep), Qrep + 3.0);
+        Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 10, Qrep), Qrep + 3.0);
+
+        // test large shift
+        Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(100000, 10, Qrep), Qrep + 8.0);
+        Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000000, 10, Qrep), Qrep + 16.0);
+    }
+
+    @Test
+    public void testBayesianEstimateOfEmpiricalQuality() {
+
+        final double[] Qemps = new double[] { 0.0, 10.0, 20.0, 30.0 };
+        final int[] observations = new int[] {0, 10, 1000, 1000000};
+        final int[] errors = new int[] {0, 10, 1000, 1000000};
+
+        for ( double Qemp : Qemps ) {
+            for ( int observation : observations ) {
+                for ( int error : errors ) {
+                    if ( error > observation )
+                        continue;
+
+                    final double log10likelihood = RecalDatum.log10QempLikelihood(Qemp, observation, error);
+                    Assert.assertTrue(observation == 0 ? MathUtils.compareDoubles(log10likelihood, 0.0) == 0 : log10likelihood < 0.0);
+                    Assert.assertFalse(Double.isInfinite(log10likelihood));
+                    Assert.assertFalse(Double.isNaN(log10likelihood));
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testLongToInt() {
+        long l = new Long((long)Integer.MAX_VALUE);
+        int i = RecalDatum.longToInt(l);
+        Assert.assertEquals(i, Integer.MAX_VALUE);
+
+        l++;
+        i = RecalDatum.longToInt(l);
+        Assert.assertEquals(i, Integer.MAX_VALUE);
+    }
 }
--- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java
@ -146,7 +146,7 @@ public final class RecalUtilsUnitTest extends BaseTest {
    public NestedIntegerArray<RecalDatum> makeTable(final List<Row> rows) {
        final NestedIntegerArray<RecalDatum> x = new NestedIntegerArray<RecalDatum>(3, 3);
        for ( final Row r : rows )
-            x.put(new RecalDatum(r.no, r.ne, (byte)10), r.rg, r.qual);
+            x.put(new RecalDatum((long)r.no, (double)r.ne, (byte)10), r.rg, r.qual);
        return x;
    }
 }
--- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java
@ -20,9 +20,11 @@ public class RecalibrationReportUnitTest {
    private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) {
        final Random random = new Random();
        final int nObservations = random.nextInt(maxObservations);
-        final int nErrors = random.nextInt(maxErrors);
+        int nErrors = random.nextInt(maxErrors);
+        while ( nErrors > nObservations )
+            nErrors = random.nextInt(maxErrors);
        final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE);
-        return new RecalDatum(nObservations, nErrors, (byte)qual);
+        return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual);
    }

    @Test(enabled = true)