Fixed precision problem in the Bayesian calculation of Qemp: we need to cap below max integer because the MathUtils code add +1.

Added unit tests for handling large number of observations.
2013-01-07 13:07:36 -05:00 · 2013-01-07 13:07:36 -05:00 · b4e7b3d691
parent 04e3978b04
commit b4e7b3d691
3 changed files with 34 additions and 4 deletions
--- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
@ -90,6 +90,10 @@ public class QualityUtils {
        return qualToErrorProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
    }

+    static public double qualToErrorProbLog10(final double qual) {
+        return qual/-10.0;
+    }
+
    /**
     * Convert a probability to a quality score.  Note, this is capped at Q40.
     *
--- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java
+++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java
@ -341,6 +341,8 @@ public class RecalDatum {
        return log10QempPriorCache[difference];
    }

+    static private final long MAX_NUMBER_OF_OBSERVATIONS = Integer.MAX_VALUE - 1;
+
    static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) {
        if ( nObservations == 0 )
            return 0.0;
@ -348,15 +350,15 @@ public class RecalDatum {
        // the binomial code requires ints as input (because it does caching).  This should theoretically be fine because
        // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow
        // before casting down to an int.
-        if ( nObservations > Integer.MAX_VALUE ) {
+        if ( nObservations > MAX_NUMBER_OF_OBSERVATIONS ) {
            // we need to decrease nErrors by the same fraction that we are decreasing nObservations
-            final double fraction = (double)Integer.MAX_VALUE / (double)nObservations;
+            final double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations;
            nErrors = Math.round((double)nErrors * fraction);
-            nObservations = Integer.MAX_VALUE;
+            nObservations = MAX_NUMBER_OF_OBSERVATIONS;
        }

        // this is just a straight binomial PDF
-        double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical));
+        double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10(Qempirical));
        if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) )
            log10Prob = -Double.MAX_VALUE;

--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java
@ -26,6 +26,25 @@ public class BQSRGathererUnitTest extends BaseTest {

    private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table");

+    @Test(enabled = true)
+    public void testManyObservations() {
+        File recal = new File(privateTestDir + "bqsr.manyObservations.piece.table");
+
+        final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
+
+        List<File> recalFiles = new LinkedList<File> ();
+        for ( int i=0; i < 5; i++ )
+            recalFiles.add(recal);
+
+        BQSRGatherer gatherer = new BQSRGatherer();
+        gatherer.gather(recalFiles, output);
+
+        GATKReport originalReport = new GATKReport(new File(privateTestDir + "bqsr.manyObservations.full.table"));
+        GATKReport calculatedReport = new GATKReport(output);
+
+        testReports(originalReport, calculatedReport);
+    }
+
    @Test(enabled = true)
    public void testGatherBQSR() {
        BQSRGatherer gatherer = new BQSRGatherer();
@ -42,6 +61,11 @@ public class BQSRGathererUnitTest extends BaseTest {
        GATKReport originalReport = new GATKReport(recal_original);
        GATKReport calculatedReport = new GATKReport(output);

+        testReports(originalReport, calculatedReport);
+    }
+
+    private void testReports(final GATKReport originalReport, final GATKReport calculatedReport) {
+
        // test the Arguments table
        List<String> columnsToTest = Arrays.asList(RecalUtils.ARGUMENT_COLUMN_NAME, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
        GATKReportTable originalTable = originalReport.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);