From b4e7b3d691684faf193499a78634456930a92f03 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 7 Jan 2013 13:07:36 -0500 Subject: [PATCH] Fixed precision problem in the Bayesian calculation of Qemp: we need to cap below max integer because the MathUtils code add +1. Added unit tests for handling large number of observations. --- .../sting/utils/QualityUtils.java | 4 ++++ .../sting/utils/recalibration/RecalDatum.java | 10 ++++---- .../walkers/bqsr/BQSRGathererUnitTest.java | 24 +++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 844fa6b90..96fd0250c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -90,6 +90,10 @@ public class QualityUtils { return qualToErrorProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. } + static public double qualToErrorProbLog10(final double qual) { + return qual/-10.0; + } + /** * Convert a probability to a quality score. Note, this is capped at Q40. * diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index eebc86b6b..7bd21ddaa 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -341,6 +341,8 @@ public class RecalDatum { return log10QempPriorCache[difference]; } + static private final long MAX_NUMBER_OF_OBSERVATIONS = Integer.MAX_VALUE - 1; + static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) { if ( nObservations == 0 ) return 0.0; @@ -348,15 +350,15 @@ public class RecalDatum { // the binomial code requires ints as input (because it does caching). This should theoretically be fine because // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow // before casting down to an int. - if ( nObservations > Integer.MAX_VALUE ) { + if ( nObservations > MAX_NUMBER_OF_OBSERVATIONS ) { // we need to decrease nErrors by the same fraction that we are decreasing nObservations - final double fraction = (double)Integer.MAX_VALUE / (double)nObservations; + final double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations; nErrors = Math.round((double)nErrors * fraction); - nObservations = Integer.MAX_VALUE; + nObservations = MAX_NUMBER_OF_OBSERVATIONS; } // this is just a straight binomial PDF - double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); + double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10(Qempirical)); if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) ) log10Prob = -Double.MAX_VALUE; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java index 44bc7de5f..538c027d9 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -26,6 +26,25 @@ public class BQSRGathererUnitTest extends BaseTest { private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table"); + @Test(enabled = true) + public void testManyObservations() { + File recal = new File(privateTestDir + "bqsr.manyObservations.piece.table"); + + final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); + + List recalFiles = new LinkedList (); + for ( int i=0; i < 5; i++ ) + recalFiles.add(recal); + + BQSRGatherer gatherer = new BQSRGatherer(); + gatherer.gather(recalFiles, output); + + GATKReport originalReport = new GATKReport(new File(privateTestDir + "bqsr.manyObservations.full.table")); + GATKReport calculatedReport = new GATKReport(output); + + testReports(originalReport, calculatedReport); + } + @Test(enabled = true) public void testGatherBQSR() { BQSRGatherer gatherer = new BQSRGatherer(); @@ -42,6 +61,11 @@ public class BQSRGathererUnitTest extends BaseTest { GATKReport originalReport = new GATKReport(recal_original); GATKReport calculatedReport = new GATKReport(output); + testReports(originalReport, calculatedReport); + } + + private void testReports(final GATKReport originalReport, final GATKReport calculatedReport) { + // test the Arguments table List columnsToTest = Arrays.asList(RecalUtils.ARGUMENT_COLUMN_NAME, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); GATKReportTable originalTable = originalReport.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);