The Empirical quality column in the recalibration report can't be compared in the BQSRGatherer because the value is calculated using the Bayesian estimate with different priors. This value should never be used from a recalibration report anyway except during plotting.

2013-01-30 12:28:14 -05:00 · 2013-01-30 12:28:14 -05:00 · 2967776458
parent 59311aeea2
commit 2967776458
5 changed files with 39 additions and 5 deletions
--- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java
@ -188,7 +188,7 @@ public class BaseRecalibration {
        }
    }

-    protected double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List<RecalDatum> empiricalQualCovs ) {
+    protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List<RecalDatum> empiricalQualCovs ) {
        final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon );
        final double deltaQReported = ( empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) );
        double deltaQCovariates = 0.0;
--- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java
@ -181,6 +181,7 @@ public class RecalDatum {
        if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN");

        this.estimatedQReported = estimatedQReported;
+        empiricalQuality = UNINITIALIZED;
    }

    public final double getEstimatedQReported() {
--- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java
+++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java
@ -131,7 +131,7 @@ public class RecalibrationReport {
    * Combines two recalibration reports by adding all observations and errors
    *
    * Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate
-    * them after combining. The reason for not calculating it is because this function is inteded for combining a
+    * them after combining. The reason for not calculating it is because this function is intended for combining a
    * series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized
    * qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate,
    * makes this method faster
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java
@ -125,19 +125,19 @@ public class BQSRGathererUnitTest extends BaseTest {
        testTablesWithColumns(originalTable, calculatedTable, columnsToTest);

        // test the RecalTable0 table
-        columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
+        columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
        originalTable = originalReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
        calculatedTable = calculatedReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
        testTablesWithColumns(originalTable, calculatedTable, columnsToTest);

        // test the RecalTable1 table
-        columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
+        columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
        originalTable = originalReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE);
        calculatedTable = calculatedReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE);
        testTablesWithColumns(originalTable, calculatedTable, columnsToTest);

        // test the RecalTable2 table
-        columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.COVARIATE_VALUE_COLUMN_NAME, RecalUtils.COVARIATE_NAME_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
+        columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.COVARIATE_VALUE_COLUMN_NAME, RecalUtils.COVARIATE_NAME_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
        originalTable = originalReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
        calculatedTable = calculatedReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
        testTablesWithColumns(originalTable, calculatedTable, columnsToTest);
--- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java
@ -58,6 +58,7 @@ import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;

 import java.util.Arrays;
+import java.util.Collections;


 public class RecalDatumUnitTest extends BaseTest {
@ -277,4 +278,36 @@ public class RecalDatumUnitTest extends BaseTest {
        Assert.assertFalse(Double.isInfinite(log10likelihood));
        Assert.assertFalse(Double.isNaN(log10likelihood));
    }
+
+    @Test
+    public void basicHierarchicalBayesianQualityEstimateTest() {
+        {
+            double RG_Q = 45.0;
+            RecalDatum RG = new RecalDatum( (long)10000000, (long) (10000000 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q);
+            double Q = 30.0;
+            RecalDatum QS = new RecalDatum( (long)10000000, (long) (10000000 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q);
+            RecalDatum COV = new RecalDatum( (long)150, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality
+
+            Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 45.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 );
+            // initial epsilon condition shouldn't matter when there are a lot of observations
+            Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 25.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 );
+            Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 5.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 );
+            Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 65.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 );
+        }
+
+        {
+            double RG_Q = 45.0;
+            RecalDatum RG = new RecalDatum( (long)100, (long) (100 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q);
+            double Q = 30.0;
+            RecalDatum QS = new RecalDatum( (long)100, (long) (100 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q);
+            RecalDatum COV = new RecalDatum( (long)150, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality
+
+            // initial epsilon condition dominates when there is no data
+            Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 45.0, RG, QS, Collections.singletonList(COV)), 45.0, 1E-4 );
+            Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 25.0, RG, QS, Collections.singletonList(COV)), 25.0, 1E-4 );
+            Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 5.0, RG, QS, Collections.singletonList(COV)), 5.0, 1E-4 );
+            Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 65.0, RG, QS, Collections.singletonList(COV)), 65.0, 1E-4 );
+        }
+
+    }
 }