The Empirical quality column in the recalibration report can't be compared in the BQSRGatherer because the value is calculated using the Bayesian estimate with different priors. This value should never be used from a recalibration report anyway except during plotting.

This commit is contained in:
Ryan Poplin 2013-01-30 12:28:14 -05:00
parent 59311aeea2
commit 2967776458
5 changed files with 39 additions and 5 deletions

View File

@ -188,7 +188,7 @@ public class BaseRecalibration {
}
}
protected double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List<RecalDatum> empiricalQualCovs ) {
protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List<RecalDatum> empiricalQualCovs ) {
final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon );
final double deltaQReported = ( empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) );
double deltaQCovariates = 0.0;

View File

@ -181,6 +181,7 @@ public class RecalDatum {
if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN");
this.estimatedQReported = estimatedQReported;
empiricalQuality = UNINITIALIZED;
}
public final double getEstimatedQReported() {

View File

@ -131,7 +131,7 @@ public class RecalibrationReport {
* Combines two recalibration reports by adding all observations and errors
*
* Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate
* them after combining. The reason for not calculating it is because this function is inteded for combining a
* them after combining. The reason for not calculating it is because this function is intended for combining a
* series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized
* qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate,
* makes this method faster

View File

@ -125,19 +125,19 @@ public class BQSRGathererUnitTest extends BaseTest {
testTablesWithColumns(originalTable, calculatedTable, columnsToTest);
// test the RecalTable0 table
columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
originalTable = originalReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
calculatedTable = calculatedReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE);
testTablesWithColumns(originalTable, calculatedTable, columnsToTest);
// test the RecalTable1 table
columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
originalTable = originalReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE);
calculatedTable = calculatedReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE);
testTablesWithColumns(originalTable, calculatedTable, columnsToTest);
// test the RecalTable2 table
columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.COVARIATE_VALUE_COLUMN_NAME, RecalUtils.COVARIATE_NAME_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.COVARIATE_VALUE_COLUMN_NAME, RecalUtils.COVARIATE_NAME_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME);
originalTable = originalReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
calculatedTable = calculatedReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE);
testTablesWithColumns(originalTable, calculatedTable, columnsToTest);

View File

@ -58,6 +58,7 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.Collections;
public class RecalDatumUnitTest extends BaseTest {
@ -277,4 +278,36 @@ public class RecalDatumUnitTest extends BaseTest {
Assert.assertFalse(Double.isInfinite(log10likelihood));
Assert.assertFalse(Double.isNaN(log10likelihood));
}
@Test
public void basicHierarchicalBayesianQualityEstimateTest() {
{
double RG_Q = 45.0;
RecalDatum RG = new RecalDatum( (long)10000000, (long) (10000000 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q);
double Q = 30.0;
RecalDatum QS = new RecalDatum( (long)10000000, (long) (10000000 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q);
RecalDatum COV = new RecalDatum( (long)150, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality
Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 45.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 );
// initial epsilon condition shouldn't matter when there are a lot of observations
Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 25.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 );
Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 5.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 );
Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 65.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 );
}
{
double RG_Q = 45.0;
RecalDatum RG = new RecalDatum( (long)100, (long) (100 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q);
double Q = 30.0;
RecalDatum QS = new RecalDatum( (long)100, (long) (100 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q);
RecalDatum COV = new RecalDatum( (long)150, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality
// initial epsilon condition dominates when there is no data
Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 45.0, RG, QS, Collections.singletonList(COV)), 45.0, 1E-4 );
Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 25.0, RG, QS, Collections.singletonList(COV)), 25.0, 1E-4 );
Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 5.0, RG, QS, Collections.singletonList(COV)), 5.0, 1E-4 );
Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 65.0, RG, QS, Collections.singletonList(COV)), 65.0, 1E-4 );
}
}
}