Fixed precision problem in the Bayesian calculation of Qemp: we need to cap below max integer because the MathUtils code add +1.

Added unit tests for handling large number of observations.
This commit is contained in:
Eric Banks 2013-01-07 13:07:36 -05:00
parent 04e3978b04
commit b4e7b3d691
3 changed files with 34 additions and 4 deletions

View File

@ -90,6 +90,10 @@ public class QualityUtils {
return qualToErrorProbLog10Cache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
}
static public double qualToErrorProbLog10(final double qual) {
return qual/-10.0;
}
/**
* Convert a probability to a quality score. Note, this is capped at Q40.
*

View File

@ -341,6 +341,8 @@ public class RecalDatum {
return log10QempPriorCache[difference];
}
static private final long MAX_NUMBER_OF_OBSERVATIONS = Integer.MAX_VALUE - 1;
static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) {
if ( nObservations == 0 )
return 0.0;
@ -348,15 +350,15 @@ public class RecalDatum {
// the binomial code requires ints as input (because it does caching). This should theoretically be fine because
// there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow
// before casting down to an int.
if ( nObservations > Integer.MAX_VALUE ) {
if ( nObservations > MAX_NUMBER_OF_OBSERVATIONS ) {
// we need to decrease nErrors by the same fraction that we are decreasing nObservations
final double fraction = (double)Integer.MAX_VALUE / (double)nObservations;
final double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations;
nErrors = Math.round((double)nErrors * fraction);
nObservations = Integer.MAX_VALUE;
nObservations = MAX_NUMBER_OF_OBSERVATIONS;
}
// this is just a straight binomial PDF
double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical));
double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10(Qempirical));
if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) )
log10Prob = -Double.MAX_VALUE;

View File

@ -26,6 +26,25 @@ public class BQSRGathererUnitTest extends BaseTest {
private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table");
@Test(enabled = true)
public void testManyObservations() {
File recal = new File(privateTestDir + "bqsr.manyObservations.piece.table");
final File output = BaseTest.createTempFile("BQSRgathererTest", ".table");
List<File> recalFiles = new LinkedList<File> ();
for ( int i=0; i < 5; i++ )
recalFiles.add(recal);
BQSRGatherer gatherer = new BQSRGatherer();
gatherer.gather(recalFiles, output);
GATKReport originalReport = new GATKReport(new File(privateTestDir + "bqsr.manyObservations.full.table"));
GATKReport calculatedReport = new GATKReport(output);
testReports(originalReport, calculatedReport);
}
@Test(enabled = true)
public void testGatherBQSR() {
BQSRGatherer gatherer = new BQSRGatherer();
@ -42,6 +61,11 @@ public class BQSRGathererUnitTest extends BaseTest {
GATKReport originalReport = new GATKReport(recal_original);
GATKReport calculatedReport = new GATKReport(output);
testReports(originalReport, calculatedReport);
}
private void testReports(final GATKReport originalReport, final GATKReport calculatedReport) {
// test the Arguments table
List<String> columnsToTest = Arrays.asList(RecalUtils.ARGUMENT_COLUMN_NAME, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME);
GATKReportTable originalTable = originalReport.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE);