Hooking up the Bayesian estimate code for calculating Qemp in BQSR; various fixes after adding unit tests.
This commit is contained in:
parent
c7039a9b71
commit
dd7f5e2be7
|
|
@ -53,21 +53,21 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
|
||||
String HiSeqInterval = "chr1:10,000,000-10,100,000";
|
||||
return new Object[][]{
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "2f250fecb930e0dfe0f63fe0fed3960b")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "26c8d7226139a040557b1d3b1c8792f0")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "9b43a1839cb6ea03aec1d96f15ca8efb")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "3159a9d136c45e4a65d46a23dc8fd3b5")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "bb7262829effbbdbc8d88dd36f480368")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "fbb002fa2b9197c4b555852dccc11562")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "7392acb71131a60a527ca32715fc59be")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "49d4383896a90795d94138db1410a7df")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "427448eff98cf194cc7217c0b1401e79")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "50cd1a10b6ecb3d09f90f1e4a66da95d")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "1dc71561c9d0fb56f9876cb5043c5376")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "13e8f032e76340b114847c90af0a1f8a")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "03f58ae4f9d203034e895a3636fc108f")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "49d4383896a90795d94138db1410a7df")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "2db2ef8c2d63e167663d70340182f49a")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "6b3f252718f59cf9fd3f7612f73a35bf")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "863576ac9ff0b0e02f2e84aef15923a7")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "03e28f48201a35c70d1cf48e9f45364f")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "6e3c5635d387a1c428a7c9c88ad26488")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "6507adcb94bacde4cdee9caa9f14f24b")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "399bbb4bf80764dfc644b2f95d824615")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "34d70899253c2b3343ca9ae944291c30")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "e61fa47bfc08433f0cd55558e2081548")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "5c2622c63225b8b04990baf0ae4de07c")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "ee7191d83d7d5bb957dc4595883c32f1")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "da92f4730356f479c2c2b71497cfac6d")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "8075595113b48c0c7ead08ce41bef9fe")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "be05834841c5690c66910270521d5c32")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "e61fa47bfc08433f0cd55558e2081548")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "8ee0b498dbbc95ce76393a0f089fec92")},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -104,7 +104,7 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
" -sortAllCols" +
|
||||
" --plot_pdf_file /dev/null" +
|
||||
" --intermediate_csv_file %s",
|
||||
Arrays.asList("d1c38a3418979400630e2bca1140689c"));
|
||||
Arrays.asList("dd6e0e1e3f53f8ae0c8f5de21ded6ee9"));
|
||||
executeTest("testBQSR-CSVfile", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
|
|||
* @return a new RecalDatum object with the observation and the error
|
||||
*/
|
||||
protected RecalDatum createDatumObject(final byte reportedQual, final double isError) {
|
||||
return new RecalDatum(1, isError, reportedQual);
|
||||
return new RecalDatum(1L, isError, reportedQual);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -133,12 +133,12 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
|
|||
if ( ! table.put(createDatumObject(qual, isError), keys) ) {
|
||||
// Failed to put a new item because another thread came along and put an item here first.
|
||||
// Get the newly-put item and increment it (item is guaranteed to exist at this point)
|
||||
table.get(keys).increment(1.0, isError);
|
||||
table.get(keys).increment(1, isError);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Easy case: already an item here, so increment it
|
||||
existingDatum.increment(1.0, isError);
|
||||
existingDatum.increment(1, isError);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ public class RecalDatum {
|
|||
/**
|
||||
* number of bases seen in total
|
||||
*/
|
||||
private double numObservations;
|
||||
private long numObservations;
|
||||
|
||||
/**
|
||||
* number of bases seen that didn't match the reference
|
||||
|
|
@ -89,13 +89,13 @@ public class RecalDatum {
|
|||
/**
|
||||
* Create a new RecalDatum with given observation and mismatch counts, and an reported quality
|
||||
*
|
||||
* @param _numObservations
|
||||
* @param _numMismatches
|
||||
* @param reportedQuality
|
||||
* @param _numObservations observations
|
||||
* @param _numMismatches mismatches
|
||||
* @param reportedQuality Qreported
|
||||
*/
|
||||
public RecalDatum(final double _numObservations, final double _numMismatches, final byte reportedQuality) {
|
||||
public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) {
|
||||
if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0");
|
||||
if ( _numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0");
|
||||
if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0");
|
||||
if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0");
|
||||
|
||||
numObservations = _numObservations;
|
||||
|
|
@ -106,7 +106,7 @@ public class RecalDatum {
|
|||
|
||||
/**
|
||||
* Copy copy into this recal datum, overwriting all of this objects data
|
||||
* @param copy
|
||||
* @param copy RecalDatum to copy
|
||||
*/
|
||||
public RecalDatum(final RecalDatum copy) {
|
||||
this.numObservations = copy.getNumObservations();
|
||||
|
|
@ -119,7 +119,7 @@ public class RecalDatum {
|
|||
* Add in all of the data from other into this object, updating the reported quality from the expected
|
||||
* error rate implied by the two reported qualities
|
||||
*
|
||||
* @param other
|
||||
* @param other RecalDatum to combine
|
||||
*/
|
||||
public synchronized void combine(final RecalDatum other) {
|
||||
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
|
||||
|
|
@ -192,7 +192,7 @@ public class RecalDatum {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("%.2f,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
|
||||
return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality());
|
||||
}
|
||||
|
||||
public String stringForCSV() {
|
||||
|
|
@ -205,11 +205,11 @@ public class RecalDatum {
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final double getNumObservations() {
|
||||
public final long getNumObservations() {
|
||||
return numObservations;
|
||||
}
|
||||
|
||||
public final synchronized void setNumObservations(final double numObservations) {
|
||||
public final synchronized void setNumObservations(final long numObservations) {
|
||||
if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0");
|
||||
this.numObservations = numObservations;
|
||||
empiricalQuality = UNINITIALIZED;
|
||||
|
|
@ -227,7 +227,7 @@ public class RecalDatum {
|
|||
}
|
||||
|
||||
@Requires({"by >= 0"})
|
||||
public final synchronized void incrementNumObservations(final double by) {
|
||||
public final synchronized void incrementNumObservations(final long by) {
|
||||
numObservations += by;
|
||||
empiricalQuality = UNINITIALIZED;
|
||||
}
|
||||
|
|
@ -240,7 +240,7 @@ public class RecalDatum {
|
|||
|
||||
@Requires({"incObservations >= 0", "incMismatches >= 0"})
|
||||
@Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"})
|
||||
public final synchronized void increment(final double incObservations, final double incMismatches) {
|
||||
public final synchronized void increment(final long incObservations, final double incMismatches) {
|
||||
numObservations += incObservations;
|
||||
numMismatches += incMismatches;
|
||||
empiricalQuality = UNINITIALIZED;
|
||||
|
|
@ -248,7 +248,7 @@ public class RecalDatum {
|
|||
|
||||
@Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"})
|
||||
public final synchronized void increment(final boolean isError) {
|
||||
increment(1, isError ? 1 : 0.0);
|
||||
increment(1, isError ? 1.0 : 0.0);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
|
@ -257,19 +257,6 @@ public class RecalDatum {
|
|||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Calculate and cache the empirical quality score from mismatches and observations (expensive operation)
|
||||
*/
|
||||
@Requires("empiricalQuality == UNINITIALIZED")
|
||||
@Ensures("empiricalQuality != UNINITIALIZED")
|
||||
private synchronized void calcEmpiricalQuality() {
|
||||
|
||||
// TODO -- add code for Bayesian estimate of Qemp here
|
||||
|
||||
final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate());
|
||||
empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE);
|
||||
}
|
||||
|
||||
/**
|
||||
* calculate the expected number of errors given the estimated Q reported and the number of observations
|
||||
* in this datum.
|
||||
|
|
@ -281,10 +268,29 @@ public class RecalDatum {
|
|||
return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported);
|
||||
}
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
static final double RESOLUTION_BINS_PER_QUAL = 1.0;
|
||||
/**
|
||||
* Calculate and cache the empirical quality score from mismatches and observations (expensive operation)
|
||||
*/
|
||||
@Requires("empiricalQuality == UNINITIALIZED")
|
||||
@Ensures("empiricalQuality != UNINITIALIZED")
|
||||
private synchronized void calcEmpiricalQuality() {
|
||||
|
||||
static public double bayesianEstimateOfEmpiricalQuality(final double nObservations, final double nErrors, final double QReported) {
|
||||
// smoothing is one error and one non-error observation
|
||||
final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT;
|
||||
final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT;
|
||||
|
||||
final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, getEstimatedQReported());
|
||||
|
||||
// This is the old and busted point estimate approach:
|
||||
//final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate());
|
||||
|
||||
empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE);
|
||||
}
|
||||
|
||||
//static final boolean DEBUG = false;
|
||||
static private final double RESOLUTION_BINS_PER_QUAL = 1.0;
|
||||
|
||||
static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) {
|
||||
|
||||
final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL;
|
||||
|
||||
|
|
@ -294,14 +300,14 @@ public class RecalDatum {
|
|||
|
||||
final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL;
|
||||
|
||||
log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10Likelihood(QEmpOfBin, nObservations, nErrors);
|
||||
log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors);
|
||||
|
||||
if ( DEBUG )
|
||||
System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin]));
|
||||
//if ( DEBUG )
|
||||
// System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin]));
|
||||
}
|
||||
|
||||
if ( DEBUG )
|
||||
System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors));
|
||||
//if ( DEBUG )
|
||||
// System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors));
|
||||
|
||||
final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors);
|
||||
final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors);
|
||||
|
|
@ -310,35 +316,47 @@ public class RecalDatum {
|
|||
return Qemp;
|
||||
}
|
||||
|
||||
static final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1];
|
||||
static private final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1];
|
||||
static {
|
||||
// f(x) = a + b*exp(-((x - c)^2 / (2*d^2)))
|
||||
// Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell".
|
||||
final double GF_a = 0.0;
|
||||
final double GF_b = 0.9;
|
||||
final double GF_c = 0.0;
|
||||
final double GF_d = 0.5;
|
||||
final double GF_d = 0.5; // with these parameters, deltas can shift at most ~20 Q points
|
||||
|
||||
final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d);
|
||||
for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ )
|
||||
log10QempPriorCache[i] = Math.log10(gaussian.value((double) i));
|
||||
for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) {
|
||||
double log10Prior = Math.log10(gaussian.value((double) i));
|
||||
if ( Double.isInfinite(log10Prior) )
|
||||
log10Prior = -Double.MAX_VALUE;
|
||||
log10QempPriorCache[i] = log10Prior;
|
||||
}
|
||||
}
|
||||
|
||||
static public double log10QempPrior(final double Qempirical, final double Qreported) {
|
||||
static protected double log10QempPrior(final double Qempirical, final double Qreported) {
|
||||
final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), QualityUtils.MAX_GATK_USABLE_Q_SCORE);
|
||||
if ( DEBUG )
|
||||
System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference]));
|
||||
//if ( DEBUG )
|
||||
// System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference]));
|
||||
return log10QempPriorCache[difference];
|
||||
}
|
||||
|
||||
static public double log10Likelihood(final double Qempirical, final double nObservations, final double nErrors) {
|
||||
static protected double log10QempLikelihood(final double Qempirical, final long nObservations, final long nErrors) {
|
||||
if ( nObservations == 0 )
|
||||
return 0.0;
|
||||
|
||||
// this is just a straight binomial PDF
|
||||
double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical));
|
||||
if ( log10Prob == Double.NEGATIVE_INFINITY )
|
||||
double log10Prob = MathUtils.log10BinomialProbability(longToInt(nObservations), longToInt(nErrors), QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical));
|
||||
if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) )
|
||||
log10Prob = -Double.MAX_VALUE;
|
||||
|
||||
if ( DEBUG )
|
||||
System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob));
|
||||
//if ( DEBUG )
|
||||
// System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob));
|
||||
|
||||
return log10Prob;
|
||||
}
|
||||
|
||||
static protected int longToInt(final long l) {
|
||||
return (l > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)l;
|
||||
}
|
||||
}
|
||||
|
|
@ -264,7 +264,7 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
for ( final RecalDatumNode<T> subnode : subnodes ) {
|
||||
// use the yates correction to help avoid all zeros => NaN
|
||||
counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L;
|
||||
counts[i][1] = Math.round(subnode.getRecalDatum().getNumObservations()) + 2L;
|
||||
counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L;
|
||||
i++;
|
||||
}
|
||||
|
||||
|
|
@ -320,7 +320,7 @@ public class RecalDatumNode<T extends RecalDatum> {
|
|||
|
||||
if ( isLeaf() ) {
|
||||
// this is leave node
|
||||
return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * recalDatum.getNumObservations();
|
||||
return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations();
|
||||
// TODO -- how we can generalize this calculation?
|
||||
// if ( this.qEnd <= minInterestingQual )
|
||||
// // It's free to merge up quality scores below the smallest interesting one
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ public class RecalUtils {
|
|||
private static final Pair<String, String> eventType = new Pair<String, String>(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s");
|
||||
private static final Pair<String, String> empiricalQuality = new Pair<String, String>(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f");
|
||||
private static final Pair<String, String> estimatedQReported = new Pair<String, String>(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f");
|
||||
private static final Pair<String, String> nObservations = new Pair<String, String>(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%.2f");
|
||||
private static final Pair<String, String> nObservations = new Pair<String, String>(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d");
|
||||
private static final Pair<String, String> nErrors = new Pair<String, String>(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f");
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -68,15 +68,6 @@ public class RecalibrationReport {
|
|||
|
||||
}
|
||||
|
||||
protected RecalibrationReport(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) {
|
||||
this.quantizationInfo = quantizationInfo;
|
||||
this.recalibrationTables = recalibrationTables;
|
||||
this.requestedCovariates = requestedCovariates;
|
||||
this.argumentTable = argumentTable;
|
||||
this.RAC = RAC;
|
||||
this.optionalCovariateIndexes = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts the number of unique read groups in the table
|
||||
*
|
||||
|
|
@ -192,11 +183,22 @@ public class RecalibrationReport {
|
|||
else if ( o instanceof Long )
|
||||
return (Long)o;
|
||||
else
|
||||
throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass());
|
||||
throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but it's not either: " + o.getClass());
|
||||
}
|
||||
|
||||
private long asLong(final Object o) {
|
||||
if ( o instanceof Long )
|
||||
return (Long)o;
|
||||
else if ( o instanceof Integer )
|
||||
return ((Integer)o).longValue();
|
||||
else if ( o instanceof Double )
|
||||
return ((Double)o).longValue();
|
||||
else
|
||||
throw new ReviewedStingException("Object " + o + " is expected to be a long but it's not: " + o.getClass());
|
||||
}
|
||||
|
||||
private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) {
|
||||
final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
|
||||
final long nObservations = asLong(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME));
|
||||
final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME));
|
||||
final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME));
|
||||
|
||||
|
|
|
|||
|
|
@ -30,16 +30,13 @@ package org.broadinstitute.sting.utils.recalibration;
|
|||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class RecalDatumUnitTest extends BaseTest {
|
||||
|
|
@ -74,7 +71,7 @@ public class RecalDatumUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
public RecalDatum makeRecalDatum() {
|
||||
return new RecalDatum(exTotal, exError, (byte)getReportedQual());
|
||||
return new RecalDatum((long)exTotal, (double)exError, (byte)getReportedQual());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -83,13 +80,19 @@ public class RecalDatumUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
private static boolean createdDatumTestProviders = false;
|
||||
|
||||
@DataProvider(name = "RecalDatumTestProvider")
|
||||
public Object[][] makeRecalDatumTestProvider() {
|
||||
for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) )
|
||||
for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) )
|
||||
for ( int reportedQual : Arrays.asList(10, 20) )
|
||||
if ( E <= N )
|
||||
new RecalDatumTestProvider(E, N, reportedQual);
|
||||
if ( !createdDatumTestProviders ) {
|
||||
for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) )
|
||||
for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) )
|
||||
for ( int reportedQual : Arrays.asList(10, 20) )
|
||||
if ( E <= N )
|
||||
new RecalDatumTestProvider(E, N, reportedQual);
|
||||
createdDatumTestProviders = true;
|
||||
}
|
||||
|
||||
return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class);
|
||||
}
|
||||
|
||||
|
|
@ -104,7 +107,6 @@ public class RecalDatumUnitTest extends BaseTest {
|
|||
Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6);
|
||||
if ( cfg.getReportedQual() != -1 )
|
||||
Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual());
|
||||
BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalQuality(), cfg.getErrorRatePhredScaled());
|
||||
BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate());
|
||||
|
||||
final double e = datum.getEmpiricalQuality();
|
||||
|
|
@ -175,7 +177,87 @@ public class RecalDatumUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testNoObs() {
|
||||
final RecalDatum rd = new RecalDatum(0, 0, (byte)10);
|
||||
final RecalDatum rd = new RecalDatum(0L, 0.0, (byte)10);
|
||||
Assert.assertEquals(rd.getEmpiricalErrorRate(), 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testlog10QempPrior() {
|
||||
for ( int Qemp = 0; Qemp <= QualityUtils.MAX_QUAL_SCORE; Qemp++ ) {
|
||||
for ( int Qrep = 0; Qrep <= QualityUtils.MAX_QUAL_SCORE; Qrep++ ) {
|
||||
final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep);
|
||||
Assert.assertTrue(log10prior < 0.0);
|
||||
Assert.assertFalse(Double.isInfinite(log10prior));
|
||||
Assert.assertFalse(Double.isNaN(log10prior));
|
||||
}
|
||||
}
|
||||
|
||||
final int Qrep = 20;
|
||||
int maxQemp = -1;
|
||||
double maxQempValue = -Double.MAX_VALUE;
|
||||
for ( int Qemp = 0; Qemp <= QualityUtils.MAX_QUAL_SCORE; Qemp++ ) {
|
||||
final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep);
|
||||
if ( log10prior > maxQempValue ) {
|
||||
maxQemp = Qemp;
|
||||
maxQempValue = log10prior;
|
||||
}
|
||||
}
|
||||
Assert.assertEquals(maxQemp, Qrep);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testlog10QempLikelihood() {
|
||||
|
||||
final int Qrep = 20;
|
||||
|
||||
// test no shift
|
||||
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(0, 0, Qrep), (double)Qrep);
|
||||
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 0, Qrep), (double)Qrep);
|
||||
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 10, Qrep), (double)Qrep);
|
||||
|
||||
// test small shift
|
||||
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 10, Qrep), Qrep - 1.0);
|
||||
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 0, Qrep), Qrep + 1.0);
|
||||
|
||||
// test medium shift
|
||||
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 0, Qrep), Qrep + 3.0);
|
||||
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 10, Qrep), Qrep + 3.0);
|
||||
|
||||
// test large shift
|
||||
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(100000, 10, Qrep), Qrep + 8.0);
|
||||
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000000, 10, Qrep), Qrep + 16.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBayesianEstimateOfEmpiricalQuality() {
|
||||
|
||||
final double[] Qemps = new double[] { 0.0, 10.0, 20.0, 30.0 };
|
||||
final int[] observations = new int[] {0, 10, 1000, 1000000};
|
||||
final int[] errors = new int[] {0, 10, 1000, 1000000};
|
||||
|
||||
for ( double Qemp : Qemps ) {
|
||||
for ( int observation : observations ) {
|
||||
for ( int error : errors ) {
|
||||
if ( error > observation )
|
||||
continue;
|
||||
|
||||
final double log10likelihood = RecalDatum.log10QempLikelihood(Qemp, observation, error);
|
||||
Assert.assertTrue(observation == 0 ? MathUtils.compareDoubles(log10likelihood, 0.0) == 0 : log10likelihood < 0.0);
|
||||
Assert.assertFalse(Double.isInfinite(log10likelihood));
|
||||
Assert.assertFalse(Double.isNaN(log10likelihood));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLongToInt() {
|
||||
long l = new Long((long)Integer.MAX_VALUE);
|
||||
int i = RecalDatum.longToInt(l);
|
||||
Assert.assertEquals(i, Integer.MAX_VALUE);
|
||||
|
||||
l++;
|
||||
i = RecalDatum.longToInt(l);
|
||||
Assert.assertEquals(i, Integer.MAX_VALUE);
|
||||
}
|
||||
}
|
||||
|
|
@ -146,7 +146,7 @@ public final class RecalUtilsUnitTest extends BaseTest {
|
|||
public NestedIntegerArray<RecalDatum> makeTable(final List<Row> rows) {
|
||||
final NestedIntegerArray<RecalDatum> x = new NestedIntegerArray<RecalDatum>(3, 3);
|
||||
for ( final Row r : rows )
|
||||
x.put(new RecalDatum(r.no, r.ne, (byte)10), r.rg, r.qual);
|
||||
x.put(new RecalDatum((long)r.no, (double)r.ne, (byte)10), r.rg, r.qual);
|
||||
return x;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,9 +20,11 @@ public class RecalibrationReportUnitTest {
|
|||
private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) {
|
||||
final Random random = new Random();
|
||||
final int nObservations = random.nextInt(maxObservations);
|
||||
final int nErrors = random.nextInt(maxErrors);
|
||||
int nErrors = random.nextInt(maxErrors);
|
||||
while ( nErrors > nObservations )
|
||||
nErrors = random.nextInt(maxErrors);
|
||||
final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE);
|
||||
return new RecalDatum(nObservations, nErrors, (byte)qual);
|
||||
return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
|
|
|
|||
Loading…
Reference in New Issue