From dd7f5e2be749b9373cb9ed7dd30b3c98ed34e9b0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 4 Jan 2013 14:43:11 -0500 Subject: [PATCH] Hooking up the Bayesian estimate code for calculating Qemp in BQSR; various fixes after adding unit tests. --- .../walkers/bqsr/BQSRIntegrationTest.java | 32 ++--- .../bqsr/StandardRecalibrationEngine.java | 6 +- .../sting/utils/recalibration/RecalDatum.java | 112 ++++++++++-------- .../utils/recalibration/RecalDatumNode.java | 4 +- .../sting/utils/recalibration/RecalUtils.java | 2 +- .../recalibration/RecalibrationReport.java | 24 ++-- .../recalibration/RecalDatumUnitTest.java | 106 +++++++++++++++-- .../recalibration/RecalUtilsUnitTest.java | 2 +- .../RecalibrationReportUnitTest.java | 6 +- 9 files changed, 199 insertions(+), 95 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 177a989fb..9b6d6f913 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -53,21 +53,21 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "2f250fecb930e0dfe0f63fe0fed3960b")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "26c8d7226139a040557b1d3b1c8792f0")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "9b43a1839cb6ea03aec1d96f15ca8efb")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "3159a9d136c45e4a65d46a23dc8fd3b5")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "bb7262829effbbdbc8d88dd36f480368")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "fbb002fa2b9197c4b555852dccc11562")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "7392acb71131a60a527ca32715fc59be")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "49d4383896a90795d94138db1410a7df")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "427448eff98cf194cc7217c0b1401e79")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "50cd1a10b6ecb3d09f90f1e4a66da95d")}, - {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "1dc71561c9d0fb56f9876cb5043c5376")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "13e8f032e76340b114847c90af0a1f8a")}, - {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "03f58ae4f9d203034e895a3636fc108f")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "49d4383896a90795d94138db1410a7df")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "2db2ef8c2d63e167663d70340182f49a")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "6b3f252718f59cf9fd3f7612f73a35bf")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "863576ac9ff0b0e02f2e84aef15923a7")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "03e28f48201a35c70d1cf48e9f45364f")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "6e3c5635d387a1c428a7c9c88ad26488")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "6507adcb94bacde4cdee9caa9f14f24b")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "399bbb4bf80764dfc644b2f95d824615")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "34d70899253c2b3343ca9ae944291c30")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "e61fa47bfc08433f0cd55558e2081548")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "5c2622c63225b8b04990baf0ae4de07c")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "ee7191d83d7d5bb957dc4595883c32f1")}, + {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "da92f4730356f479c2c2b71497cfac6d")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "8075595113b48c0c7ead08ce41bef9fe")}, + {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "be05834841c5690c66910270521d5c32")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "e61fa47bfc08433f0cd55558e2081548")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "8ee0b498dbbc95ce76393a0f089fec92")}, }; } @@ -104,7 +104,7 @@ public class BQSRIntegrationTest extends WalkerTest { " -sortAllCols" + " --plot_pdf_file /dev/null" + " --intermediate_csv_file %s", - Arrays.asList("d1c38a3418979400630e2bca1140689c")); + Arrays.asList("dd6e0e1e3f53f8ae0c8f5de21ded6ee9")); executeTest("testBQSR-CSVfile", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index a6ab98e8b..4ad8ccdf3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -79,7 +79,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP * @return a new RecalDatum object with the observation and the error */ protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { - return new RecalDatum(1, isError, reportedQual); + return new RecalDatum(1L, isError, reportedQual); } /** @@ -133,12 +133,12 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP if ( ! table.put(createDatumObject(qual, isError), keys) ) { // Failed to put a new item because another thread came along and put an item here first. // Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1.0, isError); + table.get(keys).increment(1, isError); } } else { // Easy case: already an item here, so increment it - existingDatum.increment(1.0, isError); + existingDatum.increment(1, isError); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 9b58a5900..4eb069542 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -68,7 +68,7 @@ public class RecalDatum { /** * number of bases seen in total */ - private double numObservations; + private long numObservations; /** * number of bases seen that didn't match the reference @@ -89,13 +89,13 @@ public class RecalDatum { /** * Create a new RecalDatum with given observation and mismatch counts, and an reported quality * - * @param _numObservations - * @param _numMismatches - * @param reportedQuality + * @param _numObservations observations + * @param _numMismatches mismatches + * @param reportedQuality Qreported */ - public RecalDatum(final double _numObservations, final double _numMismatches, final byte reportedQuality) { + public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) { if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); - if ( _numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); + if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0"); if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0"); numObservations = _numObservations; @@ -106,7 +106,7 @@ public class RecalDatum { /** * Copy copy into this recal datum, overwriting all of this objects data - * @param copy + * @param copy RecalDatum to copy */ public RecalDatum(final RecalDatum copy) { this.numObservations = copy.getNumObservations(); @@ -119,7 +119,7 @@ public class RecalDatum { * Add in all of the data from other into this object, updating the reported quality from the expected * error rate implied by the two reported qualities * - * @param other + * @param other RecalDatum to combine */ public synchronized void combine(final RecalDatum other) { final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); @@ -192,7 +192,7 @@ public class RecalDatum { @Override public String toString() { - return String.format("%.2f,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); + return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); } public String stringForCSV() { @@ -205,11 +205,11 @@ public class RecalDatum { // //--------------------------------------------------------------------------------------------------------------- - public final double getNumObservations() { + public final long getNumObservations() { return numObservations; } - public final synchronized void setNumObservations(final double numObservations) { + public final synchronized void setNumObservations(final long numObservations) { if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); this.numObservations = numObservations; empiricalQuality = UNINITIALIZED; @@ -227,7 +227,7 @@ public class RecalDatum { } @Requires({"by >= 0"}) - public final synchronized void incrementNumObservations(final double by) { + public final synchronized void incrementNumObservations(final long by) { numObservations += by; empiricalQuality = UNINITIALIZED; } @@ -240,7 +240,7 @@ public class RecalDatum { @Requires({"incObservations >= 0", "incMismatches >= 0"}) @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) - public final synchronized void increment(final double incObservations, final double incMismatches) { + public final synchronized void increment(final long incObservations, final double incMismatches) { numObservations += incObservations; numMismatches += incMismatches; empiricalQuality = UNINITIALIZED; @@ -248,7 +248,7 @@ public class RecalDatum { @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"}) public final synchronized void increment(final boolean isError) { - increment(1, isError ? 1 : 0.0); + increment(1, isError ? 1.0 : 0.0); } // ------------------------------------------------------------------------------------- @@ -257,19 +257,6 @@ public class RecalDatum { // // ------------------------------------------------------------------------------------- - /** - * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) - */ - @Requires("empiricalQuality == UNINITIALIZED") - @Ensures("empiricalQuality != UNINITIALIZED") - private synchronized void calcEmpiricalQuality() { - - // TODO -- add code for Bayesian estimate of Qemp here - - final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); - empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); - } - /** * calculate the expected number of errors given the estimated Q reported and the number of observations * in this datum. @@ -281,10 +268,29 @@ public class RecalDatum { return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); } - static final boolean DEBUG = false; - static final double RESOLUTION_BINS_PER_QUAL = 1.0; + /** + * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) + */ + @Requires("empiricalQuality == UNINITIALIZED") + @Ensures("empiricalQuality != UNINITIALIZED") + private synchronized void calcEmpiricalQuality() { - static public double bayesianEstimateOfEmpiricalQuality(final double nObservations, final double nErrors, final double QReported) { + // smoothing is one error and one non-error observation + final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; + final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; + + final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, getEstimatedQReported()); + + // This is the old and busted point estimate approach: + //final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); + + empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); + } + + //static final boolean DEBUG = false; + static private final double RESOLUTION_BINS_PER_QUAL = 1.0; + + static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) { final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; @@ -294,14 +300,14 @@ public class RecalDatum { final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; - log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10Likelihood(QEmpOfBin, nObservations, nErrors); + log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors); - if ( DEBUG ) - System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); + //if ( DEBUG ) + // System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); } - if ( DEBUG ) - System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); + //if ( DEBUG ) + // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); @@ -310,35 +316,47 @@ public class RecalDatum { return Qemp; } - static final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1]; + static private final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1]; static { // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". final double GF_a = 0.0; final double GF_b = 0.9; final double GF_c = 0.0; - final double GF_d = 0.5; + final double GF_d = 0.5; // with these parameters, deltas can shift at most ~20 Q points final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); - for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) - log10QempPriorCache[i] = Math.log10(gaussian.value((double) i)); + for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) { + double log10Prior = Math.log10(gaussian.value((double) i)); + if ( Double.isInfinite(log10Prior) ) + log10Prior = -Double.MAX_VALUE; + log10QempPriorCache[i] = log10Prior; + } } - static public double log10QempPrior(final double Qempirical, final double Qreported) { + static protected double log10QempPrior(final double Qempirical, final double Qreported) { final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), QualityUtils.MAX_GATK_USABLE_Q_SCORE); - if ( DEBUG ) - System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); + //if ( DEBUG ) + // System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); return log10QempPriorCache[difference]; } - static public double log10Likelihood(final double Qempirical, final double nObservations, final double nErrors) { + static protected double log10QempLikelihood(final double Qempirical, final long nObservations, final long nErrors) { + if ( nObservations == 0 ) + return 0.0; + // this is just a straight binomial PDF - double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); - if ( log10Prob == Double.NEGATIVE_INFINITY ) + double log10Prob = MathUtils.log10BinomialProbability(longToInt(nObservations), longToInt(nErrors), QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); + if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) ) log10Prob = -Double.MAX_VALUE; - if ( DEBUG ) - System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + //if ( DEBUG ) + // System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + return log10Prob; } + + static protected int longToInt(final long l) { + return (l > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)l; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java index 6c94c3c42..b8f89ad66 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -264,7 +264,7 @@ public class RecalDatumNode { for ( final RecalDatumNode subnode : subnodes ) { // use the yates correction to help avoid all zeros => NaN counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L; - counts[i][1] = Math.round(subnode.getRecalDatum().getNumObservations()) + 2L; + counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L; i++; } @@ -320,7 +320,7 @@ public class RecalDatumNode { if ( isLeaf() ) { // this is leave node - return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * recalDatum.getNumObservations(); + return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations(); // TODO -- how we can generalize this calculation? // if ( this.qEnd <= minInterestingQual ) // // It's free to merge up quality scores below the smallest interesting one diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index d4e781fdd..9a9a5dfc6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -93,7 +93,7 @@ public class RecalUtils { private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); - private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%.2f"); + private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f"); /** diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 6ecac1394..ac451221e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -68,15 +68,6 @@ public class RecalibrationReport { } - protected RecalibrationReport(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) { - this.quantizationInfo = quantizationInfo; - this.recalibrationTables = recalibrationTables; - this.requestedCovariates = requestedCovariates; - this.argumentTable = argumentTable; - this.RAC = RAC; - this.optionalCovariateIndexes = null; - } - /** * Counts the number of unique read groups in the table * @@ -192,11 +183,22 @@ public class RecalibrationReport { else if ( o instanceof Long ) return (Long)o; else - throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass()); + throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but it's not either: " + o.getClass()); + } + + private long asLong(final Object o) { + if ( o instanceof Long ) + return (Long)o; + else if ( o instanceof Integer ) + return ((Integer)o).longValue(); + else if ( o instanceof Double ) + return ((Double)o).longValue(); + else + throw new ReviewedStingException("Object " + o + " is expected to be a long but it's not: " + o.getClass()); } private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { - final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); + final long nObservations = asLong(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java index 0f0c74362..2305fe566 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java @@ -30,16 +30,13 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; -import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; public class RecalDatumUnitTest extends BaseTest { @@ -74,7 +71,7 @@ public class RecalDatumUnitTest extends BaseTest { } public RecalDatum makeRecalDatum() { - return new RecalDatum(exTotal, exError, (byte)getReportedQual()); + return new RecalDatum((long)exTotal, (double)exError, (byte)getReportedQual()); } @Override @@ -83,13 +80,19 @@ public class RecalDatumUnitTest extends BaseTest { } } + private static boolean createdDatumTestProviders = false; + @DataProvider(name = "RecalDatumTestProvider") public Object[][] makeRecalDatumTestProvider() { - for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) ) - for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) - for ( int reportedQual : Arrays.asList(10, 20) ) - if ( E <= N ) - new RecalDatumTestProvider(E, N, reportedQual); + if ( !createdDatumTestProviders ) { + for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) ) + for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) + for ( int reportedQual : Arrays.asList(10, 20) ) + if ( E <= N ) + new RecalDatumTestProvider(E, N, reportedQual); + createdDatumTestProviders = true; + } + return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class); } @@ -104,7 +107,6 @@ public class RecalDatumUnitTest extends BaseTest { Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6); if ( cfg.getReportedQual() != -1 ) Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual()); - BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalQuality(), cfg.getErrorRatePhredScaled()); BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate()); final double e = datum.getEmpiricalQuality(); @@ -175,7 +177,87 @@ public class RecalDatumUnitTest extends BaseTest { @Test public void testNoObs() { - final RecalDatum rd = new RecalDatum(0, 0, (byte)10); + final RecalDatum rd = new RecalDatum(0L, 0.0, (byte)10); Assert.assertEquals(rd.getEmpiricalErrorRate(), 0.0); } + + @Test + public void testlog10QempPrior() { + for ( int Qemp = 0; Qemp <= QualityUtils.MAX_QUAL_SCORE; Qemp++ ) { + for ( int Qrep = 0; Qrep <= QualityUtils.MAX_QUAL_SCORE; Qrep++ ) { + final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); + Assert.assertTrue(log10prior < 0.0); + Assert.assertFalse(Double.isInfinite(log10prior)); + Assert.assertFalse(Double.isNaN(log10prior)); + } + } + + final int Qrep = 20; + int maxQemp = -1; + double maxQempValue = -Double.MAX_VALUE; + for ( int Qemp = 0; Qemp <= QualityUtils.MAX_QUAL_SCORE; Qemp++ ) { + final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); + if ( log10prior > maxQempValue ) { + maxQemp = Qemp; + maxQempValue = log10prior; + } + } + Assert.assertEquals(maxQemp, Qrep); + } + + @Test + public void testlog10QempLikelihood() { + + final int Qrep = 20; + + // test no shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(0, 0, Qrep), (double)Qrep); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 0, Qrep), (double)Qrep); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 10, Qrep), (double)Qrep); + + // test small shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 10, Qrep), Qrep - 1.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 0, Qrep), Qrep + 1.0); + + // test medium shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 0, Qrep), Qrep + 3.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 10, Qrep), Qrep + 3.0); + + // test large shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(100000, 10, Qrep), Qrep + 8.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000000, 10, Qrep), Qrep + 16.0); + } + + @Test + public void testBayesianEstimateOfEmpiricalQuality() { + + final double[] Qemps = new double[] { 0.0, 10.0, 20.0, 30.0 }; + final int[] observations = new int[] {0, 10, 1000, 1000000}; + final int[] errors = new int[] {0, 10, 1000, 1000000}; + + for ( double Qemp : Qemps ) { + for ( int observation : observations ) { + for ( int error : errors ) { + if ( error > observation ) + continue; + + final double log10likelihood = RecalDatum.log10QempLikelihood(Qemp, observation, error); + Assert.assertTrue(observation == 0 ? MathUtils.compareDoubles(log10likelihood, 0.0) == 0 : log10likelihood < 0.0); + Assert.assertFalse(Double.isInfinite(log10likelihood)); + Assert.assertFalse(Double.isNaN(log10likelihood)); + } + } + } + } + + @Test + public void testLongToInt() { + long l = new Long((long)Integer.MAX_VALUE); + int i = RecalDatum.longToInt(l); + Assert.assertEquals(i, Integer.MAX_VALUE); + + l++; + i = RecalDatum.longToInt(l); + Assert.assertEquals(i, Integer.MAX_VALUE); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java index 500a41e74..6c3b18a92 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java @@ -146,7 +146,7 @@ public final class RecalUtilsUnitTest extends BaseTest { public NestedIntegerArray makeTable(final List rows) { final NestedIntegerArray x = new NestedIntegerArray(3, 3); for ( final Row r : rows ) - x.put(new RecalDatum(r.no, r.ne, (byte)10), r.rg, r.qual); + x.put(new RecalDatum((long)r.no, (double)r.ne, (byte)10), r.rg, r.qual); return x; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index 9707ed078..43c9245d7 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -20,9 +20,11 @@ public class RecalibrationReportUnitTest { private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { final Random random = new Random(); final int nObservations = random.nextInt(maxObservations); - final int nErrors = random.nextInt(maxErrors); + int nErrors = random.nextInt(maxErrors); + while ( nErrors > nObservations ) + nErrors = random.nextInt(maxErrors); final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); - return new RecalDatum(nObservations, nErrors, (byte)qual); + return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); } @Test(enabled = true)