From fbee4c11f1bca8e530e892ae02b29dfd7e978367 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Jan 2013 15:20:25 -0500 Subject: [PATCH 1/8] Unit tests for ProgressMeterData --- .../ProgressMeterDataUnitTest.java | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java new file mode 100644 index 000000000..d6ea3b227 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.progressmeter; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * UnitTests for the ProgressMeterData + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ProgressMeterDataUnitTest extends BaseTest { + @Test + public void testBasic() { + Assert.assertEquals(new ProgressMeterData(1.0, 2, 3).getElapsedSeconds(), 1.0); + Assert.assertEquals(new ProgressMeterData(1.0, 2, 3).getUnitsProcessed(), 2); + Assert.assertEquals(new ProgressMeterData(1.0, 2, 3).getBpProcessed(), 3); + } + + @Test + public void testFraction() { + final double TOL = 1e-3; + Assert.assertEquals(new ProgressMeterData(1.0, 1, 1).calculateFractionGenomeTargetCompleted(10), 0.1, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 2).calculateFractionGenomeTargetCompleted(10), 0.2, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 1).calculateFractionGenomeTargetCompleted(100), 0.01, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 2).calculateFractionGenomeTargetCompleted(100), 0.02, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 1).calculateFractionGenomeTargetCompleted(0), 1.0, TOL); + } + + @Test + public void testSecondsPerBP() { + final double TOL = 1e-3; + final long M = 1000000; + Assert.assertEquals(new ProgressMeterData(1.0, 1, M).secondsPerMillionBP(), 1.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, M/10).secondsPerMillionBP(), 10.0, TOL); + Assert.assertEquals(new ProgressMeterData(2.0, 1, M).secondsPerMillionBP(), 2.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 0).secondsPerMillionBP(), 1e6, TOL); + } + + @Test + public void testSecondsPerElement() { + final double TOL = 1e-3; + final long M = 1000000; + Assert.assertEquals(new ProgressMeterData(1.0, M, 1).secondsPerMillionElements(), 1.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, M/10, 1).secondsPerMillionElements(), 10.0, TOL); + Assert.assertEquals(new ProgressMeterData(2.00, M, 1).secondsPerMillionElements(), 2.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 0, 1).secondsPerMillionElements(), 1e6, TOL); + } +} From 1ba8d47a81c94561feabef7740e2638b670a88ae Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Jan 2013 16:12:08 -0500 Subject: [PATCH 2/8] Unit tests for ProgressMeterDaemon --- .../utils/progressmeter/ProgressMeter.java | 13 ++- .../progressmeter/ProgressMeterDaemon.java | 31 +++++- .../ProgressMeterDaemonUnitTest.java | 102 ++++++++++++++++++ 3 files changed, 142 insertions(+), 4 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java index 161335957..c9d849227 100755 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java @@ -160,6 +160,13 @@ public class ProgressMeter { public ProgressMeter(final File performanceLogFile, final String processingUnitName, final GenomeLocSortedSet processingIntervals) { + this(performanceLogFile, processingUnitName, processingIntervals, ProgressMeterDaemon.DEFAULT_POLL_FREQUENCY_MILLISECONDS); + } + + protected ProgressMeter(final File performanceLogFile, + final String processingUnitName, + final GenomeLocSortedSet processingIntervals, + final long pollingFrequency) { if ( processingUnitName == null ) throw new IllegalArgumentException("processingUnitName cannot be null"); if ( processingIntervals == null ) throw new IllegalArgumentException("Target intervals cannot be null"); @@ -184,10 +191,14 @@ public class ProgressMeter { targetSizeInBP = processingIntervals.coveredSize(); // start up the timer - progressMeterDaemon = new ProgressMeterDaemon(this); + progressMeterDaemon = new ProgressMeterDaemon(this, pollingFrequency); start(); } + public ProgressMeterDaemon getProgressMeterDaemon() { + return progressMeterDaemon; + } + /** * Start up the progress meter, printing initialization message and starting up the * daemon thread for periodic printing. diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java index 16887400a..7edd8e724 100644 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java @@ -8,10 +8,20 @@ package org.broadinstitute.sting.utils.progressmeter; * Time: 9:16 PM */ public final class ProgressMeterDaemon extends Thread { + public final static long DEFAULT_POLL_FREQUENCY_MILLISECONDS = 10 * 1000; + /** * How frequently should we poll and print progress? */ - private final static long POLL_FREQUENCY_MILLISECONDS = 10 * 1000; + private final long pollFrequencyMilliseconds; + + /** + * How long are we waiting between print progress calls are issued? + * @return the time in milliseconds between progress meter calls + */ + private long getPollFrequencyMilliseconds() { + return pollFrequencyMilliseconds; + } /** * Are we to continue periodically printing status, or should we shut down? @@ -27,13 +37,20 @@ public final class ProgressMeterDaemon extends Thread { * Create a new ProgressMeterDaemon printing progress for meter * @param meter the progress meter to print progress of */ - public ProgressMeterDaemon(final ProgressMeter meter) { + public ProgressMeterDaemon(final ProgressMeter meter, final long pollFrequencyMilliseconds) { if ( meter == null ) throw new IllegalArgumentException("meter cannot be null"); + if ( pollFrequencyMilliseconds <= 0 ) throw new IllegalArgumentException("pollFrequencyMilliseconds must be greater than 0 but got " + pollFrequencyMilliseconds); + this.meter = meter; + this.pollFrequencyMilliseconds = pollFrequencyMilliseconds; setDaemon(true); setName("ProgressMeterDaemon"); } + public ProgressMeterDaemon(final ProgressMeter meter) { + this(meter, DEFAULT_POLL_FREQUENCY_MILLISECONDS); + } + /** * Tells this daemon thread to shutdown at the next opportunity, as the progress * metering is complete. @@ -42,6 +59,14 @@ public final class ProgressMeterDaemon extends Thread { this.done = true; } + /** + * Is this daemon thread done? + * @return true if done, false otherwise + */ + public boolean isDone() { + return done; + } + /** * Start up the ProgressMeterDaemon, polling every tens of seconds to print, if * necessary, the provided progress meter. Never exits until the JVM is complete, @@ -51,7 +76,7 @@ public final class ProgressMeterDaemon extends Thread { while (! done) { meter.printProgress(false); try { - Thread.sleep(POLL_FREQUENCY_MILLISECONDS); + Thread.sleep(getPollFrequencyMilliseconds()); } catch (InterruptedException e) { throw new RuntimeException(e); } diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java new file mode 100644 index 000000000..420db683e --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.progressmeter; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * UnitTests for the ProgressMeterDaemon + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ProgressMeterDaemonUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); + } + + // capture and count calls to progress + private class TestingProgressMeter extends ProgressMeter { + final List progressCalls = new LinkedList(); + + private TestingProgressMeter(final long poll) { + super(null, "test", new GenomeLocSortedSet(genomeLocParser), poll); + } + + @Override + protected synchronized void printProgress(boolean mustPrint) { + progressCalls.add(System.currentTimeMillis()); + } + } + + @DataProvider(name = "PollingData") + public Object[][] makePollingData() { + List tests = new ArrayList(); + for ( final int ticks : Arrays.asList(1, 5, 10) ) { + for ( final int poll : Arrays.asList(10, 100) ) { + tests.add(new Object[]{poll, ticks}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PollingData", invocationCount = 10, successPercentage = 90) + public void testMe(final long poll, final int ticks) throws InterruptedException { + final TestingProgressMeter meter = new TestingProgressMeter(poll); + final ProgressMeterDaemon daemon = meter.getProgressMeterDaemon(); + Assert.assertTrue(daemon.isDaemon()); + + Assert.assertFalse(daemon.isDone()); + Thread.sleep(ticks * poll); + Assert.assertFalse(daemon.isDone()); + + daemon.done(); + Assert.assertTrue(daemon.isDone()); + + Assert.assertEquals(meter.progressCalls.size(), ticks, + "Expected " + ticks + " progress calls from daemon thread, but only got " + meter.progressCalls.size() + " with exact calls " + meter.progressCalls); + } +} From 7df47418d839182cabd90839cea94da400d1d8eb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 3 Jan 2013 14:43:36 -0500 Subject: [PATCH 3/8] BQSR optimization: make RecalibrationTables thread-local, and merge results in onTraversalDone -- With the newer, faster BQSR, scaling was limited by the NestedIntegerArray. The solution to this is to make the entire table thread-local, so that each nct thread has its own data and doesn't have any collisions. -- Removed the previous partial solution of having a thread-local quality score table -- Added a new argument -lowMemory --- .../bqsr/AdvancedRecalibrationEngine.java | 42 +------- .../gatk/walkers/bqsr/BaseRecalibrator.java | 31 +++--- .../walkers/bqsr/RecalibrationEngine.java | 11 ++- .../bqsr/StandardRecalibrationEngine.java | 99 ++++++++++++++++--- .../recalibration/RecalibrationTables.java | 16 +-- 5 files changed, 130 insertions(+), 69 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index 255f1fd05..3871101eb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -25,35 +25,21 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. */ -import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.recalibration.RecalDatum; +import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.LinkedList; -import java.util.List; - public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { - private final static Logger logger = Logger.getLogger(AdvancedRecalibrationEngine.class); - - final List> allThreadLocalQualityScoreTables = new LinkedList>(); - private ThreadLocal> threadLocalQualityScoreTables = new ThreadLocal>() { - @Override - protected synchronized NestedIntegerArray initialValue() { - final NestedIntegerArray table = recalibrationTables.makeQualityScoreTable(); - allThreadLocalQualityScoreTables.add(table); - return table; - } - }; - @Override public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { final GATKSAMRecord read = recalInfo.getRead(); final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); - final NestedIntegerArray qualityScoreTable = getThreadLocalQualityScoreTable(); + final RecalibrationTables tables = getRecalibrationTables(); + final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); for( int offset = 0; offset < read.getReadBases().length; offset++ ) { if( ! recalInfo.skip(offset) ) { @@ -70,30 +56,10 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp if (keys[i] < 0) continue; - incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } } } } - - /** - * Get a NestedIntegerArray for a QualityScore table specific to this thread - * @return a non-null NestedIntegerArray ready to be used to collect calibration info for the quality score covariate - */ - private NestedIntegerArray getThreadLocalQualityScoreTable() { - return threadLocalQualityScoreTables.get(); - } - - @Override - public void finalizeData() { - // merge in all of the thread local tables - logger.info("Merging " + allThreadLocalQualityScoreTables.size() + " thread-local quality score tables"); - for ( final NestedIntegerArray localTable : allThreadLocalQualityScoreTables ) { - recalibrationTables.combineQualityScoreTable(localTable); - } - allThreadLocalQualityScoreTables.clear(); // cleanup after ourselves - - super.finalizeData(); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 7692c58e2..ffcfd6233 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -120,9 +120,16 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) public double BAQGOP = BAQ.DEFAULT_GOP; - private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization + /** + * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency + * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag + * to safely access only one table. There may be some CPU cost, but as long as the table is really big + * there should be relatively little CPU costs. + */ + @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) + public boolean lowMemoryMode = false; - private RecalibrationTables recalibrationTables; + private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) @@ -130,8 +137,6 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche private int minimumQToUse; - protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ - private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."; private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector @@ -143,7 +148,6 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche * Based on the covariates' estimates for initial capacity allocate the data hashmap */ public void initialize() { - baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty // check for unsupported access @@ -188,10 +192,11 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche int numReadGroups = 0; for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) numReadGroups += header.getReadGroups().size(); - recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); recalibrationEngine = initializeRecalibrationEngine(); - recalibrationEngine.initialize(requestedCovariates, recalibrationTables); + recalibrationEngine.initialize(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); + if ( lowMemoryMode ) + recalibrationEngine.enableLowMemoryMode(); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; referenceReader = getToolkit().getReferenceDataSource().getReference(); @@ -501,14 +506,18 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche logger.info("Processed: " + result + " reads"); } + private RecalibrationTables getRecalibrationTable() { + return recalibrationEngine.getFinalRecalibrationTables(); + } + private void generatePlots() { File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE; if (recalFile != null) { RecalibrationReport report = new RecalibrationReport(recalFile); - RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), recalibrationTables, requestedCovariates); + RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), getRecalibrationTable(), requestedCovariates); } else - RecalUtils.generateRecalibrationPlot(RAC, recalibrationTables, requestedCovariates); + RecalUtils.generateRecalibrationPlot(RAC, getRecalibrationTable(), requestedCovariates); } /** @@ -517,10 +526,10 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche * generate a quantization map (recalibrated_qual -> quantized_qual) */ private void quantizeQualityScores() { - quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); + quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS); } private void generateReport() { - RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); + RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 5c002b7e5..6c3189be5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -5,6 +5,8 @@ import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; + /* * Copyright (c) 2009 The Broad Institute * @@ -40,9 +42,10 @@ public interface RecalibrationEngine { * The engine should collect match and mismatch data into the recalibrationTables data. * * @param covariates an array of the covariates we'll be using in this engine, order matters - * @param recalibrationTables the destination recalibrationTables where stats should be collected + * @param numReadGroups the number of read groups we should use for the recalibration tables + * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables */ - public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables); + public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream); /** * Update the recalibration statistics using the information in recalInfo @@ -57,4 +60,8 @@ public interface RecalibrationEngine { * Called once after all calls to updateDataForRead have been issued. */ public void finalizeData(); + + public void enableLowMemoryMode(); + + public RecalibrationTables getFinalRecalibrationTables(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index a6ab98e8b..0cd042eeb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -25,26 +25,64 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. */ +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.classloader.PublicPackageSource; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.recalibration.*; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; +import java.util.LinkedList; +import java.util.List; + public class StandardRecalibrationEngine implements RecalibrationEngine, PublicPackageSource { + private static final Logger logger = Logger.getLogger(StandardRecalibrationEngine.class); protected Covariate[] covariates; - protected RecalibrationTables recalibrationTables; + private int numReadGroups; + private PrintStream maybeLogStream; + private boolean lowMemoryMode = false; + + private boolean finalized = false; + private RecalibrationTables mergedRecalibrationTables = null; + + private final List recalibrationTablesList = new LinkedList(); + + private final ThreadLocal threadLocalTables = new ThreadLocal() { + private synchronized RecalibrationTables makeAndCaptureTable() { + logger.info("Creating RecalibrationTable for " + Thread.currentThread()); + final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); + recalibrationTablesList.add(newTable); + return newTable; + } + + @Override + protected synchronized RecalibrationTables initialValue() { + if ( lowMemoryMode ) { + return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0); + } else { + return makeAndCaptureTable(); + } + } + }; + + protected RecalibrationTables getRecalibrationTables() { + return threadLocalTables.get(); + } + + public void enableLowMemoryMode() { + this.lowMemoryMode = true; + } @Override - public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { + public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream) { if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); - if ( recalibrationTables == null ) throw new IllegalArgumentException("recalibrationTables cannot be null"); + if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); this.covariates = covariates.clone(); - this.recalibrationTables = recalibrationTables; + this.numReadGroups = numReadGroups; + this.maybeLogStream = maybeLogStream; } @Override @@ -59,13 +97,13 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP final double isError = recalInfo.getErrorFraction(eventType, offset); final int[] keys = readCovariates.getKeySet(offset, eventType); - incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); + incrementDatumOrPutIfNecessary(getRecalibrationTables().getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); + incrementDatumOrPutIfNecessary(getRecalibrationTables().getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); } } } @@ -90,8 +128,13 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP */ @Override public void finalizeData() { - final NestedIntegerArray byReadGroupTable = recalibrationTables.getReadGroupTable(); - final NestedIntegerArray byQualTable = recalibrationTables.getQualityScoreTable(); + if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); + + // merge all of the thread-local tables + mergedRecalibrationTables = mergeThreadLocalRecalibrationTables(); + + final NestedIntegerArray byReadGroupTable = mergedRecalibrationTables.getReadGroupTable(); + final NestedIntegerArray byQualTable = mergedRecalibrationTables.getQualityScoreTable(); // iterate over all values in the qual table for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { @@ -108,6 +151,38 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP rgDatum.combine(qualDatum); } } + + finalized = true; + } + + /** + * Merge all of the thread local recalibration tables into a single one. + * + * Reuses one of the recalibration tables to hold the merged table, so this function can only be + * called once in the engine. + * + * @return the merged recalibration table + */ + @Requires("! finalized") + private RecalibrationTables mergeThreadLocalRecalibrationTables() { + if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); + + RecalibrationTables merged = null; + for ( final RecalibrationTables table : recalibrationTablesList ) { + if ( merged == null ) + // fast path -- if there's only only one table, so just make it the merged one + merged = table; + else { + merged.combine(table); + } + } + + return merged; + } + + public RecalibrationTables getFinalRecalibrationTables() { + if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); + return mergedRecalibrationTables; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index 3f968d7f6..a6b1e13b9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -123,12 +123,16 @@ public final class RecalibrationTables { } /** - * Merge in the quality score table information from qualityScoreTable into this - * recalibration table's quality score table. - * - * @param qualityScoreTable the quality score table we want to merge in + * Merge all of the tables from toMerge into into this set of tables */ - public void combineQualityScoreTable(final NestedIntegerArray qualityScoreTable) { - RecalUtils.combineTables(getQualityScoreTable(), qualityScoreTable); + public void combine(final RecalibrationTables toMerge) { + if ( numTables() != toMerge.numTables() ) + throw new IllegalArgumentException("Attempting to merge RecalibrationTables with different sizes"); + + for ( int i = 0; i < numTables(); i++ ) { + final NestedIntegerArray myTable = this.getTable(i); + final NestedIntegerArray otherTable = toMerge.getTable(i); + RecalUtils.combineTables(myTable, otherTable); + } } } From bbdf9ee91b895d353260f2b5219d1b3d6c3c3ce4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 3 Jan 2013 16:47:45 -0500 Subject: [PATCH 4/8] BQSR cleanup: merge Advanced and Standard recalibration engine into just the RecalibrationEngine -- As we are no longer maintaining a public/protected system we need only have one RecalibrationEngine. -- Misc. code cleanup and docs along the way --- .../bqsr/AdvancedRecalibrationEngine.java | 65 ----- .../gatk/walkers/bqsr/BaseRecalibrator.java | 51 ++-- .../walkers/bqsr/RecalibrationEngine.java | 254 +++++++++++++++--- .../bqsr/StandardRecalibrationEngine.java | 219 --------------- 4 files changed, 249 insertions(+), 340 deletions(-) delete mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java deleted file mode 100644 index 3871101eb..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { - @Override - public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { - final GATKSAMRecord read = recalInfo.getRead(); - final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); - final RecalibrationTables tables = getRecalibrationTables(); - final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); - - for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - if( ! recalInfo.skip(offset) ) { - - for (final EventType eventType : EventType.values()) { - final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; - final byte qual = recalInfo.getQual(eventType, offset); - final double isError = recalInfo.getErrorFraction(eventType, offset); - - incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - - incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); - } - } - } - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index ffcfd6233..2c774d94d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -113,12 +113,11 @@ import java.util.List; @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) @PartitionBy(PartitionType.READ) public class BaseRecalibrator extends ReadWalker implements NanoSchedulable { + /** + * all the command line arguments for BQSR and it's covariates + */ @ArgumentCollection - private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates - - @Advanced - @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) - public double BAQGOP = BAQ.DEFAULT_GOP; + private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); /** * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency @@ -129,9 +128,19 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) public boolean lowMemoryMode = false; - private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization + @Advanced + @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) + public double BAQGOP = BAQ.DEFAULT_GOP; - private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) + /** + * an object that keeps track of the information necessary for quality score quantization + */ + private QuantizationInfo quantizationInfo; + + /** + * list to hold the all the covariate objects that were requested (required + standard + experimental) + */ + private Covariate[] requestedCovariates; private RecalibrationEngine recalibrationEngine; @@ -189,30 +198,20 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e); } - int numReadGroups = 0; - for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) - numReadGroups += header.getReadGroups().size(); - - recalibrationEngine = initializeRecalibrationEngine(); - recalibrationEngine.initialize(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); - if ( lowMemoryMode ) - recalibrationEngine.enableLowMemoryMode(); - + initializeRecalibrationEngine(); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; referenceReader = getToolkit().getReferenceDataSource().getReference(); } - private RecalibrationEngine initializeRecalibrationEngine() { + /** + * Initialize the recalibration engine + */ + private void initializeRecalibrationEngine() { + int numReadGroups = 0; + for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) + numReadGroups += header.getReadGroups().size(); - final Class recalibrationEngineClass = GATKLiteUtils.getProtectedClassIfAvailable(RecalibrationEngine.class); - try { - final Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null); - constructor.setAccessible(true); - return (RecalibrationEngine)constructor.newInstance(); - } - catch (Exception e) { - throw new ReviewedStingException("Unable to create RecalibrationEngine class instance " + recalibrationEngineClass.getSimpleName()); - } + recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode); } private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 6c3189be5..c6d5cddb9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -1,37 +1,90 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.bqsr; import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; +import java.util.LinkedList; +import java.util.List; + +public class RecalibrationEngine { + final protected Covariate[] covariates; + final private int numReadGroups; + final private PrintStream maybeLogStream; + final private boolean lowMemoryMode; + + /** + * Has finalizeData() been called? + */ + private boolean finalized = false; + + /** + * The final (merged, etc) recalibration tables, suitable for downstream analysis. + */ + private RecalibrationTables finalRecalibrationTables = null; + + private final List recalibrationTablesList = new LinkedList(); + + private final ThreadLocal threadLocalTables = new ThreadLocal() { + private synchronized RecalibrationTables makeAndCaptureTable() { + final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); + recalibrationTablesList.add(newTable); + return newTable; + } + + @Override + protected synchronized RecalibrationTables initialValue() { + if ( lowMemoryMode ) { + return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0); + } else { + return makeAndCaptureTable(); + } + } + }; + + /** + * Get a recalibration table suitable for updating the underlying RecalDatums + * + * May return a thread-local version, or a single version, depending on the initialization + * arguments of this instance. + * + * @return + */ + protected RecalibrationTables getUpdatableRecalibrationTables() { + return threadLocalTables.get(); + } -/* -* Copyright (c) 2009 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE. -*/ -public interface RecalibrationEngine { /** * Initialize the recalibration engine * @@ -45,23 +98,164 @@ public interface RecalibrationEngine { * @param numReadGroups the number of read groups we should use for the recalibration tables * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables */ - public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream); + public RecalibrationEngine(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream, final boolean enableLowMemoryMode) { + if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); + if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); + + this.covariates = covariates.clone(); + this.numReadGroups = numReadGroups; + this.maybeLogStream = maybeLogStream; + this.lowMemoryMode = enableLowMemoryMode; + } /** * Update the recalibration statistics using the information in recalInfo * @param recalInfo data structure holding information about the recalibration values for a single read */ @Requires("recalInfo != null") - public void updateDataForRead(final ReadRecalibrationInfo recalInfo); + public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { + final GATKSAMRecord read = recalInfo.getRead(); + final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); + final RecalibrationTables tables = getUpdatableRecalibrationTables(); + final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); + + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { + if( ! recalInfo.skip(offset) ) { + + for (final EventType eventType : EventType.values()) { + final int[] keys = readCovariates.getKeySet(offset, eventType); + final int eventIndex = eventType.index; + final byte qual = recalInfo.getQual(eventType, offset); + final double isError = recalInfo.getErrorFraction(eventType, offset); + + incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); + + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + + incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + } + } + } + } + } + + /** + * creates a datum object with one observation and one or zero error + * + * @param reportedQual the quality score reported by the instrument for this base + * @param isError whether or not the observation is an error + * @return a new RecalDatum object with the observation and the error + */ + protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { + return new RecalDatum(1, isError, reportedQual); + } /** * Finalize, if appropriate, all derived data in recalibrationTables. * * Called once after all calls to updateDataForRead have been issued. + * + * Assumes that all of the principal tables (by quality score) have been completely updated, + * and walks over this data to create summary data tables like by read group table. */ - public void finalizeData(); + public void finalizeData() { + if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); - public void enableLowMemoryMode(); + // merge all of the thread-local tables + finalRecalibrationTables = mergeThreadLocalRecalibrationTables(); - public RecalibrationTables getFinalRecalibrationTables(); + final NestedIntegerArray byReadGroupTable = finalRecalibrationTables.getReadGroupTable(); + final NestedIntegerArray byQualTable = finalRecalibrationTables.getQualityScoreTable(); + + // iterate over all values in the qual table + for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { + final int rgKey = leaf.keys[0]; + final int eventIndex = leaf.keys[2]; + final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex); + final RecalDatum qualDatum = leaf.value; + + if ( rgDatum == null ) { + // create a copy of qualDatum, and initialize byReadGroup table with it + byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex); + } else { + // combine the qual datum with the existing datum in the byReadGroup table + rgDatum.combine(qualDatum); + } + } + + finalized = true; + } + + /** + * Merge all of the thread local recalibration tables into a single one. + * + * Reuses one of the recalibration tables to hold the merged table, so this function can only be + * called once in the engine. + * + * @return the merged recalibration table + */ + @Requires("! finalized") + private RecalibrationTables mergeThreadLocalRecalibrationTables() { + if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); + + RecalibrationTables merged = null; + for ( final RecalibrationTables table : recalibrationTablesList ) { + if ( merged == null ) + // fast path -- if there's only only one table, so just make it the merged one + merged = table; + else { + merged.combine(table); + } + } + + return merged; + } + + /** + * Get the final recalibration tables, after finalizeData() has been called + * + * This returns the finalized recalibration table collected by this engine. + * + * It is an error to call this function before finalizeData has been called + * + * @return the finalized recalibration table collected by this engine + */ + public RecalibrationTables getFinalRecalibrationTables() { + if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); + return finalRecalibrationTables; + } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, + final byte qual, + final double isError, + final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1.0, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1.0, isError); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java deleted file mode 100644 index 0cd042eeb..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ /dev/null @@ -1,219 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.classloader.PublicPackageSource; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.*; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.io.PrintStream; -import java.util.LinkedList; -import java.util.List; - -public class StandardRecalibrationEngine implements RecalibrationEngine, PublicPackageSource { - private static final Logger logger = Logger.getLogger(StandardRecalibrationEngine.class); - protected Covariate[] covariates; - private int numReadGroups; - private PrintStream maybeLogStream; - private boolean lowMemoryMode = false; - - private boolean finalized = false; - private RecalibrationTables mergedRecalibrationTables = null; - - private final List recalibrationTablesList = new LinkedList(); - - private final ThreadLocal threadLocalTables = new ThreadLocal() { - private synchronized RecalibrationTables makeAndCaptureTable() { - logger.info("Creating RecalibrationTable for " + Thread.currentThread()); - final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); - recalibrationTablesList.add(newTable); - return newTable; - } - - @Override - protected synchronized RecalibrationTables initialValue() { - if ( lowMemoryMode ) { - return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0); - } else { - return makeAndCaptureTable(); - } - } - }; - - protected RecalibrationTables getRecalibrationTables() { - return threadLocalTables.get(); - } - - public void enableLowMemoryMode() { - this.lowMemoryMode = true; - } - - @Override - public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream) { - if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); - if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); - - this.covariates = covariates.clone(); - this.numReadGroups = numReadGroups; - this.maybeLogStream = maybeLogStream; - } - - @Override - public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { - final GATKSAMRecord read = recalInfo.getRead(); - final EventType eventType = EventType.BASE_SUBSTITUTION; - final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); - - for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - if( ! recalInfo.skip(offset) ) { - final byte qual = recalInfo.getQual(eventType, offset); - final double isError = recalInfo.getErrorFraction(eventType, offset); - final int[] keys = readCovariates.getKeySet(offset, eventType); - - incrementDatumOrPutIfNecessary(getRecalibrationTables().getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - - incrementDatumOrPutIfNecessary(getRecalibrationTables().getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); - } - } - } - } - - /** - * creates a datum object with one observation and one or zero error - * - * @param reportedQual the quality score reported by the instrument for this base - * @param isError whether or not the observation is an error - * @return a new RecalDatum object with the observation and the error - */ - protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { - return new RecalDatum(1, isError, reportedQual); - } - - /** - * Create derived recalibration data tables - * - * Assumes that all of the principal tables (by quality score) have been completely updated, - * and walks over this data to create summary data tables like by read group table. - */ - @Override - public void finalizeData() { - if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); - - // merge all of the thread-local tables - mergedRecalibrationTables = mergeThreadLocalRecalibrationTables(); - - final NestedIntegerArray byReadGroupTable = mergedRecalibrationTables.getReadGroupTable(); - final NestedIntegerArray byQualTable = mergedRecalibrationTables.getQualityScoreTable(); - - // iterate over all values in the qual table - for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { - final int rgKey = leaf.keys[0]; - final int eventIndex = leaf.keys[2]; - final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex); - final RecalDatum qualDatum = leaf.value; - - if ( rgDatum == null ) { - // create a copy of qualDatum, and initialize byReadGroup table with it - byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex); - } else { - // combine the qual datum with the existing datum in the byReadGroup table - rgDatum.combine(qualDatum); - } - } - - finalized = true; - } - - /** - * Merge all of the thread local recalibration tables into a single one. - * - * Reuses one of the recalibration tables to hold the merged table, so this function can only be - * called once in the engine. - * - * @return the merged recalibration table - */ - @Requires("! finalized") - private RecalibrationTables mergeThreadLocalRecalibrationTables() { - if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); - - RecalibrationTables merged = null; - for ( final RecalibrationTables table : recalibrationTablesList ) { - if ( merged == null ) - // fast path -- if there's only only one table, so just make it the merged one - merged = table; - else { - merged.combine(table); - } - } - - return merged; - } - - public RecalibrationTables getFinalRecalibrationTables() { - if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); - return mergedRecalibrationTables; - } - - /** - * Increments the RecalDatum at the specified position in the specified table, or put a new item there - * if there isn't already one. - * - * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() - * to return false if another thread inserts a new item at our position in the middle of our put operation. - * - * @param table the table that holds/will hold our item - * @param qual qual for this event - * @param isError error value for this event - * @param keys location in table of our item - */ - protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, - final byte qual, - final double isError, - final int... keys ) { - final RecalDatum existingDatum = table.get(keys); - - if ( existingDatum == null ) { - // No existing item, try to put a new one - if ( ! table.put(createDatumObject(qual, isError), keys) ) { - // Failed to put a new item because another thread came along and put an item here first. - // Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1.0, isError); - } - } - else { - // Easy case: already an item here, so increment it - existingDatum.increment(1.0, isError); - } - } -} From a5901cdd2037833568fa13e38f74952e261cc80a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 4 Jan 2013 10:54:02 -0500 Subject: [PATCH 6/8] Bugfix for printProgress in TraverseReadsNano -- Must provide a single bp position (1:10) not the range of the read (1:1-50). ProgressMeter now checks at runtime for this problem as well. --- .../sting/gatk/traversals/TraverseReadsNano.java | 3 ++- .../sting/utils/progressmeter/ProgressMeter.java | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index ee71d82bb..aa33def62 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -65,7 +65,8 @@ public class TraverseReadsNano extends TraversalEngine, @Override public void progress(MapData lastProcessedMap) { if ( lastProcessedMap.refContext != null ) - printProgress(lastProcessedMap.refContext.getLocus()); + // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon + printProgress(lastProcessedMap.refContext.getLocus().getStopLocation()); } }); } diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java index c9d849227..b36326722 100755 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java @@ -234,11 +234,13 @@ public class ProgressMeter { * the progress itself. A separate printing daemon periodically polls the meter to print out * progress * - * @param loc Current location, can be null if you are at the end of the processing unit + * @param loc Current location, can be null if you are at the end of the processing unit. Must + * have size == 1 (cannot be multiple bases in size). * @param nTotalRecordsProcessed the total number of records we've processed */ public synchronized void notifyOfProgress(final GenomeLoc loc, final long nTotalRecordsProcessed) { if ( nTotalRecordsProcessed < 0 ) throw new IllegalArgumentException("nTotalRecordsProcessed must be >= 0"); + if ( loc.size() != 1 ) throw new IllegalArgumentException("GenomeLoc must have size == 1 but got " + loc); // weird comparison to ensure that loc == null (in unmapped reads) is keep before maxGenomeLoc == null (on startup) this.maxGenomeLoc = loc == null ? loc : (maxGenomeLoc == null ? loc : loc.max(maxGenomeLoc)); From 810e2da1d47b973f93be2102de2205310ea98a37 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 4 Jan 2013 11:38:58 -0500 Subject: [PATCH 7/8] Cleanup and unit tests for EventType and ReadRecalibrationInfo in BQSR -- Added unit tests for EventType and ReadRecalibrationInfo -- Simplified interface of EventType. Previously this enum carried an index with it, but this is redundant with the enum.ordinal function. Now just using that function instead. --- .../walkers/bqsr/ReadRecalibrationInfo.java | 7 +- .../walkers/bqsr/RecalibrationEngine.java | 2 +- .../recalibration/BaseRecalibration.java | 10 +- .../sting/utils/recalibration/EventType.java | 42 ++++--- .../utils/recalibration/ReadCovariates.java | 24 ++-- .../recalibration/RecalibrationReport.java | 6 +- .../bqsr/ReadRecalibrationInfoUnitTest.java | 110 ++++++++++++++++++ .../recalibration/EventTypeUnitTest.java | 61 ++++++++++ .../RecalibrationReportUnitTest.java | 8 +- 9 files changed, 216 insertions(+), 54 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java index 121e3449b..b884b89db 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java @@ -58,7 +58,8 @@ public final class ReadRecalibrationInfo { if ( covariates == null ) throw new IllegalArgumentException("covariates cannot be null"); if ( skips == null ) throw new IllegalArgumentException("skips cannot be null"); if ( snpErrors == null ) throw new IllegalArgumentException("snpErrors cannot be null"); - // future: may allow insertionErrors && deletionErrors to be null, so don't enforce + if ( insertionErrors == null ) throw new IllegalArgumentException("insertionErrors cannot be null"); + if ( deletionErrors == null ) throw new IllegalArgumentException("deletionErrors cannot be null"); this.read = read; this.baseQuals = read.getBaseQualities(); @@ -73,8 +74,8 @@ public final class ReadRecalibrationInfo { if ( skips.length != length ) throw new IllegalArgumentException("skips.length " + snpErrors.length + " != length " + length); if ( snpErrors.length != length ) throw new IllegalArgumentException("snpErrors.length " + snpErrors.length + " != length " + length); - if ( insertionErrors != null && insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length); - if ( deletionErrors != null && deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length); + if ( insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length); + if ( deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index c6d5cddb9..910519031 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -124,7 +124,7 @@ public class RecalibrationEngine { for (final EventType eventType : EventType.values()) { final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; + final int eventIndex = eventType.ordinal(); final byte qual = recalInfo.getQual(eventType, offset); final double isError = recalInfo.getErrorFraction(eventType, offset); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 3e0f36799..43c9bd2b5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -198,7 +198,7 @@ public class BaseRecalibration { } private double getGlobalDeltaQ(final int rgKey, final EventType errorModel) { - final Double cached = globalDeltaQs.get(rgKey, errorModel.index); + final Double cached = globalDeltaQs.get(rgKey, errorModel.ordinal()); if ( TEST_CACHING ) { final double calcd = calculateGlobalDeltaQ(rgKey, errorModel); @@ -210,7 +210,7 @@ public class BaseRecalibration { } private double getDeltaQReported(final int rgKey, final int qualKey, final EventType errorModel, final double globalDeltaQ) { - final Double cached = deltaQReporteds.get(rgKey, qualKey, errorModel.index); + final Double cached = deltaQReporteds.get(rgKey, qualKey, errorModel.ordinal()); if ( TEST_CACHING ) { final double calcd = calculateDeltaQReported(rgKey, qualKey, errorModel, globalDeltaQ, (byte)qualKey); @@ -240,7 +240,7 @@ public class BaseRecalibration { private double calculateGlobalDeltaQ(final int rgKey, final EventType errorModel) { double result = 0.0; - final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.index); + final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); if (empiricalQualRG != null) { final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality(); @@ -254,7 +254,7 @@ public class BaseRecalibration { private double calculateDeltaQReported(final int rgKey, final int qualKey, final EventType errorModel, final double globalDeltaQ, final byte qualFromRead) { double result = 0.0; - final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(rgKey, qualKey, errorModel.index); + final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(rgKey, qualKey, errorModel.ordinal()); if (empiricalQualQS != null) { final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality(); result = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; @@ -287,7 +287,7 @@ public class BaseRecalibration { final double globalDeltaQ, final double deltaQReported, final byte qualFromRead) { - final RecalDatum empiricalQualCO = table.get(rgKey, qualKey, tableKey, errorModel.index); + final RecalDatum empiricalQualCO = table.get(rgKey, qualKey, tableKey, errorModel.ordinal()); if (empiricalQualCO != null) { final double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality(); return deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java index 1c84518eb..63f873892 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java @@ -1,41 +1,39 @@ package org.broadinstitute.sting.utils.recalibration; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - public enum EventType { - BASE_SUBSTITUTION(0, "M", "Base Substitution"), - BASE_INSERTION(1, "I", "Base Insertion"), - BASE_DELETION(2, "D", "Base Deletion"); + BASE_SUBSTITUTION("M", "Base Substitution"), + BASE_INSERTION("I", "Base Insertion"), + BASE_DELETION("D", "Base Deletion"); - public final int index; private final String representation; private final String longRepresentation; - private EventType(int index, String representation, String longRepresentation) { - this.index = index; + private EventType(String representation, String longRepresentation) { this.representation = representation; this.longRepresentation = longRepresentation; } + /** + * Get the EventType corresponding to its ordinal index + * @param index an ordinal index + * @return the event type corresponding to ordinal index + */ public static EventType eventFrom(int index) { - switch (index) { - case 0: - return BASE_SUBSTITUTION; - case 1: - return BASE_INSERTION; - case 2: - return BASE_DELETION; - default: - throw new ReviewedStingException(String.format("Event %d does not exist.", index)); - } + return EventType.values()[index]; } - - public static EventType eventFrom(String event) { + + /** + * Get the EventType with short string representation + * @throws IllegalArgumentException if representation doesn't correspond to one of EventType + * @param representation short string representation of the event + * @return an EventType + */ + public static EventType eventFrom(String representation) { for (EventType eventType : EventType.values()) - if (eventType.representation.equals(event)) + if (eventType.representation.equals(representation)) return eventType; - throw new ReviewedStingException(String.format("Event %s does not exist.", event)); + throw new IllegalArgumentException(String.format("Event %s does not exist.", representation)); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java index 4ddcb2b92..405b2d143 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -68,9 +68,9 @@ public class ReadCovariates { * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates */ public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { - keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; - keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; - keys[EventType.BASE_DELETION.index][readOffset][currentCovariateIndex] = deletion; + keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; + keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; + keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; } /** @@ -81,11 +81,11 @@ public class ReadCovariates { * @return */ public int[] getKeySet(final int readPosition, final EventType errorModel) { - return keys[errorModel.index][readPosition]; + return keys[errorModel.ordinal()][readPosition]; } public int[][] getKeySet(final EventType errorModel) { - return keys[errorModel.index]; + return keys[errorModel.ordinal()]; } // ---------------------------------------------------------------------- @@ -94,17 +94,9 @@ public class ReadCovariates { // // ---------------------------------------------------------------------- - protected int[][] getMismatchesKeySet() { - return keys[EventType.BASE_SUBSTITUTION.index]; - } - - protected int[][] getInsertionsKeySet() { - return keys[EventType.BASE_INSERTION.index]; - } - - protected int[][] getDeletionsKeySet() { - return keys[EventType.BASE_DELETION.index]; - } + protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } + protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } + protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } protected int[] getMismatchesKeySet(final int readPosition) { return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 6ecac1394..ff0890ff0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -142,7 +142,7 @@ public class RecalibrationReport { tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex].keyFromValue(covValue); final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempCOVarray[3] = event.index; + tempCOVarray[3] = event.ordinal(); recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray); } @@ -161,7 +161,7 @@ public class RecalibrationReport { final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempQUALarray[2] = event.index; + tempQUALarray[2] = event.ordinal(); qualTable.put(getRecalDatum(reportTable, i, false), tempQUALarray); } @@ -178,7 +178,7 @@ public class RecalibrationReport { final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempRGarray[1] = event.index; + tempRGarray[1] = event.ordinal(); rgTable.put(getRecalDatum(reportTable, i, true), tempRGarray); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java new file mode 100644 index 000000000..08a8f2dc1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.EnumMap; +import java.util.List; + +public final class ReadRecalibrationInfoUnitTest extends BaseTest { + @DataProvider(name = "InfoProvider") + public Object[][] createCombineTablesProvider() { + List tests = new ArrayList(); + + for ( final int readLength: Arrays.asList(10, 100, 1000) ) { + for ( final boolean includeIndelErrors : Arrays.asList(true, false) ) { + tests.add(new Object[]{readLength, includeIndelErrors}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "InfoProvider") + public void testReadInfo(final int readLength, final boolean includeIndelErrors) { + final ReadCovariates covariates = new ReadCovariates(readLength, 2); + + final byte[] bases = new byte[readLength]; + final byte[] baseQuals = new byte[readLength]; + final byte[] insertionQuals = new byte[readLength]; + final byte[] deletionQuals = new byte[readLength]; + final boolean[] skips = new boolean[readLength]; + final double[] snpErrors = new double[readLength]; + final double[] insertionErrors = new double[readLength]; + final double[] deletionsErrors = new double[readLength]; + for ( int i = 0; i < readLength; i++ ) { + bases[i] = 'A'; + baseQuals[i] = (byte)(i % SAMUtils.MAX_PHRED_SCORE); + insertionQuals[i] = (byte)((i+1) % SAMUtils.MAX_PHRED_SCORE); + deletionQuals[i] = (byte)((i+2) % SAMUtils.MAX_PHRED_SCORE); + skips[i] = i % 2 == 0; + snpErrors[i] = 1.0 / (i+1); + insertionErrors[i] = 0.5 / (i+1); + deletionsErrors[i] = 0.3 / (i+1); + } + + final EnumMap errors = new EnumMap(EventType.class); + errors.put(EventType.BASE_SUBSTITUTION, snpErrors); + errors.put(EventType.BASE_INSERTION, insertionErrors); + errors.put(EventType.BASE_DELETION, deletionsErrors); + + final EnumMap quals = new EnumMap(EventType.class); + quals.put(EventType.BASE_SUBSTITUTION, baseQuals); + quals.put(EventType.BASE_INSERTION, insertionQuals); + quals.put(EventType.BASE_DELETION, deletionQuals); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, baseQuals, readLength + "M"); + if ( includeIndelErrors ) { + read.setBaseQualities(insertionQuals, EventType.BASE_INSERTION); + read.setBaseQualities(deletionQuals, EventType.BASE_DELETION); + } + + final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skips, snpErrors, insertionErrors, deletionsErrors); + + Assert.assertEquals(info.getCovariatesValues(), covariates); + Assert.assertEquals(info.getRead(), read); + + for ( int i = 0; i < readLength; i++ ) { + Assert.assertEquals(info.skip(i), skips[i]); + for ( final EventType et : EventType.values() ) { + Assert.assertEquals(info.getErrorFraction(et, i), errors.get(et)[i]); + final byte expectedQual = et == EventType.BASE_SUBSTITUTION || includeIndelErrors ? quals.get(et)[i]: GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL; + Assert.assertEquals(info.getQual(et, i), expectedQual); + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java new file mode 100644 index 000000000..53645e224 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.*; + +public final class EventTypeUnitTest extends BaseTest { + @Test + public void testEventTypes() { + for ( final EventType et : EventType.values() ) { + Assert.assertNotNull(et.toString()); + Assert.assertNotNull(et.prettyPrint()); + Assert.assertFalse("".equals(et.toString())); + Assert.assertFalse("".equals(et.prettyPrint())); + Assert.assertEquals(EventType.eventFrom(et.ordinal()), et); + Assert.assertEquals(EventType.eventFrom(et.toString()), et); + } + } + + @Test + public void testEventTypesEnumItself() { + final Set shortReps = new HashSet(); + for ( final EventType et : EventType.values() ) { + Assert.assertFalse(shortReps.contains(et.toString()), "Short representative for EventType has duplicates for " + et); + shortReps.add(et.toString()); + } + Assert.assertEquals(shortReps.size(), EventType.values().length, "Short representatives for EventType aren't unique"); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testBadString() { + EventType.eventFrom("asdfhalsdjfalkjsdf"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index 9707ed078..aa0419fed 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -20,7 +20,7 @@ public class RecalibrationReportUnitTest { private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { final Random random = new Random(); final int nObservations = random.nextInt(maxObservations); - final int nErrors = random.nextInt(maxErrors); + final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); return new RecalDatum(nObservations, nErrors, (byte)qual); } @@ -90,14 +90,14 @@ public class RecalibrationReportUnitTest { final int[] covariates = rc.getKeySet(offset, errorMode); final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; - rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.index); - qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.index); + rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); + qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); nKeys += 2; for (int j = 0; j < optionalCovariates.size(); j++) { final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j); final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j]; if ( covValue >= 0 ) { - covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.index); + covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); nKeys++; } } From fe06912a87c2a55f3f22b5760855afc9d4987afe Mon Sep 17 00:00:00 2001 From: Tad Jordan Date: Fri, 4 Jan 2013 11:52:04 -0500 Subject: [PATCH 8/8] Removed sorting by row from walkers --- .../sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java | 2 +- .../gatk/walkers/varianteval/VariantEvalReportWriter.java | 2 +- .../sting/utils/recalibration/QuantizationInfo.java | 4 ++-- .../broadinstitute/sting/gatk/report/GATKReportUnitTest.java | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java index 5972322f8..98e581e21 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -124,7 +124,7 @@ public class ErrorRatePerCycle extends LocusWalker { public void initialize() { report = new GATKReport(); - report.addTable(reportName, reportDescription, 6, GATKReportTable.TableSortingWay.SORT_BY_ROW); + report.addTable(reportName, reportDescription, 6); table = report.getTable(reportName); table.addColumn("readgroup"); table.addColumn("cycle"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java index 6dad128fe..91efd1ffd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java @@ -162,7 +162,7 @@ public class VariantEvalReportWriter { // create the table final String tableName = ve.getSimpleName(); final String tableDesc = ve.getClass().getAnnotation(Analysis.class).description(); - report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 2 : datamap.size()), GATKReportTable.TableSortingWay.SORT_BY_ROW); + report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 2 : datamap.size())); // grab the table, and add the columns we need to it final GATKReportTable table = report.getTable(tableName); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java index fc942499c..5cf16dc9f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java @@ -67,9 +67,9 @@ public class QuantizationInfo { return quantizationLevels; } - public GATKReportTable generateReportTable(boolean sortBycols) { + public GATKReportTable generateReportTable(boolean sortByCols) { GATKReportTable quantizedTable; - if(sortBycols) { + if(sortByCols) { quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index 40d8d8ff9..0637e9a25 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -105,7 +105,7 @@ public class GATKReportUnitTest extends BaseTest { private boolean isSorted(GATKReportTable table) { boolean result = true; - File testingSortingTableFile = new File("myFile.txt"); + File testingSortingTableFile = new File("testSortingFile.txt"); try { // Connect print stream to the output stream