From a481d006f04a9e7d69a7faa85050577c20053d8d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 12 Dec 2012 16:14:09 -0500 Subject: [PATCH] Optimizations for applying BQSR table with PrintReads -- Cleaned up code in updateDataForRead so that constant values where not computed in inner loops -- BaseRecalibrator doesn't create it's own fasta index reader, it just piggy backs on the GATK one -- ReadCovariates now uses a thread local cache for it's int[][][] keys member variable. This stops us from recreating an expensive array over and over. In order to make this really work had to update recordValues in ContextCovariate so it writes 0s over base values its skipping because of low quality base clipping. Previously the values in the ReadCovariates keys were 0 because they were never modified by ContextCovariates. Now these values are actually zero'd out explicitly by the covariates. --- .../bqsr/AdvancedRecalibrationEngine.java | 9 +- .../gatk/walkers/bqsr/BaseRecalibrator.java | 11 +-- .../broadinstitute/sting/utils/LRUCache.java | 20 ++++ .../recalibration/BaseRecalibration.java | 8 -- .../utils/recalibration/ReadCovariates.java | 93 ++++++++++++++----- .../covariates/ContextCovariate.java | 16 +++- 6 files changed, 107 insertions(+), 50 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/LRUCache.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index ffd681056..ac280b70e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -45,13 +45,12 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp @Override public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + final ReadCovariates readCovariates = covariateKeySetFrom(read); + byte[] tempQualArray = threadLocalTempQualArray.get(); + double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get(); + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { if( !skip[offset] ) { - final ReadCovariates readCovariates = covariateKeySetFrom(read); - - byte[] tempQualArray = threadLocalTempQualArray.get(); - double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get(); - tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 2410aefce..f037f861f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -45,7 +45,6 @@ import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.recalibration.*; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; @@ -53,7 +52,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; import java.lang.reflect.Constructor; @@ -194,14 +192,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche recalibrationEngine.initialize(requestedCovariates, recalibrationTables); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; - - try { - // fasta reference reader for use with BAQ calculation - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); - } - + referenceReader = getToolkit().getReferenceDataSource().getReference(); } private RecalibrationEngine initializeRecalibrationEngine() { diff --git a/public/java/src/org/broadinstitute/sting/utils/LRUCache.java b/public/java/src/org/broadinstitute/sting/utils/LRUCache.java new file mode 100644 index 000000000..a3514c95f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/LRUCache.java @@ -0,0 +1,20 @@ +package org.broadinstitute.sting.utils; + +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * An LRU cache implemented as an extension to LinkedHashMap + */ +public class LRUCache extends LinkedHashMap { + private int capacity; // Maximum number of items in the cache. + + public LRUCache(int capacity) { + super(capacity+1, 1.0f, true); // Pass 'true' for accessOrder. + this.capacity = capacity; + } + + protected boolean removeEldestEntry(final Map.Entry entry) { + return (size() > this.capacity); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 5d4020a07..7d63996c3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -54,14 +54,6 @@ public class BaseRecalibration { private final int preserveQLessThan; private final boolean emitOriginalQuals; - // TODO -- was this supposed to be used somewhere? -// private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. -// static { -// for (int i = 0; i < EventType.values().length; i++) -// qualityScoreByFullCovariateKey[i] = new NestedHashMap(); -// } - - /** * Constructor using a GATK Report file * diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java index 2b682f84b..ae2b0ad28 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.recalibration; -import java.util.Arrays; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.LRUCache; /** * The object temporarily held by a read that describes all of it's covariates. @@ -11,12 +12,47 @@ import java.util.Arrays; * @since 2/8/12 */ public class ReadCovariates { + private final static Logger logger = Logger.getLogger(ReadCovariates.class); + + /** + * How big should we let the LRU cache grow + */ + private static final int LRU_CACHE_SIZE = 500; + + /** + * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. + * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU + * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. + * + * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE + */ + private final static ThreadLocal> keysCache = new ThreadLocal>() { + @Override protected LRUCache initialValue() { + return new LRUCache(LRU_CACHE_SIZE); + } + }; + + /** + * Our keys, indexed by event type x read length x covariate + */ private final int[][][] keys; + /** + * The index of the current covariate, used by addCovariate + */ private int currentCovariateIndex = 0; public ReadCovariates(final int readLength, final int numberOfCovariates) { - keys = new int[EventType.values().length][readLength][numberOfCovariates]; + final LRUCache cache = keysCache.get(); + final int[][][] cachedKeys = cache.get(readLength); + if ( cachedKeys == null ) { + // There's no cached value for read length so we need to create a new int[][][] array + logger.info("Keys cache miss for length " + readLength + " cache size " + cache.size()); + keys = new int[EventType.values().length][readLength][numberOfCovariates]; + cache.put(readLength, keys); + } else { + keys = cachedKeys; + } } public void setCovariateIndex(final int index) { @@ -24,22 +60,26 @@ public class ReadCovariates { } /** - * Necessary due to bug in BaseRecalibration recalibrateRead function. It is clearly seeing space it's not supposed to - * @return + * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset + * + * @param mismatch the mismatch key value + * @param insertion the insertion key value + * @param deletion the deletion key value + * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates */ - public ReadCovariates clear() { - for ( int i = 0; i < keys.length; i++ ) - for ( int j = 0; j < keys[i].length; j++) - Arrays.fill(keys[i][j], 0); - return this; - } - public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; keys[EventType.BASE_DELETION.index][readOffset][currentCovariateIndex] = deletion; } + /** + * Get the keys for all covariates at read position for error model + * + * @param readPosition + * @param errorModel + * @return + */ public int[] getKeySet(final int readPosition, final EventType errorModel) { return keys[errorModel.index][readPosition]; } @@ -48,21 +88,12 @@ public class ReadCovariates { return keys[errorModel.index]; } - public int[] getMismatchesKeySet(final int readPosition) { - return keys[EventType.BASE_SUBSTITUTION.index][readPosition]; - } + // ---------------------------------------------------------------------- + // + // routines for testing + // + // ---------------------------------------------------------------------- - public int[] getInsertionsKeySet(final int readPosition) { - return keys[EventType.BASE_INSERTION.index][readPosition]; - } - - public int[] getDeletionsKeySet(final int readPosition) { - return keys[EventType.BASE_DELETION.index][readPosition]; - } - - /** - * Testing routines - */ protected int[][] getMismatchesKeySet() { return keys[EventType.BASE_SUBSTITUTION.index]; } @@ -74,4 +105,16 @@ public class ReadCovariates { protected int[][] getDeletionsKeySet() { return keys[EventType.BASE_DELETION.index]; } + + protected int[] getMismatchesKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); + } + + protected int[] getInsertionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_INSERTION); + } + + protected int[] getDeletionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_DELETION); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java index 5e470b35f..b586a1607 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java @@ -26,13 +26,13 @@ package org.broadinstitute.sting.utils.recalibration.covariates; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; @@ -99,9 +99,21 @@ public class ContextCovariate implements StandardCovariate { final ArrayList indelKeys = contextWith(bases, indelsContextSize, indelsKeyMask); final int readLength = bases.length; + + // this is necessary to ensure that we don't keep historical data in the ReadCovariates values + // since the context covariate may not span the entire set of values in read covariates + // due to the clipping of the low quality bases + if ( readLength != originalBases.length ) { + // don't both zeroing out if we are going to overwrite the whole array + for ( int i = 0; i < originalBases.length; i++ ) + // this base has been clipped off, so zero out the covariate values here + values.addCovariate(0, 0, 0, i); + } + for (int i = 0; i < readLength; i++) { + final int readOffset = (negativeStrand ? readLength - i - 1 : i); final int indelKey = indelKeys.get(i); - values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, (negativeStrand ? readLength - i - 1 : i)); + values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, readOffset); } // put the original bases back in