diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index ffd681056..ac280b70e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -45,13 +45,12 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp @Override public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + final ReadCovariates readCovariates = covariateKeySetFrom(read); + byte[] tempQualArray = threadLocalTempQualArray.get(); + double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get(); + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { if( !skip[offset] ) { - final ReadCovariates readCovariates = covariateKeySetFrom(read); - - byte[] tempQualArray = threadLocalTempQualArray.get(); - double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get(); - tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 2410aefce..f037f861f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -45,7 +45,6 @@ import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.recalibration.*; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; @@ -53,7 +52,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; import java.lang.reflect.Constructor; @@ -194,14 +192,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche recalibrationEngine.initialize(requestedCovariates, recalibrationTables); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; - - try { - // fasta reference reader for use with BAQ calculation - referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } catch( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); - } - + referenceReader = getToolkit().getReferenceDataSource().getReference(); } private RecalibrationEngine initializeRecalibrationEngine() { diff --git a/public/java/src/org/broadinstitute/sting/utils/LRUCache.java b/public/java/src/org/broadinstitute/sting/utils/LRUCache.java new file mode 100644 index 000000000..a3514c95f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/LRUCache.java @@ -0,0 +1,20 @@ +package org.broadinstitute.sting.utils; + +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * An LRU cache implemented as an extension to LinkedHashMap + */ +public class LRUCache extends LinkedHashMap { + private int capacity; // Maximum number of items in the cache. + + public LRUCache(int capacity) { + super(capacity+1, 1.0f, true); // Pass 'true' for accessOrder. + this.capacity = capacity; + } + + protected boolean removeEldestEntry(final Map.Entry entry) { + return (size() > this.capacity); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 5d4020a07..7d63996c3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -54,14 +54,6 @@ public class BaseRecalibration { private final int preserveQLessThan; private final boolean emitOriginalQuals; - // TODO -- was this supposed to be used somewhere? -// private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. -// static { -// for (int i = 0; i < EventType.values().length; i++) -// qualityScoreByFullCovariateKey[i] = new NestedHashMap(); -// } - - /** * Constructor using a GATK Report file * diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java index 2b682f84b..ae2b0ad28 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.recalibration; -import java.util.Arrays; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.LRUCache; /** * The object temporarily held by a read that describes all of it's covariates. @@ -11,12 +12,47 @@ import java.util.Arrays; * @since 2/8/12 */ public class ReadCovariates { + private final static Logger logger = Logger.getLogger(ReadCovariates.class); + + /** + * How big should we let the LRU cache grow + */ + private static final int LRU_CACHE_SIZE = 500; + + /** + * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. + * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU + * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. + * + * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE + */ + private final static ThreadLocal> keysCache = new ThreadLocal>() { + @Override protected LRUCache initialValue() { + return new LRUCache(LRU_CACHE_SIZE); + } + }; + + /** + * Our keys, indexed by event type x read length x covariate + */ private final int[][][] keys; + /** + * The index of the current covariate, used by addCovariate + */ private int currentCovariateIndex = 0; public ReadCovariates(final int readLength, final int numberOfCovariates) { - keys = new int[EventType.values().length][readLength][numberOfCovariates]; + final LRUCache cache = keysCache.get(); + final int[][][] cachedKeys = cache.get(readLength); + if ( cachedKeys == null ) { + // There's no cached value for read length so we need to create a new int[][][] array + logger.info("Keys cache miss for length " + readLength + " cache size " + cache.size()); + keys = new int[EventType.values().length][readLength][numberOfCovariates]; + cache.put(readLength, keys); + } else { + keys = cachedKeys; + } } public void setCovariateIndex(final int index) { @@ -24,22 +60,26 @@ public class ReadCovariates { } /** - * Necessary due to bug in BaseRecalibration recalibrateRead function. It is clearly seeing space it's not supposed to - * @return + * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset + * + * @param mismatch the mismatch key value + * @param insertion the insertion key value + * @param deletion the deletion key value + * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates */ - public ReadCovariates clear() { - for ( int i = 0; i < keys.length; i++ ) - for ( int j = 0; j < keys[i].length; j++) - Arrays.fill(keys[i][j], 0); - return this; - } - public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; keys[EventType.BASE_DELETION.index][readOffset][currentCovariateIndex] = deletion; } + /** + * Get the keys for all covariates at read position for error model + * + * @param readPosition + * @param errorModel + * @return + */ public int[] getKeySet(final int readPosition, final EventType errorModel) { return keys[errorModel.index][readPosition]; } @@ -48,21 +88,12 @@ public class ReadCovariates { return keys[errorModel.index]; } - public int[] getMismatchesKeySet(final int readPosition) { - return keys[EventType.BASE_SUBSTITUTION.index][readPosition]; - } + // ---------------------------------------------------------------------- + // + // routines for testing + // + // ---------------------------------------------------------------------- - public int[] getInsertionsKeySet(final int readPosition) { - return keys[EventType.BASE_INSERTION.index][readPosition]; - } - - public int[] getDeletionsKeySet(final int readPosition) { - return keys[EventType.BASE_DELETION.index][readPosition]; - } - - /** - * Testing routines - */ protected int[][] getMismatchesKeySet() { return keys[EventType.BASE_SUBSTITUTION.index]; } @@ -74,4 +105,16 @@ public class ReadCovariates { protected int[][] getDeletionsKeySet() { return keys[EventType.BASE_DELETION.index]; } + + protected int[] getMismatchesKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); + } + + protected int[] getInsertionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_INSERTION); + } + + protected int[] getDeletionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_DELETION); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java index 5e470b35f..b586a1607 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java @@ -26,13 +26,13 @@ package org.broadinstitute.sting.utils.recalibration.covariates; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; @@ -99,9 +99,21 @@ public class ContextCovariate implements StandardCovariate { final ArrayList indelKeys = contextWith(bases, indelsContextSize, indelsKeyMask); final int readLength = bases.length; + + // this is necessary to ensure that we don't keep historical data in the ReadCovariates values + // since the context covariate may not span the entire set of values in read covariates + // due to the clipping of the low quality bases + if ( readLength != originalBases.length ) { + // don't both zeroing out if we are going to overwrite the whole array + for ( int i = 0; i < originalBases.length; i++ ) + // this base has been clipped off, so zero out the covariate values here + values.addCovariate(0, 0, 0, i); + } + for (int i = 0; i < readLength; i++) { + final int readOffset = (negativeStrand ? readLength - i - 1 : i); final int indelKey = indelKeys.get(i); - values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, (negativeStrand ? readLength - i - 1 : i)); + values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, readOffset); } // put the original bases back in