Optimizations for applying BQSR table with PrintReads
-- Cleaned up code in updateDataForRead so that constant values where not computed in inner loops -- BaseRecalibrator doesn't create it's own fasta index reader, it just piggy backs on the GATK one -- ReadCovariates <init> now uses a thread local cache for it's int[][][] keys member variable. This stops us from recreating an expensive array over and over. In order to make this really work had to update recordValues in ContextCovariate so it writes 0s over base values its skipping because of low quality base clipping. Previously the values in the ReadCovariates keys were 0 because they were never modified by ContextCovariates. Now these values are actually zero'd out explicitly by the covariates.
This commit is contained in:
parent
5ec25797b3
commit
a481d006f0
|
|
@ -45,13 +45,12 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
|
|||
|
||||
@Override
|
||||
public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(read);
|
||||
byte[] tempQualArray = threadLocalTempQualArray.get();
|
||||
double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get();
|
||||
|
||||
for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
|
||||
if( !skip[offset] ) {
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(read);
|
||||
|
||||
byte[] tempQualArray = threadLocalTempQualArray.get();
|
||||
double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get();
|
||||
|
||||
tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset];
|
||||
tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset];
|
||||
|
|
|
|||
|
|
@ -45,7 +45,6 @@ import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
|||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.recalibration.*;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
|
|
@ -53,7 +52,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.lang.reflect.Constructor;
|
||||
|
|
@ -194,14 +192,7 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
|
|||
recalibrationEngine.initialize(requestedCovariates, recalibrationTables);
|
||||
|
||||
minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
|
||||
|
||||
try {
|
||||
// fasta reference reader for use with BAQ calculation
|
||||
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
|
||||
}
|
||||
|
||||
referenceReader = getToolkit().getReferenceDataSource().getReference();
|
||||
}
|
||||
|
||||
private RecalibrationEngine initializeRecalibrationEngine() {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,20 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* An LRU cache implemented as an extension to LinkedHashMap
|
||||
*/
|
||||
public class LRUCache<K,V> extends LinkedHashMap<K,V> {
|
||||
private int capacity; // Maximum number of items in the cache.
|
||||
|
||||
public LRUCache(int capacity) {
|
||||
super(capacity+1, 1.0f, true); // Pass 'true' for accessOrder.
|
||||
this.capacity = capacity;
|
||||
}
|
||||
|
||||
protected boolean removeEldestEntry(final Map.Entry entry) {
|
||||
return (size() > this.capacity);
|
||||
}
|
||||
}
|
||||
|
|
@ -54,14 +54,6 @@ public class BaseRecalibration {
|
|||
private final int preserveQLessThan;
|
||||
private final boolean emitOriginalQuals;
|
||||
|
||||
// TODO -- was this supposed to be used somewhere?
|
||||
// private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
|
||||
// static {
|
||||
// for (int i = 0; i < EventType.values().length; i++)
|
||||
// qualityScoreByFullCovariateKey[i] = new NestedHashMap();
|
||||
// }
|
||||
|
||||
|
||||
/**
|
||||
* Constructor using a GATK Report file
|
||||
*
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import java.util.Arrays;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.LRUCache;
|
||||
|
||||
/**
|
||||
* The object temporarily held by a read that describes all of it's covariates.
|
||||
|
|
@ -11,12 +12,47 @@ import java.util.Arrays;
|
|||
* @since 2/8/12
|
||||
*/
|
||||
public class ReadCovariates {
|
||||
private final static Logger logger = Logger.getLogger(ReadCovariates.class);
|
||||
|
||||
/**
|
||||
* How big should we let the LRU cache grow
|
||||
*/
|
||||
private static final int LRU_CACHE_SIZE = 500;
|
||||
|
||||
/**
|
||||
* Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen.
|
||||
* The cache allows us to avoid the expense of recreating these arrays for every read. The LRU
|
||||
* keeps the total number of cached arrays to less than LRU_CACHE_SIZE.
|
||||
*
|
||||
* This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE
|
||||
*/
|
||||
private final static ThreadLocal<LRUCache<Integer, int[][][]>> keysCache = new ThreadLocal<LRUCache<Integer, int[][][]>>() {
|
||||
@Override protected LRUCache<Integer, int[][][]> initialValue() {
|
||||
return new LRUCache<Integer, int[][][]>(LRU_CACHE_SIZE);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Our keys, indexed by event type x read length x covariate
|
||||
*/
|
||||
private final int[][][] keys;
|
||||
|
||||
/**
|
||||
* The index of the current covariate, used by addCovariate
|
||||
*/
|
||||
private int currentCovariateIndex = 0;
|
||||
|
||||
public ReadCovariates(final int readLength, final int numberOfCovariates) {
|
||||
keys = new int[EventType.values().length][readLength][numberOfCovariates];
|
||||
final LRUCache<Integer, int[][][]> cache = keysCache.get();
|
||||
final int[][][] cachedKeys = cache.get(readLength);
|
||||
if ( cachedKeys == null ) {
|
||||
// There's no cached value for read length so we need to create a new int[][][] array
|
||||
logger.info("Keys cache miss for length " + readLength + " cache size " + cache.size());
|
||||
keys = new int[EventType.values().length][readLength][numberOfCovariates];
|
||||
cache.put(readLength, keys);
|
||||
} else {
|
||||
keys = cachedKeys;
|
||||
}
|
||||
}
|
||||
|
||||
public void setCovariateIndex(final int index) {
|
||||
|
|
@ -24,22 +60,26 @@ public class ReadCovariates {
|
|||
}
|
||||
|
||||
/**
|
||||
* Necessary due to bug in BaseRecalibration recalibrateRead function. It is clearly seeing space it's not supposed to
|
||||
* @return
|
||||
* Update the keys for mismatch, insertion, and deletion for the current covariate at read offset
|
||||
*
|
||||
* @param mismatch the mismatch key value
|
||||
* @param insertion the insertion key value
|
||||
* @param deletion the deletion key value
|
||||
* @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates
|
||||
*/
|
||||
public ReadCovariates clear() {
|
||||
for ( int i = 0; i < keys.length; i++ )
|
||||
for ( int j = 0; j < keys[i].length; j++)
|
||||
Arrays.fill(keys[i][j], 0);
|
||||
return this;
|
||||
}
|
||||
|
||||
public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) {
|
||||
keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch;
|
||||
keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion;
|
||||
keys[EventType.BASE_DELETION.index][readOffset][currentCovariateIndex] = deletion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the keys for all covariates at read position for error model
|
||||
*
|
||||
* @param readPosition
|
||||
* @param errorModel
|
||||
* @return
|
||||
*/
|
||||
public int[] getKeySet(final int readPosition, final EventType errorModel) {
|
||||
return keys[errorModel.index][readPosition];
|
||||
}
|
||||
|
|
@ -48,21 +88,12 @@ public class ReadCovariates {
|
|||
return keys[errorModel.index];
|
||||
}
|
||||
|
||||
public int[] getMismatchesKeySet(final int readPosition) {
|
||||
return keys[EventType.BASE_SUBSTITUTION.index][readPosition];
|
||||
}
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// routines for testing
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
public int[] getInsertionsKeySet(final int readPosition) {
|
||||
return keys[EventType.BASE_INSERTION.index][readPosition];
|
||||
}
|
||||
|
||||
public int[] getDeletionsKeySet(final int readPosition) {
|
||||
return keys[EventType.BASE_DELETION.index][readPosition];
|
||||
}
|
||||
|
||||
/**
|
||||
* Testing routines
|
||||
*/
|
||||
protected int[][] getMismatchesKeySet() {
|
||||
return keys[EventType.BASE_SUBSTITUTION.index];
|
||||
}
|
||||
|
|
@ -74,4 +105,16 @@ public class ReadCovariates {
|
|||
protected int[][] getDeletionsKeySet() {
|
||||
return keys[EventType.BASE_DELETION.index];
|
||||
}
|
||||
|
||||
protected int[] getMismatchesKeySet(final int readPosition) {
|
||||
return getKeySet(readPosition, EventType.BASE_SUBSTITUTION);
|
||||
}
|
||||
|
||||
protected int[] getInsertionsKeySet(final int readPosition) {
|
||||
return getKeySet(readPosition, EventType.BASE_INSERTION);
|
||||
}
|
||||
|
||||
protected int[] getDeletionsKeySet(final int readPosition) {
|
||||
return getKeySet(readPosition, EventType.BASE_DELETION);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,13 +26,13 @@
|
|||
package org.broadinstitute.sting.utils.recalibration.covariates;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.clipping.ClippingRepresentation;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -99,9 +99,21 @@ public class ContextCovariate implements StandardCovariate {
|
|||
final ArrayList<Integer> indelKeys = contextWith(bases, indelsContextSize, indelsKeyMask);
|
||||
|
||||
final int readLength = bases.length;
|
||||
|
||||
// this is necessary to ensure that we don't keep historical data in the ReadCovariates values
|
||||
// since the context covariate may not span the entire set of values in read covariates
|
||||
// due to the clipping of the low quality bases
|
||||
if ( readLength != originalBases.length ) {
|
||||
// don't both zeroing out if we are going to overwrite the whole array
|
||||
for ( int i = 0; i < originalBases.length; i++ )
|
||||
// this base has been clipped off, so zero out the covariate values here
|
||||
values.addCovariate(0, 0, 0, i);
|
||||
}
|
||||
|
||||
for (int i = 0; i < readLength; i++) {
|
||||
final int readOffset = (negativeStrand ? readLength - i - 1 : i);
|
||||
final int indelKey = indelKeys.get(i);
|
||||
values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, (negativeStrand ? readLength - i - 1 : i));
|
||||
values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, readOffset);
|
||||
}
|
||||
|
||||
// put the original bases back in
|
||||
|
|
|
|||
Loading…
Reference in New Issue