Optimization of BQSR
-- Created a ReadRecalibrationInfo class that holds all of the information (read, base quality vectors, error vectors) for a read for the call to updateDataForRead in RecalibrationEngine. This object has a restrictive interface to just get information about specific qual and error values at offset and for event type. This restrict allows us to avoid creating an vector of byte 45 for each read to represent BI and BD values not in the reads. Shaves 5% of the runtime off the entire code. -- Cleaned up code and added lots more docs -- With this commit we no longer have much in the way of low-hanging fruit left in the optimization of BQSR. 95% of the runtime is spent in BAQing the read, and updating the RecalData in the NestedIntegerArrays.
This commit is contained in:
parent
f6d5499582
commit
0f0188ddb1
|
|
@ -28,41 +28,22 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
|
|||
import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadLocalArray;
|
||||
|
||||
public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource {
|
||||
|
||||
// optimization: only allocate temp arrays once per thread
|
||||
private final ThreadLocal<byte[]> threadLocalTempQualArray = new ThreadLocalArray<byte[]>(EventType.values().length, byte.class);
|
||||
private final ThreadLocal<double[]> threadLocalTempFractionalErrorArray = new ThreadLocalArray<double[]>(EventType.values().length, double.class);
|
||||
|
||||
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) {
|
||||
super.initialize(covariates, recalibrationTables);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(read);
|
||||
byte[] tempQualArray = threadLocalTempQualArray.get();
|
||||
double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get();
|
||||
public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) {
|
||||
final GATKSAMRecord read = recalInfo.getRead();
|
||||
final ReadCovariates readCovariates = recalInfo.getCovariatesValues();
|
||||
|
||||
for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
|
||||
if( !skip[offset] ) {
|
||||
tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset];
|
||||
tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset];
|
||||
tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset];
|
||||
if( ! recalInfo.skip(offset) ) {
|
||||
|
||||
for (final EventType eventType : EventType.values()) {
|
||||
final int[] keys = readCovariates.getKeySet(offset, eventType);
|
||||
final int eventIndex = eventType.index;
|
||||
final byte qual = tempQualArray[eventIndex];
|
||||
final double isError = tempFractionalErrorArray[eventIndex];
|
||||
final byte qual = recalInfo.getQual(eventType, offset);
|
||||
final double isError = recalInfo.getErrorFraction(eventType, offset);
|
||||
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex);
|
||||
|
||||
|
|
|
|||
|
|
@ -225,19 +225,22 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
|
|||
if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls
|
||||
return 0L; // skip this read completely
|
||||
}
|
||||
read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates));
|
||||
|
||||
final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases
|
||||
final int[] isSNP = calculateIsSNP(read, ref, originalRead);
|
||||
final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
|
||||
final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
|
||||
final byte[] baqArray = calculateBAQArray(read);
|
||||
|
||||
if( baqArray != null ) { // some reads just can't be BAQ'ed
|
||||
final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates);
|
||||
final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases
|
||||
final int[] isSNP = calculateIsSNP(read, ref, originalRead);
|
||||
final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
|
||||
final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
|
||||
final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray);
|
||||
final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray);
|
||||
final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray);
|
||||
recalibrationEngine.updateDataForRead(read, skip, snpErrors, insertionErrors, deletionErrors);
|
||||
|
||||
// aggregrate all of the info into our info object, and update the data
|
||||
final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors);
|
||||
recalibrationEngine.updateDataForRead(info);
|
||||
return 1L;
|
||||
} else {
|
||||
return 0L;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,157 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 12/18/12
|
||||
* Time: 3:50 PM
|
||||
*
|
||||
* TODO -- merge in ReadCovariates?
|
||||
*/
|
||||
public final class ReadRecalibrationInfo {
|
||||
private final GATKSAMRecord read;
|
||||
private final int length;
|
||||
private final ReadCovariates covariates;
|
||||
private final boolean[] skips;
|
||||
private final byte[] baseQuals, insertionQuals, deletionQuals;
|
||||
private final double[] snpErrors, insertionErrors, deletionErrors;
|
||||
|
||||
public ReadRecalibrationInfo(final GATKSAMRecord read,
|
||||
final ReadCovariates covariates,
|
||||
final boolean[] skips,
|
||||
final double[] snpErrors,
|
||||
final double[] insertionErrors,
|
||||
final double[] deletionErrors) {
|
||||
if ( read == null ) throw new IllegalArgumentException("read cannot be null");
|
||||
if ( covariates == null ) throw new IllegalArgumentException("covariates cannot be null");
|
||||
if ( skips == null ) throw new IllegalArgumentException("skips cannot be null");
|
||||
if ( snpErrors == null ) throw new IllegalArgumentException("snpErrors cannot be null");
|
||||
// future: may allow insertionErrors && deletionErrors to be null, so don't enforce
|
||||
|
||||
this.read = read;
|
||||
this.baseQuals = read.getBaseQualities();
|
||||
this.length = baseQuals.length;
|
||||
this.covariates = covariates;
|
||||
this.skips = skips;
|
||||
this.insertionQuals = read.getExistingBaseInsertionQualities();
|
||||
this.deletionQuals = read.getExistingBaseDeletionQualities();
|
||||
this.snpErrors = snpErrors;
|
||||
this.insertionErrors = insertionErrors;
|
||||
this.deletionErrors = deletionErrors;
|
||||
|
||||
if ( skips.length != length ) throw new IllegalArgumentException("skips.length " + snpErrors.length + " != length " + length);
|
||||
if ( snpErrors.length != length ) throw new IllegalArgumentException("snpErrors.length " + snpErrors.length + " != length " + length);
|
||||
if ( insertionErrors != null && insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length);
|
||||
if ( deletionErrors != null && deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the qual score for event type at offset
|
||||
*
|
||||
* @param eventType the type of event we want the qual for
|
||||
* @param offset the offset into this read for the qual
|
||||
* @return a valid quality score for event at offset
|
||||
*/
|
||||
@Requires("validOffset(offset)")
|
||||
@Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE")
|
||||
public byte getQual(final EventType eventType, final int offset) {
|
||||
switch ( eventType ) {
|
||||
case BASE_SUBSTITUTION: return baseQuals[offset];
|
||||
// note optimization here -- if we don't have ins/del quals we just return the default byte directly
|
||||
case BASE_INSERTION: return insertionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : insertionQuals[offset];
|
||||
case BASE_DELETION: return deletionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : deletionQuals[offset];
|
||||
default: throw new IllegalStateException("Unknown event type " + eventType);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the error fraction for event type at offset
|
||||
*
|
||||
* The error fraction is a value between 0 and 1 that indicates how much certainty we have
|
||||
* in the error occurring at offset. A value of 1 means that the error definitely occurs at this
|
||||
* site, a value of 0.0 means it definitely doesn't happen here. 0.5 means that half the weight
|
||||
* of the error belongs here
|
||||
*
|
||||
* @param eventType the type of event we want the qual for
|
||||
* @param offset the offset into this read for the qual
|
||||
* @return a fractional weight for an error at this offset
|
||||
*/
|
||||
@Requires("validOffset(offset)")
|
||||
@Ensures("result >= 0.0 && result <= 1.0")
|
||||
public double getErrorFraction(final EventType eventType, final int offset) {
|
||||
switch ( eventType ) {
|
||||
case BASE_SUBSTITUTION: return snpErrors[offset];
|
||||
case BASE_INSERTION: return insertionErrors[offset];
|
||||
case BASE_DELETION: return deletionErrors[offset];
|
||||
default: throw new IllegalStateException("Unknown event type " + eventType);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the read involved in this recalibration info
|
||||
* @return a non-null GATKSAMRecord
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public GATKSAMRecord getRead() {
|
||||
return read;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should offset in this read be skipped (because it's covered by a known variation site?)
|
||||
* @param offset a valid offset into this info
|
||||
* @return true if offset should be skipped, false otherwise
|
||||
*/
|
||||
@Requires("validOffset(offset)")
|
||||
public boolean skip(final int offset) {
|
||||
return skips[offset];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the ReadCovariates object carrying the mapping from offsets -> covariate key sets
|
||||
* @return a non-null ReadCovariates object
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public ReadCovariates getCovariatesValues() {
|
||||
return covariates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures an offset is valid. Used in contracts
|
||||
* @param offset a proposed offset
|
||||
* @return true if offset is valid w.r.t. the data in this object, false otherwise
|
||||
*/
|
||||
private boolean validOffset(final int offset) {
|
||||
return offset >= 0 && offset < baseQuals.length;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -29,10 +30,31 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
public interface RecalibrationEngine {
|
||||
|
||||
/**
|
||||
* Initialize the recalibration engine
|
||||
*
|
||||
* Called once before any calls to updateDataForRead are made. The engine should prepare itself
|
||||
* to handle any number of updateDataForRead calls containing ReadRecalibrationInfo containing
|
||||
* keys for each of the covariates provided.
|
||||
*
|
||||
* The engine should collect match and mismatch data into the recalibrationTables data.
|
||||
*
|
||||
* @param covariates an array of the covariates we'll be using in this engine, order matters
|
||||
* @param recalibrationTables the destination recalibrationTables where stats should be collected
|
||||
*/
|
||||
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables);
|
||||
|
||||
public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors);
|
||||
/**
|
||||
* Update the recalibration statistics using the information in recalInfo
|
||||
* @param recalInfo data structure holding information about the recalibration values for a single read
|
||||
*/
|
||||
@Requires("recalInfo != null")
|
||||
public void updateDataForRead(final ReadRecalibrationInfo recalInfo);
|
||||
|
||||
/**
|
||||
* Finalize, if appropriate, all derived data in recalibrationTables.
|
||||
*
|
||||
* Called once after all calls to updateDataForRead have been issued.
|
||||
*/
|
||||
public void finalizeData();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,34 +35,37 @@ import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
|||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
public class StandardRecalibrationEngine implements RecalibrationEngine, PublicPackageSource {
|
||||
|
||||
protected Covariate[] covariates;
|
||||
protected RecalibrationTables recalibrationTables;
|
||||
|
||||
@Override
|
||||
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) {
|
||||
if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null");
|
||||
if ( recalibrationTables == null ) throw new IllegalArgumentException("recalibrationTables cannot be null");
|
||||
|
||||
this.covariates = covariates.clone();
|
||||
this.recalibrationTables = recalibrationTables;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
|
||||
public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) {
|
||||
final GATKSAMRecord read = recalInfo.getRead();
|
||||
final EventType eventType = EventType.BASE_SUBSTITUTION;
|
||||
final ReadCovariates readCovariates = recalInfo.getCovariatesValues();
|
||||
|
||||
for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
|
||||
if( !skip[offset] ) {
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(read);
|
||||
if( ! recalInfo.skip(offset) ) {
|
||||
final byte qual = recalInfo.getQual(eventType, offset);
|
||||
final double isError = recalInfo.getErrorFraction(eventType, offset);
|
||||
final int[] keys = readCovariates.getKeySet(offset, eventType);
|
||||
|
||||
final byte qual = read.getBaseQualities()[offset];
|
||||
final double isError = snpErrors[offset];
|
||||
|
||||
final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION);
|
||||
final int eventIndex = EventType.BASE_SUBSTITUTION.index;
|
||||
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex);
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index);
|
||||
|
||||
for (int i = 2; i < covariates.length; i++) {
|
||||
if (keys[i] < 0)
|
||||
continue;
|
||||
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
|
||||
incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -79,16 +82,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
|
|||
return new RecalDatum(1, isError, reportedQual);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the covariate key set from a read
|
||||
*
|
||||
* @param read the read
|
||||
* @return the covariate keysets for this read
|
||||
*/
|
||||
protected ReadCovariates covariateKeySetFrom(GATKSAMRecord read) {
|
||||
return (ReadCovariates) read.getTemporaryAttribute(BaseRecalibrator.COVARS_ATTRIBUTE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create derived recalibration data tables
|
||||
*
|
||||
|
|
@ -129,7 +122,10 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP
|
|||
* @param isError error value for this event
|
||||
* @param keys location in table of our item
|
||||
*/
|
||||
protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray<RecalDatum> table, final byte qual, final double isError, final int... keys ) {
|
||||
protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray<RecalDatum> table,
|
||||
final byte qual,
|
||||
final double isError,
|
||||
final int... keys ) {
|
||||
final RecalDatum existingDatum = table.get(keys);
|
||||
|
||||
if ( existingDatum == null ) {
|
||||
|
|
|
|||
|
|
@ -56,6 +56,12 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions
|
||||
public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions
|
||||
|
||||
/**
|
||||
* The default quality score for an insertion or deletion, if
|
||||
* none are provided for this read.
|
||||
*/
|
||||
public static final byte DEFAULT_INSERTION_DELETION_QUAL = (byte)45;
|
||||
|
||||
// the SAMRecord data we're caching
|
||||
private String mReadString = null;
|
||||
private GATKSAMReadGroupRecord mReadGroup = null;
|
||||
|
|
@ -239,7 +245,7 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
byte [] quals = getExistingBaseInsertionQualities();
|
||||
if( quals == null ) {
|
||||
quals = new byte[getBaseQualities().length];
|
||||
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||
Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
||||
}
|
||||
return quals;
|
||||
|
|
@ -255,7 +261,7 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
byte[] quals = getExistingBaseDeletionQualities();
|
||||
if( quals == null ) {
|
||||
quals = new byte[getBaseQualities().length];
|
||||
Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||
Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
||||
}
|
||||
return quals;
|
||||
|
|
|
|||
Loading…
Reference in New Issue