BQSR optimization: only compute BAQ when there's at least one error to delocalize

-- Saves something like 2/3 of the compute cost of BQSR
This commit is contained in:
Mark DePristo 2012-12-19 15:39:58 -05:00
parent eedc01ff22
commit f849910c4e
2 changed files with 49 additions and 8 deletions

View File

@ -135,6 +135,7 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation
private final static byte NO_BAQ_UNCERTAINTY = (byte)'@';
/**
* Parse the -cov arguments and create a list of covariates to be used here
@ -226,19 +227,23 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
return 0L; // skip this read completely
}
final byte[] baqArray = calculateBAQArray(read);
final int[] isSNP = calculateIsSNP(read, ref, originalRead);
final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
final int nErrors = nEvents(isSNP, isInsertion, isDeletion);
// note for efficiency regions we don't compute the BAQ array unless we actually have
// some error to marginalize over. For ILMN data ~85% of reads have no error
final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read);
if( baqArray != null ) { // some reads just can't be BAQ'ed
final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates);
final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases
final int[] isSNP = calculateIsSNP(read, ref, originalRead);
final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION);
final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION);
final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray);
final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray);
final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray);
// aggregrate all of the info into our info object, and update the data
// aggregate all of the info into our info object, and update the data
final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors);
recalibrationEngine.updateDataForRead(info);
return 1L;
@ -247,6 +252,22 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
}
}
/**
* Compute the number of mutational events across all hasEvent vectors
*
* Simply the sum of entries in hasEvents
*
* @param hasEvents a vector a vectors of 0 (no event) and 1 (has event)
* @return the total number of events across all hasEvent arrays
*/
private int nEvents(final int[]... hasEvents) {
int n = 0;
for ( final int[] hasEvent : hasEvents ) {
n += MathUtils.sum(hasEvent);
}
return n;
}
protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) {
final byte[] bases = read.getReadBases();
final boolean[] skip = new boolean[bases.length];
@ -374,7 +395,6 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
throw new ReviewedStingException("Array length mismatch detected. Malformed read?");
}
final byte NO_BAQ_UNCERTAINTY = (byte)'@';
final int BLOCK_START_UNSET = -1;
final double[] fractionalErrors = new double[baqArray.length];
@ -418,8 +438,24 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
}
}
/**
* Create a BAQ style array that indicates no alignment uncertainty
* @param read the read for which we want a BAQ array
* @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values
* // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above
*/
private byte[] flatBAQArray(final GATKSAMRecord read) {
final byte[] baq = new byte[read.getReadLength()];
Arrays.fill(baq, NO_BAQ_UNCERTAINTY);
return baq;
}
/**
* Compute an actual BAQ array for read, based on its quals and the reference sequence
* @param read the read to BAQ
* @return a non-null BAQ tag array for read
*/
private byte[] calculateBAQArray( final GATKSAMRecord read ) {
// todo -- it would be good to directly use the BAQ qualities rather than encoding and decoding the result and using the special @ value
baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG);
return BAQ.getBAQTag(read);
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -84,7 +85,7 @@ public final class ReadRecalibrationInfo {
* @return a valid quality score for event at offset
*/
@Requires("validOffset(offset)")
@Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE")
@Ensures("validQual(result)")
public byte getQual(final EventType eventType, final int offset) {
switch ( eventType ) {
case BASE_SUBSTITUTION: return baseQuals[offset];
@ -154,4 +155,8 @@ public final class ReadRecalibrationInfo {
private boolean validOffset(final int offset) {
return offset >= 0 && offset < baseQuals.length;
}
private boolean validQual(final byte result) {
return result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE;
}
}