From f849910c4ed85af12a5a1ac4bce2c9ea3e31c078 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 19 Dec 2012 15:39:58 -0500 Subject: [PATCH] BQSR optimization: only compute BAQ when there's at least one error to delocalize -- Saves something like 2/3 of the compute cost of BQSR --- .../gatk/walkers/bqsr/BaseRecalibrator.java | 50 ++++++++++++++++--- .../walkers/bqsr/ReadRecalibrationInfo.java | 7 ++- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 8a71872c1..dcaaafefd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -135,6 +135,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation + private final static byte NO_BAQ_UNCERTAINTY = (byte)'@'; /** * Parse the -cov arguments and create a list of covariates to be used here @@ -226,19 +227,23 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche return 0L; // skip this read completely } - final byte[] baqArray = calculateBAQArray(read); + final int[] isSNP = calculateIsSNP(read, ref, originalRead); + final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); + final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); + final int nErrors = nEvents(isSNP, isInsertion, isDeletion); + + // note for efficiency regions we don't compute the BAQ array unless we actually have + // some error to marginalize over. For ILMN data ~85% of reads have no error + final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read); if( baqArray != null ) { // some reads just can't be BAQ'ed final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates); final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases - final int[] isSNP = calculateIsSNP(read, ref, originalRead); - final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); - final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray); final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray); final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray); - // aggregrate all of the info into our info object, and update the data + // aggregate all of the info into our info object, and update the data final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors); recalibrationEngine.updateDataForRead(info); return 1L; @@ -247,6 +252,22 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche } } + /** + * Compute the number of mutational events across all hasEvent vectors + * + * Simply the sum of entries in hasEvents + * + * @param hasEvents a vector a vectors of 0 (no event) and 1 (has event) + * @return the total number of events across all hasEvent arrays + */ + private int nEvents(final int[]... hasEvents) { + int n = 0; + for ( final int[] hasEvent : hasEvents ) { + n += MathUtils.sum(hasEvent); + } + return n; + } + protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) { final byte[] bases = read.getReadBases(); final boolean[] skip = new boolean[bases.length]; @@ -374,7 +395,6 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche throw new ReviewedStingException("Array length mismatch detected. Malformed read?"); } - final byte NO_BAQ_UNCERTAINTY = (byte)'@'; final int BLOCK_START_UNSET = -1; final double[] fractionalErrors = new double[baqArray.length]; @@ -418,8 +438,24 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche } } + /** + * Create a BAQ style array that indicates no alignment uncertainty + * @param read the read for which we want a BAQ array + * @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values + * // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above + */ + private byte[] flatBAQArray(final GATKSAMRecord read) { + final byte[] baq = new byte[read.getReadLength()]; + Arrays.fill(baq, NO_BAQ_UNCERTAINTY); + return baq; + } + + /** + * Compute an actual BAQ array for read, based on its quals and the reference sequence + * @param read the read to BAQ + * @return a non-null BAQ tag array for read + */ private byte[] calculateBAQArray( final GATKSAMRecord read ) { - // todo -- it would be good to directly use the BAQ qualities rather than encoding and decoding the result and using the special @ value baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG); return BAQ.getBAQTag(read); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java index f8d3c4281..121e3449b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -84,7 +85,7 @@ public final class ReadRecalibrationInfo { * @return a valid quality score for event at offset */ @Requires("validOffset(offset)") - @Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE") + @Ensures("validQual(result)") public byte getQual(final EventType eventType, final int offset) { switch ( eventType ) { case BASE_SUBSTITUTION: return baseQuals[offset]; @@ -154,4 +155,8 @@ public final class ReadRecalibrationInfo { private boolean validOffset(final int offset) { return offset >= 0 && offset < baseQuals.length; } + + private boolean validQual(final byte result) { + return result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE; + } }