From 0fb9179f7652d787c5a627bf4b3a1633b639aa96 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 7 Jun 2012 19:41:03 -0400 Subject: [PATCH 03/11] BQSR optimization: don't clone the original quals for each read, we can just overwrite the original array --- .../utils/recalibration/BaseRecalibration.java | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index d85fb03cd..5b5f99b3f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -88,19 +88,18 @@ public class BaseRecalibration { public void recalibrateRead(final GATKSAMRecord read) { final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings - final byte[] originalQuals = read.getBaseQualities(errorModel); - final byte[] recalQuals = originalQuals.clone(); + final byte[] quals = read.getBaseQualities(errorModel); for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read - byte qualityScore = originalQuals[offset]; + final byte originalQualityScore = quals[offset]; - if (qualityScore >= QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) + if (originalQualityScore >= QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model - qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + final byte recalibratedQualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + quals[offset] = recalibratedQualityScore; } - recalQuals[offset] = qualityScore; } - read.setBaseQualities(recalQuals, errorModel); + read.setBaseQualities(quals, errorModel); } } From 31c3a6be480376d6c71af75995879f887739fe80 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 7 Jun 2012 20:04:10 -0400 Subject: [PATCH 04/11] BQSR optimization: getRequiredCovariates() and getOptionalCovariates() were creating a new List every time they were being called, and unfortunately getRequiredCovariates().size() is used as the stop condition in for-loops throughout the code. Just maintaining the original list of covariates results in a 15% reduction in runtime for BQSR. --- .../gatk/walkers/bqsr/BQSRKeyManager.java | 80 ++++++++++--------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java index 1cb02f1c1..bba87f2ad 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -25,8 +25,11 @@ import java.util.*; * @since 3/6/12 */ public class BQSRKeyManager { - private final List requiredCovariates; - private final List optionalCovariates; + + private final List requiredCovariates; + private final List optionalCovariates; + private final List requiredCovariatesInfo; + private final List optionalCovariatesInfo; private final Map covariateNameToIDMap; private int nRequiredBits; // Number of bits used to represent the required covariates @@ -44,15 +47,17 @@ public class BQSRKeyManager { * @param optionalCovariates the ordered list of optional covariates */ public BQSRKeyManager(List requiredCovariates, List optionalCovariates) { - this.requiredCovariates = new ArrayList(requiredCovariates.size()); // initialize the required covariates list - this.optionalCovariates = new ArrayList(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay) - this.covariateNameToIDMap = new HashMap(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates) + this.requiredCovariates = new ArrayList(requiredCovariates); + this.optionalCovariates = new ArrayList(optionalCovariates); + requiredCovariatesInfo = new ArrayList(requiredCovariates.size()); // initialize the required covariates list + optionalCovariatesInfo = new ArrayList(optionalCovariates.size()); // initialize the optional covariates list (size may be 0, it's okay) + covariateNameToIDMap = new HashMap(optionalCovariates.size()*2); // the map from covariate name to covariate id (when reading GATK Reports, we get the IDs as names of covariates) nRequiredBits = 0; for (Covariate required : requiredCovariates) { // create a list of required covariates with the extra information for key management int nBits = required.numberOfBits(); // number of bits used by this covariate BitSet mask = genericMask(nRequiredBits, nBits); // create a mask for this covariate - this.requiredCovariates.add(new RequiredCovariateInfo(nRequiredBits, mask, required)); // Create an object for this required covariate + requiredCovariatesInfo.add(new RequiredCovariateInfo(nRequiredBits, mask, required)); // Create an object for this required covariate nRequiredBits += nBits; } @@ -62,9 +67,9 @@ public class BQSRKeyManager { int nBits = optional.numberOfBits(); // number of bits used by this covariate nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate - this.optionalCovariates.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object + optionalCovariatesInfo.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport - this.covariateNameToIDMap.put(covariateName, id); + covariateNameToIDMap.put(covariateName, id); id++; } @@ -100,10 +105,10 @@ public class BQSRKeyManager { int covariateIndex = 0; BitSet requiredKey = new BitSet(nRequiredBits); // This will be a bitset holding all the required keys, to replicate later on - for (RequiredCovariateInfo infoRequired : requiredCovariates) + for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo) addBitSetToKeyAtLocation(requiredKey, allKeys[covariateIndex++], infoRequired.bitsBefore); // Add all the required covariates to the key set - for (OptionalCovariateInfo infoOptional : optionalCovariates) { + for (OptionalCovariateInfo infoOptional : optionalCovariatesInfo) { BitSet covariateKey = allKeys[covariateIndex++]; // get the bitset from all keys if (covariateKey == null) continue; // do not add nulls to the final set of keys. @@ -116,7 +121,7 @@ public class BQSRKeyManager { allBitSets.add(optionalKey); // add this key to the list of keys } - if (optionalCovariates.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key) + if (optionalCovariatesInfo.size() == 0) { // special case when we have no optional covariates, add the event type to the required key (our only key) addBitSetToKeyAtLocation(requiredKey, eventBitSet, eventTypeBitIndex); // Add the event type allBitSets.add(requiredKey); // add this key to the list of keys } @@ -140,16 +145,16 @@ public class BQSRKeyManager { BitSet bitSetKey = new BitSet(totalNumberOfBits); int requiredCovariate = 0; - for (RequiredCovariateInfo infoRequired : requiredCovariates) { + for (RequiredCovariateInfo infoRequired : requiredCovariatesInfo) { BitSet covariateBitSet = infoRequired.covariate.bitSetFromKey(key[requiredCovariate++]); // create a bitset from the object key provided using the required covariate's interface addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, infoRequired.bitsBefore); // add it to the bitset key } - if (optionalCovariates.size() > 0) { - int optionalCovariate = requiredCovariates.size(); // the optional covariate index in the key array + if (optionalCovariatesInfo.size() > 0) { + int optionalCovariate = requiredCovariatesInfo.size(); // the optional covariate index in the key array int covariateIDIndex = optionalCovariate + 1; // the optional covariate ID index is right after the optional covariate's int covariateID = parseCovariateID(key[covariateIDIndex]); // when reading the GATK Report the ID may come in a String instead of an index - OptionalCovariateInfo infoOptional = optionalCovariates.get(covariateID); // so we can get the optional covariate information + OptionalCovariateInfo infoOptional = optionalCovariatesInfo.get(covariateID); // so we can get the optional covariate information BitSet covariateBitSet = infoOptional.covariate.bitSetFromKey(key[optionalCovariate]); // convert the optional covariate key into a bitset using the covariate's interface addBitSetToKeyAtLocation(bitSetKey, covariateBitSet, nRequiredBits); // add the optional covariate right after the required covariates @@ -185,16 +190,16 @@ public class BQSRKeyManager { */ public List keySetFrom(BitSet key) { List objectKeys = new ArrayList(); - for (RequiredCovariateInfo info : requiredCovariates) { + for (RequiredCovariateInfo info : requiredCovariatesInfo) { BitSet covariateBitSet = extractBitSetFromKey(key, info.mask, info.bitsBefore); // get the covariate's bitset objectKeys.add(info.covariate.keyFromBitSet(covariateBitSet)); // convert the bitset to object using covariate's interface } - if (optionalCovariates.size() > 0) { + if (optionalCovariatesInfo.size() > 0) { BitSet covBitSet = extractBitSetFromKey(key, optionalCovariateMask, nRequiredBits); // mask out the covariate bit set BitSet idbs = extractBitSetFromKey(key, optionalCovariateIDMask, nRequiredBits + nOptionalBits); // mask out the covariate order (to identify which covariate this is) short id = BitSetUtils.shortFrom(idbs); // covert the id bitset into a short - Covariate covariate = optionalCovariates.get(id).covariate; // get the corresponding optional covariate object + Covariate covariate = optionalCovariatesInfo.get(id).covariate; // get the corresponding optional covariate object objectKeys.add(covariate.keyFromBitSet(covBitSet)); // add the optional covariate to the key set objectKeys.add(covariate.getClass().getSimpleName().split("Covariate")[0]); // add the covariate name using the id } @@ -203,18 +208,17 @@ public class BQSRKeyManager { return objectKeys; } + /** + * Translates a masked bitset into a bitset starting at 0 + * + * @return a list of the optional covariates + */ public List getRequiredCovariates() { - ArrayList list = new ArrayList(requiredCovariates.size()); - for (RequiredCovariateInfo info : requiredCovariates) - list.add(info.covariate); - return list; + return requiredCovariates; } public List getOptionalCovariates() { - ArrayList list = new ArrayList(optionalCovariates.size()); - for (OptionalCovariateInfo info : optionalCovariates) - list.add(info.covariate); - return list; + return optionalCovariates; } /** @@ -287,24 +291,24 @@ public class BQSRKeyManager { if (this == other) return true; - if (requiredCovariates.size() != other.requiredCovariates.size() || optionalCovariates.size() != other.optionalCovariates.size()) + if (requiredCovariatesInfo.size() != other.requiredCovariatesInfo.size() || + optionalCovariatesInfo.size() != other.optionalCovariatesInfo.size()) return false; - Iterator otherRequiredIterator = other.requiredCovariates.iterator(); - for (RequiredCovariateInfo thisInfo: requiredCovariates) { - RequiredCovariateInfo otherInfo = otherRequiredIterator.next(); - - String thisName = thisInfo.covariate.getClass().getSimpleName(); - String otherName = otherInfo.covariate.getClass().getSimpleName(); + for (int i = 0; i < requiredCovariates.size(); i++) { + Covariate myRequiredCovariate = requiredCovariates.get(i); + Covariate otherRequiredCovariate = other.requiredCovariates.get(i); + String thisName = myRequiredCovariate.getClass().getSimpleName(); + String otherName = otherRequiredCovariate.getClass().getSimpleName(); if (!thisName.equals(otherName)) return false; } - Iterator otherOptionalIterator = other.optionalCovariates.iterator(); - for (OptionalCovariateInfo thisInfo : optionalCovariates) { - OptionalCovariateInfo otherInfo = otherOptionalIterator.next(); - String thisName = thisInfo.covariate.getClass().getSimpleName(); - String otherName = otherInfo.covariate.getClass().getSimpleName(); + for (int i = 0; i < optionalCovariates.size(); i++) { + Covariate myOptionalCovariate = optionalCovariates.get(i); + Covariate otherOptionalCovariate = other.optionalCovariates.get(i); + String thisName = myOptionalCovariate.getClass().getSimpleName(); + String otherName = otherOptionalCovariate.getClass().getSimpleName(); if (!thisName.equals(otherName)) return false; } From 2bd48a73519809b4ea79f47f29d1208cb37ba4b1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 7 Jun 2012 23:12:56 -0400 Subject: [PATCH 05/11] Bad comments made it into the previous commit --- .../sting/gatk/walkers/bqsr/BQSRKeyManager.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java index bba87f2ad..cdd186e9f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -208,11 +208,6 @@ public class BQSRKeyManager { return objectKeys; } - /** - * Translates a masked bitset into a bitset starting at 0 - * - * @return a list of the optional covariates - */ public List getRequiredCovariates() { return requiredCovariates; } From d463ab2cbf4d08302c002246d340dd6017d10f76 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 8 Jun 2012 10:42:42 -0400 Subject: [PATCH 06/11] BQSR optimization: String manipulation is extremely expensive in Java (accounts for 8% of BQSR runtime). Instead use byte[] and StringBuilder when possible. --- .../gatk/walkers/bqsr/ContextCovariate.java | 4 +- .../broadinstitute/sting/utils/BaseUtils.java | 11 +++++ .../sting/utils/BitSetUtils.java | 47 ++++++++----------- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index e2d2a3d1f..e2663359b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -126,8 +126,8 @@ public class ContextCovariate implements StandardCovariate { private BitSet contextWith(byte[] bases, int offset, int contextSize) { BitSet result = null; if (offset - contextSize + 1 >= 0) { - String context = new String(Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1)); - if (!context.contains("N")) + final byte[] context = Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1); + if (!BaseUtils.containsBase(context, BaseUtils.N)) result = BitSetUtils.bitSetFrom(context); } return result; diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 61812629c..3871ca987 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -101,6 +101,17 @@ public class BaseUtils { return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); } + /** + * @return true iff the bases array contains at least one instance of base + */ + static public boolean containsBase(final byte[] bases, final byte base) { + for ( final byte b : bases ) { + if ( b == base ) + return true; + } + return false; + } + /** * Converts a IUPAC nucleotide code to a pair of bases * diff --git a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java index 6d3493211..98c901bcd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java @@ -130,32 +130,32 @@ public class BitSetUtils { if (number < 0) throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?"); - int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls) - number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation + final int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls) + number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation - String dna = ""; + StringBuilder dna = new StringBuilder(); while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) byte base = (byte) (number % 4); switch (base) { case 0: - dna = "A" + dna; + dna.append('A'); break; case 1: - dna = "C" + dna; + dna.append('C'); break; case 2: - dna = "G" + dna; + dna.append('G'); break; case 3: - dna = "T" + dna; + dna.append('T'); break; } number /= 4; } for (int j = dna.length(); j < length; j++) - dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above) + dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above) - return dna; + return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along } /** @@ -178,27 +178,18 @@ public class BitSetUtils { * @return the bitset representing the dna sequence */ public static BitSet bitSetFrom(String dna) { - if (dna.length() > MAX_DNA_CONTEXT) - throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length())); + return bitSetFrom(dna.getBytes()); + } - long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set - long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string - for (int i = 0; i < dna.length(); i++) { + public static BitSet bitSetFrom(final byte[] dna) { + if (dna.length > MAX_DNA_CONTEXT) + throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length)); + + final long preContext = combinationsFor(dna.length - 1); // the sum of all combinations that preceded the length of the dna string + long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set + for (final byte base : dna) { baseTen *= 4; - switch (dna.charAt(i)) { - case 'A': - baseTen += 0; - break; - case 'C': - baseTen += 1; - break; - case 'G': - baseTen += 2; - break; - case 'T': - baseTen += 3; - break; - } + baseTen += BaseUtils.simpleBaseToBaseIndex(base); } return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. } From 0a37e199987dcb2d89b9f548f7165022e3602568 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 8 Jun 2012 11:51:28 -0400 Subject: [PATCH 07/11] Bug fix in VQSR so that the VCF index will be created for the recalFile. --- .../walkers/variantrecalibration/VariantRecalibrator.java | 5 +---- .../VariantRecalibrationWalkersIntegrationTest.java | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 813f20e57..3666cfd12 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -139,7 +139,6 @@ public class VariantRecalibrator extends RodWalker Date: Fri, 8 Jun 2012 12:07:58 -0400 Subject: [PATCH 08/11] Minor optimizations --- .../broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java | 2 +- public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java index cdd186e9f..c1449c5b3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -98,7 +98,7 @@ public class BQSRKeyManager { * @return one key in bitset representation per covariate */ public List bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) { - List allBitSets = new LinkedList(); // Generate one key per optional covariate + List allBitSets = new ArrayList(); // Generate one key per optional covariate BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits diff --git a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java index 98c901bcd..41889e2f5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java @@ -82,7 +82,7 @@ public class BitSetUtils { * @return a bitset representation of the integer */ public static BitSet bitSetFrom(long number, int nBits) { - BitSet bitSet = new BitSet(); + BitSet bitSet = new BitSet(nBits); boolean isNegative = number < 0; int bitIndex = 0; while (number != 0) { From 92280b4068e80bf17b6d4a615e6604a6ad6e8be1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 8 Jun 2012 13:54:37 -0400 Subject: [PATCH 09/11] BQSR optimization: cache the BitSetUtils.bitSetFrom() calls since they are called over and over again with the same values. Another 10% reduction in runtime. --- .../gatk/walkers/bqsr/BQSRKeyManager.java | 21 ++++++++++++++----- .../sting/utils/BitSetUtils.java | 11 +++++++++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java index c1449c5b3..728d1a8d4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -66,7 +66,7 @@ public class BQSRKeyManager { for (Covariate optional : optionalCovariates) { int nBits = optional.numberOfBits(); // number of bits used by this covariate nOptionalBits = Math.max(nOptionalBits, nBits); // optional covariates are represented by the number of bits needed by biggest covariate - BitSet optionalID = BitSetUtils.bitSetFrom(id); // calculate the optional covariate ID for this covariate + BitSet optionalID = bitSetFromId(id); // calculate the optional covariate ID for this covariate optionalCovariatesInfo.add(new OptionalCovariateInfo(optionalID, optional)); // optional covariates have standardized mask and number of bits, so no need to store in the RequiredCovariateInfo object String covariateName = optional.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport covariateNameToIDMap.put(covariateName, id); @@ -100,7 +100,7 @@ public class BQSRKeyManager { public List bitSetsFromAllKeys(BitSet[] allKeys, EventType eventType) { List allBitSets = new ArrayList(); // Generate one key per optional covariate - BitSet eventBitSet = BitSetUtils.bitSetFrom(eventType.index); // create a bitset with the event type + BitSet eventBitSet = bitSetFromEvent(eventType); // create a bitset with the event type int eventTypeBitIndex = nRequiredBits + nOptionalBits + nOptionalIDBits; // Location in the bit set to add the event type bits int covariateIndex = 0; @@ -257,9 +257,20 @@ public class BQSRKeyManager { eventKey.set(i - firstBitIndex); return EventType.eventFrom(BitSetUtils.shortFrom(eventKey)); } - - private BitSet bitSetFromEvent(EventType eventType) { - return BitSetUtils.bitSetFrom(eventType.index); + + // cache the BitSet representing an event since it's otherwise created a massive amount of times + private static final Map eventTypeCache = new HashMap(EventType.values().length); + static { + for (final EventType eventType : EventType.values()) + eventTypeCache.put(eventType, BitSetUtils.bitSetFrom(eventType.index)); + } + + private BitSet bitSetFromEvent(final EventType eventType) { + return eventTypeCache.get(eventType); + } + + private BitSet bitSetFromId(final short id) { + return BitSetUtils.bitSetFrom(id); } private int bitsInEventType() { diff --git a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java index 41889e2f5..aa7cd4b37 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java @@ -5,6 +5,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.ByteArrayOutputStream; import java.io.ObjectOutputStream; import java.util.BitSet; +import java.util.HashMap; +import java.util.Map; /** * Utilities for bitset conversion @@ -71,8 +73,15 @@ public class BitSetUtils { * @return a bitset representation of the short */ public static BitSet bitSetFrom(short number) { - return bitSetFrom(number, NBITS_SHORT_REPRESENTATION); + BitSet result = shortCache.get(number); + if (result == null) { + result = bitSetFrom(number, NBITS_SHORT_REPRESENTATION); + shortCache.put(number, result); + } + return result; } + // use a static cache for shorts (but not for longs, because there could be a lot of entries) + private static final Map shortCache = new HashMap(2 * Short.MAX_VALUE); /** * Creates a BitSet representation of an arbitrary integer (number of bits capped at 64 -- long precision) From 4aad7e23efed0dd113b7f407811dbe6bd0ee03df Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 14 May 2012 13:39:42 -0400 Subject: [PATCH 11/11] New ReduceReads v2 with unclipped variant regions and soft-clipped bases * Re-wrote the sliding window approach to allow the variant region not to clip the reads that overlap it. * Updated consensus to include only reads that were not passed on by the variant region, header counts are updated on the fly to avoid recompute * Added soft clipped bases to ReduceReads analysis by unclipping high quality soft-clips then re-clipping after reduce reads * Updated all integration tests --- .../sting/utils/clipping/ReadClipper.java | 74 ++++++++++++++++--- .../sting/utils/sam/GATKSAMRecord.java | 4 +- ...eadUnclippedStartWithNoTiesComparator.java | 48 ++++++++++++ 3 files changed, 114 insertions(+), 12 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 9e7ee9dac..d934bb972 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.clipping; import com.google.java.contract.Requires; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -135,10 +136,9 @@ public class ReadClipper { ops.clear(); if ( clippedRead.isEmpty() ) return GATKSAMRecord.emptyRead(clippedRead); -// return new GATKSAMRecord( clippedRead.getHeader() ); return clippedRead; } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // this should never happen + throw new RuntimeException(e); // this should never happen } } } @@ -188,7 +188,6 @@ public class ReadClipper { private GATKSAMRecord hardClipByReadCoordinates(int start, int stop) { if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) return GATKSAMRecord.emptyRead(read); -// return new GATKSAMRecord(read.getHeader()); this.addOp(new ClippingOp(start, stop)); return clipRead(ClippingRepresentation.HARDCLIP_BASES); @@ -213,14 +212,12 @@ public class ReadClipper { private GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { if (read.isEmpty() || left == right) return GATKSAMRecord.emptyRead(read); -// return new GATKSAMRecord(read.getHeader()); GATKSAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1); // after clipping one tail, it is possible that the consequent hard clipping of adjacent deletions // make the left cut index no longer part of the read. In that case, clip the read entirely. if (left > leftTailRead.getAlignmentEnd()) return GATKSAMRecord.emptyRead(read); -// return new GATKSAMRecord(read.getHeader()); ReadClipper clipper = new ReadClipper(leftTailRead); return clipper.hardClipByReferenceCoordinatesLeftTail(left); @@ -402,17 +399,77 @@ public class ReadClipper { /** * Turns soft clipped bases into matches - * * @return a new read with every soft clip turned into a match */ private GATKSAMRecord revertSoftClippedBases() { - this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates + if (read.isEmpty()) + return read; + + this.addOp(new ClippingOp(0, 0)); return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); } + + /** + * Reverts ALL soft-clipped bases + * + * @param read the read + * @return the read with all soft-clipped bases turned into matches + */ public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { return (new ReadClipper(read)).revertSoftClippedBases(); } + /** + * Reverts only soft clipped bases with quality score greater than or equal to minQual + * + * Note: Will write a temporary field with the number of soft clips that were undone on each side (left: 'SL', right: 'SR') + * + * @param read the read + * @param minQual the mininum base quality score to revert the base (inclusive) + * @return the read with high quality soft clips reverted + */ + public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read, byte minQual) { + int nLeadingSoftClips = read.getAlignmentStart() - read.getSoftStart(); + if (read.isEmpty() || nLeadingSoftClips > read.getReadLength()) + return GATKSAMRecord.emptyRead(read); + + byte [] quals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); + int left = -1; + + if (nLeadingSoftClips > 0) { + for (int i = nLeadingSoftClips - 1; i >= 0; i--) { + if (quals[i] >= minQual) + left = i; + else + break; + } + } + + int right = -1; + int nTailingSoftClips = read.getSoftEnd() - read.getAlignmentEnd(); + if (nTailingSoftClips > 0) { + for (int i = read.getReadLength() - nTailingSoftClips; i < read.getReadLength() ; i++) { + if (quals[i] >= minQual) + right = i; + else + break; + } + } + + GATKSAMRecord clippedRead = read; + if (right >= 0) { + if (right + 1 < clippedRead.getReadLength()) + clippedRead = hardClipByReadCoordinates(clippedRead, right+1, clippedRead.getReadLength()-1); // first we hard clip the low quality soft clips on the left tail + clippedRead.setTemporaryAttribute("SR", nTailingSoftClips - (read.getReadLength() - right - 1)); // keep track of how may bases to 're-softclip' after processing + } + if (left >= 0) { + if (left - 1 > 0) + clippedRead = hardClipByReadCoordinates(clippedRead, 0, left-1); // then we hard clip the low quality soft clips on the right tail + clippedRead.setTemporaryAttribute("SL", nLeadingSoftClips - left); // keep track of how may bases to 're-softclip' after processing + } + return revertSoftClippedBases(clippedRead); // now that we have only good bases in the soft clips, we can revert them all + } + /** * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail * and hardClipByReferenceCoordinatesRightTail. Should not be used directly. @@ -446,9 +503,6 @@ public class ReadClipper { stop = read.getReadLength() - 1; } -// if ((start == 0 && stop == read.getReadLength() - 1)) -// return GATKSAMRecord.emptyRead(read); - if (start < 0 || stop > read.getReadLength() - 1) throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 7d3477a7b..c6d14a8bc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -428,8 +428,8 @@ public class GATKSAMRecord extends BAMRecord { * Use this method if you want to create a new empty GATKSAMRecord based on * another GATKSAMRecord * - * @param read - * @return + * @param read a read to copy the header from + * @return a read with no bases but safe for the GATK */ public static GATKSAMRecord emptyRead(GATKSAMRecord read) { GATKSAMRecord emptyRead = new GATKSAMRecord(read.getHeader(), diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java new file mode 100644 index 000000000..f961857e1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUnclippedStartWithNoTiesComparator.java @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.utils.sam; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMRecord; + +import java.util.Comparator; + +public class ReadUnclippedStartWithNoTiesComparator implements Comparator { + @Requires("c1 >= 0 && c2 >= 0") + @Ensures("result == 0 || result == 1 || result == -1") + private int compareContigs(int c1, int c2) { + if (c1 == c2) + return 0; + else if (c1 > c2) + return 1; + return -1; + } + + @Requires("r1 != null && r2 != null") + @Ensures("result == 0 || result == 1 || result == -1") + public int compare(SAMRecord r1, SAMRecord r2) { + int result; + + if (r1 == r2) + result = 0; + + else if (r1.getReadUnmappedFlag()) + result = 1; + else if (r2.getReadUnmappedFlag()) + result = -1; + else { + final int cmpContig = compareContigs(r1.getReferenceIndex(), r2.getReferenceIndex()); + + if (cmpContig != 0) + result = cmpContig; + + else { + if (r1.getUnclippedStart() < r2.getUnclippedStart()) + result = -1; + else + result = 1; + } + } + + return result; + } +} \ No newline at end of file