BQSR optimization: String manipulation is extremely expensive in Java (accounts for 8% of BQSR runtime). Instead use byte[] and StringBuilder when possible.

2012-06-08 10:42:42 -04:00 · 2012-06-08 10:42:42 -04:00 · d463ab2cbf
parent 2bd48a7351
commit d463ab2cbf
3 changed files with 32 additions and 30 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java
@ -126,8 +126,8 @@ public class ContextCovariate implements StandardCovariate {
    private BitSet contextWith(byte[] bases, int offset, int contextSize) {
        BitSet result = null;
        if (offset - contextSize + 1 >= 0) {
-            String context = new String(Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1));
-            if (!context.contains("N"))
+            final byte[] context = Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1);
+            if (!BaseUtils.containsBase(context, BaseUtils.N))
                result = BitSetUtils.bitSetFrom(context);
        }
        return result;
--- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java
@ -101,6 +101,17 @@ public class BaseUtils {
        return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
    }

+    /**
+     * @return true iff at least one element of bases is exactly equal to base (plain byte comparison, so case-sensitive)
+     */
+    static public boolean containsBase(final byte[] bases, final byte base) {
+        for ( final byte b : bases ) {
+            if ( b == base )                 // stop scanning at the first match
+                return true;
+        }
+        return false;
+    }
+
    /**
     * Converts a IUPAC nucleotide code to a pair of bases
     *
--- a/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/BitSetUtils.java
@ -130,32 +130,32 @@ public class BitSetUtils {
        if (number < 0)
            throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");

-        int length = contextLengthFor(number);  // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
-        number -= combinationsFor(length - 1);  // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
+        final int length = contextLengthFor(number);  // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
+        number -= combinationsFor(length - 1);        // subtract the number of combinations of the preceding context from the number to get to the quasi-canonical representation

-        String dna = "";
+        StringBuilder dna = new StringBuilder();
        while (number > 0) {                    // perform a simple base_10 to base_4 conversion (quasi-canonical)
            byte base = (byte) (number % 4);
            switch (base) {
                case 0:
-                    dna = "A" + dna;
+                    dna.append('A');
                    break;
                case 1:
-                    dna = "C" + dna;
+                    dna.append('C');
                    break;
                case 2:
-                    dna = "G" + dna;
+                    dna.append('G');
                    break;
                case 3:
-                    dna = "T" + dna;
+                    dna.append('T');
                    break;
            }
            number /= 4;
        }
        for (int j = dna.length(); j < length; j++)
-            dna = "A" + dna;                    // add leading A's as necessary (due to the "quasi" canonical status, see description above)
+            dna.append('A');                          // add leading A's as necessary (due to the "quasi" canonical status, see description above)

-        return dna;
+        return dna.reverse().toString();              // make sure to reverse the string since we should have been pre-pending all along
    }

    /**
@ -178,27 +178,18 @@ public class BitSetUtils {
     * @return the bitset representing the dna sequence
     */
    public static BitSet bitSetFrom(String dna) {
-        if (dna.length() > MAX_DNA_CONTEXT)
-            throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length()));
+        return bitSetFrom(dna.getBytes());   // delegates to the byte[] overload; getBytes() encodes with the platform default charset
+    }

-        long baseTen = 0;                                       // the number in base_10 that we are going to use to generate the bit set
-        long preContext = combinationsFor(dna.length() - 1);    // the sum of all combinations that preceded the length of the dna string
-        for (int i = 0; i < dna.length(); i++) {
+    public static BitSet bitSetFrom(final byte[] dna) {
+        if (dna.length > MAX_DNA_CONTEXT)   // new String(dna) below: %s on a raw byte[] would print the array's identity hash, not the bases
+            throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, new String(dna), dna.length));
+
+        final long preContext = combinationsFor(dna.length - 1);      // the sum of all combinations that preceded the length of the dna string
+        long baseTen = 0;                                             // the number in base_10 that we are going to use to generate the bit set
+        for (final byte base : dna) {
            baseTen *= 4;
-            switch (dna.charAt(i)) {
-                case 'A':
-                    baseTen += 0;
-                    break;
-                case 'C':
-                    baseTen += 1;
-                    break;
-                case 'G':
-                    baseTen += 2;
-                    break;
-                case 'T':
-                    baseTen += 3;
-                    break;
-            }
+            baseTen += BaseUtils.simpleBaseToBaseIndex(base);   // NOTE(review): the removed switch added nothing for non-ACGT characters; confirm simpleBaseToBaseIndex cannot return a negative index here
        }
        return bitSetFrom(baseTen + preContext);                // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
    }