BQSR optimization: String manipulation is extremely expensive in Java (accounts for 8% of BQSR runtime). Instead use byte[] and StringBuilder when possible.

This commit is contained in:
Eric Banks 2012-06-08 10:42:42 -04:00
parent 2bd48a7351
commit d463ab2cbf
3 changed files with 32 additions and 30 deletions

View File

@ -126,8 +126,8 @@ public class ContextCovariate implements StandardCovariate {
private BitSet contextWith(byte[] bases, int offset, int contextSize) { private BitSet contextWith(byte[] bases, int offset, int contextSize) {
BitSet result = null; BitSet result = null;
if (offset - contextSize + 1 >= 0) { if (offset - contextSize + 1 >= 0) {
String context = new String(Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1)); final byte[] context = Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1);
if (!context.contains("N")) if (!BaseUtils.containsBase(context, BaseUtils.N))
result = BitSetUtils.bitSetFrom(context); result = BitSetUtils.bitSetFrom(context);
} }
return result; return result;

View File

@ -101,6 +101,17 @@ public class BaseUtils {
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
} }
/**
 * Reports whether a given base occurs anywhere in an array of bases.
 *
 * @param bases the array of bases to scan
 * @param base  the base value to look for
 * @return true iff the bases array contains at least one instance of base
 */
static public boolean containsBase(final byte[] bases, final byte base) {
    // walk the array by index and stop at the first match
    for ( int i = 0; i < bases.length; i++ ) {
        if ( bases[i] == base )
            return true;
    }
    // scanned every element without finding the base
    return false;
}
/** /**
* Converts a IUPAC nucleotide code to a pair of bases * Converts a IUPAC nucleotide code to a pair of bases
* *

View File

@ -130,32 +130,32 @@ public class BitSetUtils {
if (number < 0) if (number < 0)
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?"); throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls) final int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
number -= combinationsFor(length - 1); // subtract the number of combinations of the preceding context from the number to get to the quasi-canonical representation number -= combinationsFor(length - 1); // subtract the number of combinations of the preceding context from the number to get to the quasi-canonical representation
String dna = ""; StringBuilder dna = new StringBuilder();
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
byte base = (byte) (number % 4); byte base = (byte) (number % 4);
switch (base) { switch (base) {
case 0: case 0:
dna = "A" + dna; dna.append('A');
break; break;
case 1: case 1:
dna = "C" + dna; dna.append('C');
break; break;
case 2: case 2:
dna = "G" + dna; dna.append('G');
break; break;
case 3: case 3:
dna = "T" + dna; dna.append('T');
break; break;
} }
number /= 4; number /= 4;
} }
for (int j = dna.length(); j < length; j++) for (int j = dna.length(); j < length; j++)
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above) dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above)
return dna; return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along
} }
/** /**
@ -178,27 +178,18 @@ public class BitSetUtils {
* @return the bitset representing the dna sequence * @return the bitset representing the dna sequence
*/ */
public static BitSet bitSetFrom(String dna) { public static BitSet bitSetFrom(String dna) {
if (dna.length() > MAX_DNA_CONTEXT) return bitSetFrom(dna.getBytes());
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length()));
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string
for (int i = 0; i < dna.length(); i++) {
baseTen *= 4;
switch (dna.charAt(i)) {
case 'A':
baseTen += 0;
break;
case 'C':
baseTen += 1;
break;
case 'G':
baseTen += 2;
break;
case 'T':
baseTen += 3;
break;
} }
public static BitSet bitSetFrom(final byte[] dna) {
if (dna.length > MAX_DNA_CONTEXT)
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length));
final long preContext = combinationsFor(dna.length - 1); // the sum of all combinations that preceded the length of the dna string
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
for (final byte base : dna) {
baseTen *= 4;
baseTen += BaseUtils.simpleBaseToBaseIndex(base);
} }
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
} }