BQSR optimization: String manipulation is extremely expensive in Java (accounts for 8% of BQSR runtime). Instead use byte[] and StringBuilder when possible.

This commit is contained in:
Eric Banks 2012-06-08 10:42:42 -04:00
parent 2bd48a7351
commit d463ab2cbf
3 changed files with 32 additions and 30 deletions

View File

@ -126,8 +126,8 @@ public class ContextCovariate implements StandardCovariate {
private BitSet contextWith(byte[] bases, int offset, int contextSize) {
BitSet result = null;
if (offset - contextSize + 1 >= 0) {
String context = new String(Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1));
if (!context.contains("N"))
final byte[] context = Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1);
if (!BaseUtils.containsBase(context, BaseUtils.N))
result = BitSetUtils.bitSetFrom(context);
}
return result;

View File

@ -101,6 +101,17 @@ public class BaseUtils {
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
}
/**
 * Scans an array of bases for a particular base value.
 *
 * @param bases the array of bases to search through
 * @param base  the base value to look for
 * @return true iff the bases array contains at least one instance of base
 */
static public boolean containsBase(final byte[] bases, final byte base) {
    // walk forward until we either hit a matching base or run off the end of the array
    int idx = 0;
    while ( idx < bases.length && bases[idx] != base )
        idx++;
    // stopping before the end means a match was found at idx
    return idx < bases.length;
}
/**
* Converts an IUPAC nucleotide code to a pair of bases
*

View File

@ -130,32 +130,32 @@ public class BitSetUtils {
if (number < 0)
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
final int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
String dna = "";
StringBuilder dna = new StringBuilder();
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
byte base = (byte) (number % 4);
switch (base) {
case 0:
dna = "A" + dna;
dna.append('A');
break;
case 1:
dna = "C" + dna;
dna.append('C');
break;
case 2:
dna = "G" + dna;
dna.append('G');
break;
case 3:
dna = "T" + dna;
dna.append('T');
break;
}
number /= 4;
}
for (int j = dna.length(); j < length; j++)
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above)
return dna;
return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along
}
/**
@ -178,27 +178,18 @@ public class BitSetUtils {
* @return the bitset representing the dna sequence
*/
public static BitSet bitSetFrom(String dna) {
if (dna.length() > MAX_DNA_CONTEXT)
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length()));
return bitSetFrom(dna.getBytes());
}
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string
for (int i = 0; i < dna.length(); i++) {
public static BitSet bitSetFrom(final byte[] dna) {
if (dna.length > MAX_DNA_CONTEXT)
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length));
final long preContext = combinationsFor(dna.length - 1); // the sum of all combinations that preceded the length of the dna string
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
for (final byte base : dna) {
baseTen *= 4;
switch (dna.charAt(i)) {
case 'A':
baseTen += 0;
break;
case 'C':
baseTen += 1;
break;
case 'G':
baseTen += 2;
break;
case 'T':
baseTen += 3;
break;
}
baseTen += BaseUtils.simpleBaseToBaseIndex(base);
}
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
}