BQSR optimization: String manipulation is extremely expensive in Java (accounts for 8% of BQSR runtime). Instead use byte[] and StringBuilder when possible.
This commit is contained in:
parent
2bd48a7351
commit
d463ab2cbf
|
|
@ -126,8 +126,8 @@ public class ContextCovariate implements StandardCovariate {
|
|||
private BitSet contextWith(byte[] bases, int offset, int contextSize) {
|
||||
BitSet result = null;
|
||||
if (offset - contextSize + 1 >= 0) {
|
||||
String context = new String(Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1));
|
||||
if (!context.contains("N"))
|
||||
final byte[] context = Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1);
|
||||
if (!BaseUtils.containsBase(context, BaseUtils.N))
|
||||
result = BitSetUtils.bitSetFrom(context);
|
||||
}
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -101,6 +101,17 @@ public class BaseUtils {
|
|||
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true iff the bases array contains at least one instance of base
|
||||
*/
|
||||
static public boolean containsBase(final byte[] bases, final byte base) {
|
||||
for ( final byte b : bases ) {
|
||||
if ( b == base )
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a IUPAC nucleotide code to a pair of bases
|
||||
*
|
||||
|
|
|
|||
|
|
@ -130,32 +130,32 @@ public class BitSetUtils {
|
|||
if (number < 0)
|
||||
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
|
||||
|
||||
int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
|
||||
number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
|
||||
final int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
|
||||
number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
|
||||
|
||||
String dna = "";
|
||||
StringBuilder dna = new StringBuilder();
|
||||
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
|
||||
byte base = (byte) (number % 4);
|
||||
switch (base) {
|
||||
case 0:
|
||||
dna = "A" + dna;
|
||||
dna.append('A');
|
||||
break;
|
||||
case 1:
|
||||
dna = "C" + dna;
|
||||
dna.append('C');
|
||||
break;
|
||||
case 2:
|
||||
dna = "G" + dna;
|
||||
dna.append('G');
|
||||
break;
|
||||
case 3:
|
||||
dna = "T" + dna;
|
||||
dna.append('T');
|
||||
break;
|
||||
}
|
||||
number /= 4;
|
||||
}
|
||||
for (int j = dna.length(); j < length; j++)
|
||||
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
|
||||
dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above)
|
||||
|
||||
return dna;
|
||||
return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -178,27 +178,18 @@ public class BitSetUtils {
|
|||
* @return the bitset representing the dna sequence
|
||||
*/
|
||||
public static BitSet bitSetFrom(String dna) {
|
||||
if (dna.length() > MAX_DNA_CONTEXT)
|
||||
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length()));
|
||||
return bitSetFrom(dna.getBytes());
|
||||
}
|
||||
|
||||
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
|
||||
long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string
|
||||
for (int i = 0; i < dna.length(); i++) {
|
||||
public static BitSet bitSetFrom(final byte[] dna) {
|
||||
if (dna.length > MAX_DNA_CONTEXT)
|
||||
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length));
|
||||
|
||||
final long preContext = combinationsFor(dna.length - 1); // the sum of all combinations that preceded the length of the dna string
|
||||
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
|
||||
for (final byte base : dna) {
|
||||
baseTen *= 4;
|
||||
switch (dna.charAt(i)) {
|
||||
case 'A':
|
||||
baseTen += 0;
|
||||
break;
|
||||
case 'C':
|
||||
baseTen += 1;
|
||||
break;
|
||||
case 'G':
|
||||
baseTen += 2;
|
||||
break;
|
||||
case 'T':
|
||||
baseTen += 3;
|
||||
break;
|
||||
}
|
||||
baseTen += BaseUtils.simpleBaseToBaseIndex(base);
|
||||
}
|
||||
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue