BQSR optimization: String manipulation is extremely expensive in Java (accounts for 8% of BQSR runtime). Instead use byte[] and StringBuilder when possible.
This commit is contained in:
parent
2bd48a7351
commit
d463ab2cbf
|
|
@ -126,8 +126,8 @@ public class ContextCovariate implements StandardCovariate {
|
||||||
private BitSet contextWith(byte[] bases, int offset, int contextSize) {
|
private BitSet contextWith(byte[] bases, int offset, int contextSize) {
|
||||||
BitSet result = null;
|
BitSet result = null;
|
||||||
if (offset - contextSize + 1 >= 0) {
|
if (offset - contextSize + 1 >= 0) {
|
||||||
String context = new String(Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1));
|
final byte[] context = Arrays.copyOfRange(bases, offset - contextSize + 1, offset + 1);
|
||||||
if (!context.contains("N"))
|
if (!BaseUtils.containsBase(context, BaseUtils.N))
|
||||||
result = BitSetUtils.bitSetFrom(context);
|
result = BitSetUtils.bitSetFrom(context);
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
|
|
|
||||||
|
|
@ -101,6 +101,17 @@ public class BaseUtils {
|
||||||
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
|
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return true iff the bases array contains at least one instance of base
|
||||||
|
*/
|
||||||
|
static public boolean containsBase(final byte[] bases, final byte base) {
|
||||||
|
for ( final byte b : bases ) {
|
||||||
|
if ( b == base )
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts an IUPAC nucleotide code to a pair of bases
|
* Converts an IUPAC nucleotide code to a pair of bases
|
||||||
*
|
*
|
||||||
|
|
|
||||||
|
|
@ -130,32 +130,32 @@ public class BitSetUtils {
|
||||||
if (number < 0)
|
if (number < 0)
|
||||||
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
|
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
|
||||||
|
|
||||||
int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
|
final int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
|
||||||
number -= combinationsFor(length - 1); // subtract the number of combinations of the preceding context from the number to get to the quasi-canonical representation
|
number -= combinationsFor(length - 1); // subtract the number of combinations of the preceding context from the number to get to the quasi-canonical representation
|
||||||
|
|
||||||
String dna = "";
|
StringBuilder dna = new StringBuilder();
|
||||||
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
|
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
|
||||||
byte base = (byte) (number % 4);
|
byte base = (byte) (number % 4);
|
||||||
switch (base) {
|
switch (base) {
|
||||||
case 0:
|
case 0:
|
||||||
dna = "A" + dna;
|
dna.append('A');
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
dna = "C" + dna;
|
dna.append('C');
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
dna = "G" + dna;
|
dna.append('G');
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
dna = "T" + dna;
|
dna.append('T');
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
number /= 4;
|
number /= 4;
|
||||||
}
|
}
|
||||||
for (int j = dna.length(); j < length; j++)
|
for (int j = dna.length(); j < length; j++)
|
||||||
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
|
dna.append('A'); // add leading A's as necessary (due to the "quasi" canonical status, see description above)
|
||||||
|
|
||||||
return dna;
|
return dna.reverse().toString(); // make sure to reverse the string since we should have been pre-pending all along
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -178,27 +178,18 @@ public class BitSetUtils {
|
||||||
* @return the bitset representing the dna sequence
|
* @return the bitset representing the dna sequence
|
||||||
*/
|
*/
|
||||||
public static BitSet bitSetFrom(String dna) {
|
public static BitSet bitSetFrom(String dna) {
|
||||||
if (dna.length() > MAX_DNA_CONTEXT)
|
return bitSetFrom(dna.getBytes());
|
||||||
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length()));
|
|
||||||
|
|
||||||
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
|
|
||||||
long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string
|
|
||||||
for (int i = 0; i < dna.length(); i++) {
|
|
||||||
baseTen *= 4;
|
|
||||||
switch (dna.charAt(i)) {
|
|
||||||
case 'A':
|
|
||||||
baseTen += 0;
|
|
||||||
break;
|
|
||||||
case 'C':
|
|
||||||
baseTen += 1;
|
|
||||||
break;
|
|
||||||
case 'G':
|
|
||||||
baseTen += 2;
|
|
||||||
break;
|
|
||||||
case 'T':
|
|
||||||
baseTen += 3;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static BitSet bitSetFrom(final byte[] dna) {
|
||||||
|
if (dna.length > MAX_DNA_CONTEXT)
|
||||||
|
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length));
|
||||||
|
|
||||||
|
final long preContext = combinationsFor(dna.length - 1); // the sum of all combinations that preceded the length of the dna string
|
||||||
|
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
|
||||||
|
for (final byte base : dna) {
|
||||||
|
baseTen *= 4;
|
||||||
|
baseTen += BaseUtils.simpleBaseToBaseIndex(base);
|
||||||
}
|
}
|
||||||
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
|
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue