285 lines
12 KiB
Java
285 lines
12 KiB
Java
package org.broadinstitute.sting.utils;
|
|
|
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
import java.io.ObjectOutputStream;
|
|
import java.util.BitSet;
|
|
|
|
/**
|
|
* Utilities for bitset conversion
|
|
*
|
|
* @author Mauricio Carneiro
|
|
* @since 3/5/12
|
|
*/
|
|
public class BitSetUtils {
|
|
|
|
static final private int MAX_DNA_CONTEXT = 31; // the maximum context size (number of bases) permitted in the "long bitset" implementation of the DNA <=> BitSet conversion.
|
|
static final private byte NBITS_LONG_REPRESENTATION = 64; // the number of bits used in the long version to represent the bit set (necessary for the two's complement representation of negative numbers)
|
|
static final private byte NBITS_SHORT_REPRESENTATION = 16; // the number of bits used in the short version to represent the bit set (necessary for the two's complement representation of negative numbers)
|
|
static final long[] combinationsPerLength = new long[MAX_DNA_CONTEXT + 1]; // keeps the memoized table with the number of combinations for each given DNA context length
|
|
|
|
/**
|
|
* Creates an long out of a bitset
|
|
*
|
|
* @param bitSet the bitset
|
|
* @return a long from the bitset representation
|
|
*/
|
|
public static long longFrom(final BitSet bitSet) {
|
|
return longFrom(bitSet, NBITS_LONG_REPRESENTATION);
|
|
}
|
|
|
|
/**
|
|
* Creates a short integer from a bitset
|
|
*
|
|
* @param bitSet the bitset
|
|
* @return a short from the bitset representation
|
|
*/
|
|
public static short shortFrom(final BitSet bitSet) {
|
|
return (short) longFrom(bitSet, NBITS_SHORT_REPRESENTATION);
|
|
}
|
|
|
|
/**
|
|
* Cretes an integer with any number of bits (up to 64 -- long precision) from a bitset
|
|
*
|
|
* @param bitSet the bitset
|
|
* @param nBits the number of bits to be used for this representation
|
|
* @return an integer with nBits from the bitset representation
|
|
*/
|
|
public static long longFrom(final BitSet bitSet, final int nBits) {
|
|
long number = 0;
|
|
for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0 && bitIndex <= nBits; bitIndex = bitSet.nextSetBit(bitIndex + 1))
|
|
number |= 1L << bitIndex;
|
|
|
|
return number;
|
|
}
|
|
|
|
/**
|
|
* Creates a BitSet representation of a given long
|
|
*
|
|
* @param number the number to turn into a bitset
|
|
* @return a bitset representation of the long
|
|
*/
|
|
public static BitSet bitSetFrom(long number) {
|
|
return bitSetFrom(number, NBITS_LONG_REPRESENTATION);
|
|
}
|
|
|
|
/**
|
|
* Creates a BitSet representation of a given short
|
|
*
|
|
* @param number the number to turn into a bitset
|
|
* @return a bitset representation of the short
|
|
*/
|
|
public static BitSet bitSetFrom(short number) {
|
|
return bitSetFrom(number, NBITS_SHORT_REPRESENTATION);
|
|
}
|
|
|
|
/**
|
|
* Creates a BitSet representation of an arbitrary integer (number of bits capped at 64 -- long precision)
|
|
*
|
|
* @param number the number to turn into a bitset
|
|
* @param nBits the number of bits to use as precision for this conversion
|
|
* @return a bitset representation of the integer
|
|
*/
|
|
public static BitSet bitSetFrom(long number, int nBits) {
|
|
BitSet bitSet = new BitSet();
|
|
boolean isNegative = number < 0;
|
|
int bitIndex = 0;
|
|
while (number != 0) {
|
|
if (number % 2 != 0)
|
|
bitSet.set(bitIndex);
|
|
bitIndex++;
|
|
number /= 2;
|
|
}
|
|
if (isNegative) {
|
|
boolean foundFirstSetBit = false;
|
|
for (int i = bitSet.nextSetBit(0); i < nBits && i >= 0; i++) {
|
|
boolean bit = bitSet.get(i);
|
|
if (!foundFirstSetBit && bit)
|
|
foundFirstSetBit = true; // maintain all bits until the first 1 is found (inclusive)
|
|
else if (foundFirstSetBit)
|
|
bitSet.flip(i); // flip every other bit up to NBITS_REPRESENTATION
|
|
}
|
|
}
|
|
return bitSet;
|
|
}
|
|
|
|
/**
|
|
* Converts a BitSet into the dna string representation.
|
|
*
|
|
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
|
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
|
* a bitSetFrom(BigNumber) method.
|
|
*
|
|
* We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
|
|
* base_10 representation of the sequence. This is important for us to know how to bring the number
|
|
* to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
|
|
* as 0's and leading 0's are omitted).
|
|
*
|
|
* quasi-canonical because A is represented by a 0, therefore,
|
|
* instead of : 0, 1, 2, 3, 10, 11, 12, ...
|
|
* we have : 0, 1, 2, 3, 00, 01, 02, ...
|
|
*
|
|
* but we can correctly decode it because we know the final length.
|
|
*
|
|
* @param bitSet the bitset representation of the dna sequence
|
|
* @return the dna sequence represented by the bitset
|
|
*/
|
|
public static String dnaFrom(final BitSet bitSet) {
|
|
long number = longFrom(bitSet); // the base_10 representation of the bit set
|
|
if (number < 0)
|
|
throw new ReviewedStingException("dna conversion cannot handle negative numbers. Possible overflow?");
|
|
|
|
int length = contextLengthFor(number); // the length of the context (the number of combinations is memoized, so costs zero to separate this into two method calls)
|
|
number -= combinationsFor(length - 1); // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation
|
|
|
|
String dna = "";
|
|
while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
|
|
byte base = (byte) (number % 4);
|
|
switch (base) {
|
|
case 0:
|
|
dna = "A" + dna;
|
|
break;
|
|
case 1:
|
|
dna = "C" + dna;
|
|
break;
|
|
case 2:
|
|
dna = "G" + dna;
|
|
break;
|
|
case 3:
|
|
dna = "T" + dna;
|
|
break;
|
|
}
|
|
number /= 4;
|
|
}
|
|
for (int j = dna.length(); j < length; j++)
|
|
dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
|
|
|
|
return dna;
|
|
}
|
|
|
|
/**
|
|
* Creates a BitSet representation of a given dna string.
|
|
*
|
|
* Warning: This conversion is limited to long precision, therefore the dna sequence cannot
|
|
* be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create
|
|
* a bitSetFrom(BigNumber) method.
|
|
*
|
|
* The bit representation of a dna string is the simple:
|
|
* 0 A 4 AA 8 CA
|
|
* 1 C 5 AC ...
|
|
* 2 G 6 AG 1343 TTGGT
|
|
* 3 T 7 AT 1364 TTTTT
|
|
*
|
|
* To convert from dna to number, we convert the dna string to base10 and add all combinations that
|
|
* preceded the string (with smaller lengths).
|
|
*
|
|
* @param dna the dna sequence
|
|
* @return the bitset representing the dna sequence
|
|
*/
|
|
public static BitSet bitSetFrom(String dna) {
|
|
if (dna.length() > MAX_DNA_CONTEXT)
|
|
throw new ReviewedStingException(String.format("DNA Length cannot be bigger than %d. dna: %s (%d)", MAX_DNA_CONTEXT, dna, dna.length()));
|
|
|
|
long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
|
|
long preContext = combinationsFor(dna.length() - 1); // the sum of all combinations that preceded the length of the dna string
|
|
for (int i = 0; i < dna.length(); i++) {
|
|
baseTen *= 4;
|
|
switch (dna.charAt(i)) {
|
|
case 'A':
|
|
baseTen += 0;
|
|
break;
|
|
case 'C':
|
|
baseTen += 1;
|
|
break;
|
|
case 'G':
|
|
baseTen += 2;
|
|
break;
|
|
case 'T':
|
|
baseTen += 3;
|
|
break;
|
|
}
|
|
}
|
|
return bitSetFrom(baseTen + preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
|
|
}
|
|
|
|
/**
|
|
* Calculates the number of bits necessary to represent a given number of elements
|
|
*
|
|
* @param numberOfElements the number of elements to represent (must be positive)
|
|
* @return the number of bits necessary to represent this many elements
|
|
*/
|
|
public static int numberOfBitsToRepresent(long numberOfElements) {
|
|
if (numberOfElements < 0)
|
|
throw new ReviewedStingException("Number of elements must be positive: " + numberOfElements);
|
|
|
|
if (numberOfElements == 1L)
|
|
return 1; // special case
|
|
|
|
int n = 0;
|
|
numberOfElements--;
|
|
while (numberOfElements > 0) {
|
|
numberOfElements = numberOfElements >> 1;
|
|
n++;
|
|
}
|
|
return n;
|
|
}
|
|
|
|
/**
|
|
* Calculates the length of the DNA context for a given base 10 number
|
|
*
|
|
* It is important to know the length given the base 10 number to calculate the number of combinations
|
|
* and to disambiguate the "quasi-canonical" state.
|
|
*
|
|
* This method also calculates the number of combinations as a by-product, but since it memoizes the
|
|
* results, a subsequent call to combinationsFor(length) is O(1).
|
|
*
|
|
* @param number the base 10 representation of the bitset
|
|
* @return the length of the DNA context represented by this number
|
|
*/
|
|
private static int contextLengthFor(long number) {
|
|
int length = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
|
|
long combinations = combinationsFor(length); // the next context (we advance it so we know which one was preceding it).
|
|
while (combinations <= number) { // find the length of the dna string (length)
|
|
length++;
|
|
combinations = combinationsFor(length); // calculate the next context
|
|
}
|
|
return length;
|
|
}
|
|
|
|
/**
|
|
* The sum of all combinations of a context of a given length from length = 0 to length.
|
|
*
|
|
* Memoized implementation of sum(4^i) , where i=[0,length]
|
|
*
|
|
* @param length the length of the DNA context
|
|
* @return the sum of all combinations leading up to this context length.
|
|
*/
|
|
private static long combinationsFor(int length) {
|
|
if (length > MAX_DNA_CONTEXT)
|
|
throw new ReviewedStingException(String.format("Context cannot be longer than %d bases but requested %d.", MAX_DNA_CONTEXT, length));
|
|
|
|
// only calculate the number of combinations if the table hasn't already cached the value
|
|
if (length > 0 && combinationsPerLength[length] == 0) {
|
|
long combinations = 0L;
|
|
for (int i = 1; i <= length; i++)
|
|
combinations += (1L << 2 * i); // add all combinations with 4^i ( 4^i is the same as 2^(2*i) )
|
|
combinationsPerLength[length] = combinations;
|
|
}
|
|
return combinationsPerLength[length];
|
|
}
|
|
|
|
|
|
public static byte[] sizeOf(Object obj) throws java.io.IOException
|
|
{
|
|
ByteArrayOutputStream byteObject = new ByteArrayOutputStream();
|
|
ObjectOutputStream objectOutputStream = new ObjectOutputStream(byteObject);
|
|
objectOutputStream.writeObject(obj);
|
|
objectOutputStream.flush();
|
|
objectOutputStream.close();
|
|
byteObject.close();
|
|
|
|
return byteObject.toByteArray();
|
|
}
|
|
}
|