Start to build BWT abstractions, so we can present a reasonable facsimile of the BWT to the user no matter how it's represented on disk.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1589 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c3f77acd5e
commit
0f3049652a
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Represents the Burrows-Wheeler Transform of a reference sequence.
|
||||
*
|
||||
|
|
@ -7,18 +9,120 @@ package org.broadinstitute.sting.bwa;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class BWT {
|
||||
/**
|
||||
* Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
|
||||
* For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
|
||||
*/
|
||||
public static final int SEQUENCE_BLOCK_SIZE = 128;
|
||||
|
||||
public final int inverseSA0;
|
||||
public final int[] occurrences;
|
||||
public final byte[] sequence;
|
||||
public final int[] count;
|
||||
public final SequenceBlock[] sequenceBlocks;
|
||||
|
||||
/**
|
||||
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
|
||||
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||
* @param count Cumulative count of bases, in A,C,G,T order.
|
||||
* @param sequenceBlocks The full BWT sequence, sans the '$'.
|
||||
*/
|
||||
public BWT( int inverseSA0, int[] count, SequenceBlock[] sequenceBlocks ) {
|
||||
this.inverseSA0 = inverseSA0;
|
||||
this.count = count;
|
||||
this.sequenceBlocks = sequenceBlocks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
|
||||
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
|
||||
* @param count Cumulative count of bases, in A,C,G,T order.
|
||||
* @param sequence The full BWT sequence, sans the '$'.
|
||||
*/
|
||||
public BWT( int inverseSA0, int[] occurrences, byte[] sequence ) {
|
||||
this.inverseSA0 = inverseSA0;
|
||||
public BWT( int inverseSA0, int[] count, byte[] sequence ) {
|
||||
this(inverseSA0,count,generateSequenceBlocks(sequence));
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the full sequence from the list of block.
|
||||
* @return The full BWT string as a byte array.
|
||||
*/
|
||||
public byte[] getSequence() {
|
||||
byte[] sequence = new byte[count[count.length-1]];
|
||||
for( SequenceBlock block: sequenceBlocks )
|
||||
System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength);
|
||||
return sequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* The number of bases in the BWT as a whole.
|
||||
* @return Number of bases.
|
||||
*/
|
||||
public int length() {
|
||||
return count[count.length-1];
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a set of sequence blocks from one long sequence.
|
||||
* @param sequence Sequence from which to derive blocks.
|
||||
* @return Array of sequence blocks containing data from the sequence.
|
||||
*/
|
||||
private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) {
|
||||
int[] occurrences = {0,0,0,0};
|
||||
|
||||
int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE);
|
||||
SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks];
|
||||
|
||||
for( int block = 0; block < numSequenceBlocks; block++ ) {
|
||||
int blockStart = block*SEQUENCE_BLOCK_SIZE;
|
||||
int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart);
|
||||
byte[] subsequence = new byte[blockLength];
|
||||
|
||||
System.arraycopy(sequence,blockStart,subsequence,0,blockLength);
|
||||
|
||||
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,Arrays.copyOf(occurrences,occurrences.length),subsequence);
|
||||
|
||||
for( byte base: subsequence )
|
||||
occurrences[PackUtils.packBase(base)]++;
|
||||
}
|
||||
|
||||
return sequenceBlocks;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Models a block of bases within the BWT.
|
||||
*/
|
||||
class SequenceBlock {
|
||||
/**
|
||||
* Start position of this sequence within the BWT.
|
||||
*/
|
||||
public final int sequenceStart;
|
||||
|
||||
/**
|
||||
* Length of this sequence within the BWT.
|
||||
*/
|
||||
public final int sequenceLength;
|
||||
|
||||
|
||||
/**
|
||||
* Occurrences of each letter up to this sequence block.
|
||||
*/
|
||||
public final int[] occurrences;
|
||||
|
||||
/**
|
||||
* Sequence for this segment.
|
||||
*/
|
||||
public final byte[] sequence;
|
||||
|
||||
/**
|
||||
* Create a new block within this BWT.
|
||||
* @param sequenceStart Starting position of this sequence within the BWT.
|
||||
* @param sequenceLength Length of this sequence.
|
||||
* @param occurrences How many of each base has been seen before this sequence began.
|
||||
* @param sequence The actual sequence from the BWT.
|
||||
*/
|
||||
SequenceBlock( int sequenceStart, int sequenceLength, int[] occurrences, byte[] sequence ) {
|
||||
this.sequenceStart = sequenceStart;
|
||||
this.sequenceLength = sequenceLength;
|
||||
this.occurrences = occurrences;
|
||||
this.sequence = sequence;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,20 +39,35 @@ public class BWTReader {
|
|||
BasePackedInputStream basePackedInputStream = new BasePackedInputStream<Integer>(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
int inverseSA0;
|
||||
int[] occurrences;
|
||||
byte[] bwt;
|
||||
int[] count;
|
||||
SequenceBlock[] sequenceBlocks;
|
||||
|
||||
try {
|
||||
inverseSA0 = intPackedInputStream.read();
|
||||
occurrences = new int[PackUtils.ALPHABET_SIZE];
|
||||
intPackedInputStream.read(occurrences);
|
||||
bwt = basePackedInputStream.read(occurrences[PackUtils.ALPHABET_SIZE-1]);
|
||||
count = new int[PackUtils.ALPHABET_SIZE];
|
||||
intPackedInputStream.read(count);
|
||||
|
||||
int bwtSize = count[PackUtils.ALPHABET_SIZE-1];
|
||||
sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)];
|
||||
|
||||
for( int block = 0; block < sequenceBlocks.length; block++ ) {
|
||||
int sequenceStart = block*BWT.SEQUENCE_BLOCK_SIZE;
|
||||
int sequenceLength = Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart);
|
||||
|
||||
int[] occurrences = new int[PackUtils.ALPHABET_SIZE];
|
||||
byte[] bwt = new byte[sequenceLength];
|
||||
|
||||
intPackedInputStream.read(occurrences);
|
||||
basePackedInputStream.read(bwt);
|
||||
|
||||
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,occurrences,bwt);
|
||||
}
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
|
||||
return new BWT(inverseSA0, occurrences, bwt);
|
||||
return new BWT(inverseSA0, count, sequenceBlocks);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ public class BWTWriter {
|
|||
/**
|
||||
* Input stream from which to read BWT data.
|
||||
*/
|
||||
private OutputStream outputStream;
|
||||
private final OutputStream outputStream;
|
||||
|
||||
/**
|
||||
* Create a new BWT reader.
|
||||
|
|
@ -40,8 +40,22 @@ public class BWTWriter {
|
|||
|
||||
try {
|
||||
intPackedOutputStream.write(bwt.inverseSA0);
|
||||
intPackedOutputStream.write(bwt.occurrences);
|
||||
basePackedOutputStream.write(bwt.sequence);
|
||||
intPackedOutputStream.write(bwt.count);
|
||||
|
||||
for( SequenceBlock block: bwt.sequenceBlocks ) {
|
||||
intPackedOutputStream.write(block.occurrences);
|
||||
basePackedOutputStream.write(block.sequence);
|
||||
}
|
||||
|
||||
// The last block is the last set of counts in the structure.
|
||||
int[] finalCount = new int[bwt.count.length];
|
||||
for( int i = 0; i < finalCount.length; i++ ) {
|
||||
if(i != 0)
|
||||
finalCount[i] = bwt.count[i]-bwt.count[i-1];
|
||||
else
|
||||
finalCount[i] = bwt.count[i];
|
||||
}
|
||||
intPackedOutputStream.write(finalCount);
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
|
|
|
|||
|
|
@ -43,14 +43,23 @@ public class BasePackedInputStream<T> {
|
|||
|
||||
/**
|
||||
* Read the entire contents of the input stream.
|
||||
* @param length number of bases to read from the stream.
|
||||
* @param bwt array into which bases should be read.
|
||||
* @return a byte array of the given length.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public byte[] read( int length ) throws IOException {
|
||||
byte[] bwt = new byte[length];
|
||||
int packedWord = 0;
|
||||
public void read(byte[] bwt) throws IOException {
|
||||
read(bwt,0,bwt.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the next <code>length</code> bases into the bwt array, starting at the given offset.
|
||||
* @param bwt array holding the given data.
|
||||
* @param offset target position in the bases array into which bytes should be written.
|
||||
* @param length number of bases to read from the stream.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void read( byte[] bwt, int offset, int length ) throws IOException {
|
||||
int packedWord = 0;
|
||||
final int basesPerEntry = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE;
|
||||
for( int i = 0; i < length; i++ ) {
|
||||
if( i % basesPerEntry == 0 ) {
|
||||
|
|
@ -60,10 +69,7 @@ public class BasePackedInputStream<T> {
|
|||
}
|
||||
|
||||
int position = basesPerEntry - i % basesPerEntry - 1;
|
||||
bwt[i] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3));
|
||||
bwt[i+offset] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3));
|
||||
}
|
||||
|
||||
return bwt;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,11 +65,22 @@ public class BasePackedOutputStream<T> {
|
|||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( byte[] bases ) throws IOException {
|
||||
write(bases,0,bases.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a subset of the array of bases to the output stream.
|
||||
* @param bases List of bases to write.
|
||||
* @param offset site at which to start writing.
|
||||
* @param length number of bases to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( byte[] bases, int offset, int length ) throws IOException {
|
||||
int packedBases = 0;
|
||||
int positionInPack = 0;
|
||||
|
||||
for(byte base: bases) {
|
||||
packedBases = packBase(base, packedBases, positionInPack);
|
||||
for( int base = offset; base < offset+length; base++ ) {
|
||||
packedBases = packBase(bases[base], packedBases, positionInPack);
|
||||
|
||||
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
|
||||
positionInPack = ++positionInPack % basesPerType;
|
||||
|
|
|
|||
|
|
@ -52,6 +52,13 @@ public class CreateBWTFromReference {
|
|||
return StringUtil.bytesToString(sequence.getBases());
|
||||
}
|
||||
|
||||
private String loadReverseReference( File inputFile ) {
|
||||
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
|
||||
ReferenceSequence sequence = reference.nextSequence();
|
||||
PackUtils.reverse(sequence.getBases());
|
||||
return StringUtil.bytesToString(sequence.getBases());
|
||||
}
|
||||
|
||||
private int[] countOccurrences( String sequence ) {
|
||||
int occurrences[] = new int[ALPHABET_SIZE];
|
||||
for( char base: sequence.toCharArray() )
|
||||
|
|
@ -118,8 +125,8 @@ public class CreateBWTFromReference {
|
|||
}
|
||||
|
||||
public static void main( String argv[] ) throws IOException {
|
||||
if( argv.length != 3 ) {
|
||||
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output sa>");
|
||||
if( argv.length != 5 ) {
|
||||
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output rbwt> <output sa> <output rsa>");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -129,12 +136,19 @@ public class CreateBWTFromReference {
|
|||
String bwtFileName = argv[1];
|
||||
File bwtFile = new File(bwtFileName);
|
||||
|
||||
String saFileName = argv[2];
|
||||
String rbwtFileName = argv[2];
|
||||
File rbwtFile = new File(rbwtFileName);
|
||||
|
||||
String saFileName = argv[3];
|
||||
File saFile = new File(saFileName);
|
||||
|
||||
String rsaFileName = argv[4];
|
||||
File rsaFile = new File(rsaFileName);
|
||||
|
||||
CreateBWTFromReference creator = new CreateBWTFromReference();
|
||||
|
||||
String sequence = creator.loadReference(inputFile);
|
||||
String reverseSequence = creator.loadReverseReference(inputFile);
|
||||
|
||||
// Count the occurences of each given base.
|
||||
int[] occurrences = creator.countOccurrences(sequence);
|
||||
|
|
@ -142,17 +156,23 @@ public class CreateBWTFromReference {
|
|||
|
||||
// Generate the suffix array and print diagnostics.
|
||||
int[] suffixArrayData = creator.createSuffixArray(sequence);
|
||||
int[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence);
|
||||
|
||||
// Invert the suffix array and print diagnostics.
|
||||
int[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
|
||||
int[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData);
|
||||
|
||||
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
|
||||
SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData );
|
||||
|
||||
/*
|
||||
for( int i = 0; i < 8; i++ )
|
||||
System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length())));
|
||||
for( int i = 0; i < 8; i++ )
|
||||
System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length())));
|
||||
*/
|
||||
|
||||
/*
|
||||
// Create the data structure for the compressed suffix array and print diagnostics.
|
||||
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
|
||||
int reconstructedInverseSA = compressedSuffixArray[0];
|
||||
|
|
@ -166,30 +186,40 @@ public class CreateBWTFromReference {
|
|||
for( int i = 0; i < 8; i++ ) {
|
||||
System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
|
||||
}
|
||||
*/
|
||||
|
||||
// Create the BWT.
|
||||
BWT bwt = new BWT( inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence) );
|
||||
BWT reverseBWT = new BWT( reverseInverseSuffixArray[0], occurrences, creator.createBWT(reverseSequence, reverseSuffixArray.sequence));
|
||||
|
||||
String bwtAsString = new String(bwt.sequence);
|
||||
System.out.printf("BWT: %s...%n", bwtAsString.substring(0,80));
|
||||
byte[] bwtSequence = bwt.getSequence();
|
||||
System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length());
|
||||
|
||||
BWTWriter bwtWriter = new BWTWriter(bwtFile);
|
||||
bwtWriter.write(bwt);
|
||||
bwtWriter.close();
|
||||
|
||||
BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile);
|
||||
reverseBWTWriter.write(reverseBWT);
|
||||
reverseBWTWriter.close();
|
||||
|
||||
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
|
||||
saWriter.write(suffixArray);
|
||||
saWriter.close();
|
||||
|
||||
SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
|
||||
reverseSAWriter.write(reverseSuffixArray);
|
||||
reverseSAWriter.close();
|
||||
|
||||
File existingBWTFile = new File(inputFileName+".bwt");
|
||||
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
|
||||
BWT existingBWT = existingBWTReader.read();
|
||||
|
||||
String existingBwtAsString = new String(existingBWT.sequence);
|
||||
System.out.printf("Existing BWT: %s...%n",existingBwtAsString.substring(0,80));
|
||||
byte[] existingBWTSequence = existingBWT.getSequence();
|
||||
System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,80),existingBWT.length());
|
||||
|
||||
for( int i = 0; i < bwt.sequence.length; i++ ) {
|
||||
if( bwt.sequence[i] != existingBWT.sequence[i] )
|
||||
for( int i = 0; i < bwt.length(); i++ ) {
|
||||
if( bwtSequence[i] != existingBWTSequence[i] )
|
||||
throw new StingException("BWT mismatch at " + i);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -41,8 +41,8 @@ import java.nio.ByteOrder;
|
|||
|
||||
public class CreatePACFromReference {
|
||||
public static void main( String argv[] ) throws IOException {
|
||||
if( argv.length != 2 ) {
|
||||
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output>");
|
||||
if( argv.length != 3 ) {
|
||||
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output pac> <output rpac>");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -53,13 +53,22 @@ public class CreatePACFromReference {
|
|||
ReferenceSequence sequence = reference.nextSequence();
|
||||
|
||||
// Target file for output
|
||||
File outputFile = new File(argv[1]);
|
||||
writeSequence( new File(argv[1]), sequence.getBases() );
|
||||
|
||||
// Reverse the bases in the reference
|
||||
PackUtils.reverse(sequence.getBases());
|
||||
|
||||
// Target file for output
|
||||
writeSequence( new File(argv[2]), sequence.getBases() );
|
||||
}
|
||||
|
||||
private static void writeSequence( File outputFile, byte[] bases ) throws IOException {
|
||||
OutputStream outputStream = new FileOutputStream(outputFile);
|
||||
|
||||
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
|
||||
basePackedOutputStream.write(sequence.getBases());
|
||||
basePackedOutputStream.write(bases);
|
||||
|
||||
outputStream.write(sequence.getBases().length%PackUtils.ALPHABET_SIZE);
|
||||
outputStream.write(bases.length%PackUtils.ALPHABET_SIZE);
|
||||
|
||||
outputStream.close();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ public class PackUtils {
|
|||
* @param type Type to test.
|
||||
* @return Number of bits that the given type can hold.
|
||||
*/
|
||||
public static final int bitsInType( Class<?> type ) {
|
||||
public static int bitsInType( Class<?> type ) {
|
||||
try {
|
||||
long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1;
|
||||
long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1;
|
||||
|
|
@ -65,6 +65,11 @@ public class PackUtils {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a two-bit representation of a base into an ASCII representation of a base.
|
||||
* @param pack Byte from 0-3 indicating which base is represented.
|
||||
* @return An ASCII value representing the packed base.
|
||||
*/
|
||||
public static byte unpackBase(byte pack) {
|
||||
switch( pack ) {
|
||||
case 0:
|
||||
|
|
@ -79,4 +84,28 @@ public class PackUtils {
|
|||
throw new StingException("Unknown pack type: " + pack);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverses an unpacked sequence of bases.
|
||||
* @param bases bases to reverse.
|
||||
*/
|
||||
public static void reverse( byte[] bases ) {
|
||||
for( int i = 0, j = bases.length-1; i < j; i++, j-- ) {
|
||||
byte temp = bases[j];
|
||||
bases[j] = bases[i];
|
||||
bases[i] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a structure of size <code>size</code> that should be split
|
||||
* into <code>partitionSize</code> partitions, how many partitions should
|
||||
* be created? Size of last partition will be <= partitionSize.
|
||||
* @param size Total size of the data structure.
|
||||
* @param partitionSize Size of an individual partition.
|
||||
* @return Number of partitions that would be created.
|
||||
*/
|
||||
public static int numberOfPartitions( int size, int partitionSize ) {
|
||||
return (size + partitionSize - 1)/partitionSize;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue