Start to build BWT abstractions, so we can present a reasonable facsimile of the BWT to the user no matter how it's represented on disk.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1589 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-09-11 18:23:15 +00:00
parent c3f77acd5e
commit 0f3049652a
8 changed files with 257 additions and 39 deletions

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.bwa;
import java.util.Arrays;
/**
* Represents the Burrows-Wheeler Transform of a reference sequence.
*
@ -7,18 +9,120 @@ package org.broadinstitute.sting.bwa;
* @version 0.1
*/
public class BWT {
/**
* Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
* For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
*/
public static final int SEQUENCE_BLOCK_SIZE = 128;
public final int inverseSA0;
public final int[] occurrences;
public final byte[] sequence;
public final int[] count;
public final SequenceBlock[] sequenceBlocks;
/**
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
* @param count Cumulative count of bases, in A,C,G,T order.
* @param sequenceBlocks The full BWT sequence, sans the '$'.
*/
public BWT( int inverseSA0, int[] count, SequenceBlock[] sequenceBlocks ) {
this.inverseSA0 = inverseSA0;
this.count = count;
this.sequenceBlocks = sequenceBlocks;
}
/**
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
* @param count Cumulative count of bases, in A,C,G,T order.
* @param sequence The full BWT sequence, sans the '$'.
*/
public BWT( int inverseSA0, int[] occurrences, byte[] sequence ) {
this.inverseSA0 = inverseSA0;
public BWT( int inverseSA0, int[] count, byte[] sequence ) {
this(inverseSA0,count,generateSequenceBlocks(sequence));
}
/**
* Extract the full sequence from the list of block.
* @return The full BWT string as a byte array.
*/
public byte[] getSequence() {
byte[] sequence = new byte[count[count.length-1]];
for( SequenceBlock block: sequenceBlocks )
System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength);
return sequence;
}
/**
* The number of bases in the BWT as a whole.
* @return Number of bases.
*/
public int length() {
return count[count.length-1];
}
/**
* Create a set of sequence blocks from one long sequence.
* @param sequence Sequence from which to derive blocks.
* @return Array of sequence blocks containing data from the sequence.
*/
private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) {
int[] occurrences = {0,0,0,0};
int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE);
SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks];
for( int block = 0; block < numSequenceBlocks; block++ ) {
int blockStart = block*SEQUENCE_BLOCK_SIZE;
int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart);
byte[] subsequence = new byte[blockLength];
System.arraycopy(sequence,blockStart,subsequence,0,blockLength);
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,Arrays.copyOf(occurrences,occurrences.length),subsequence);
for( byte base: subsequence )
occurrences[PackUtils.packBase(base)]++;
}
return sequenceBlocks;
}
}
/**
* Models a block of bases within the BWT.
*/
class SequenceBlock {
/**
* Start position of this sequence within the BWT.
*/
public final int sequenceStart;
/**
* Length of this sequence within the BWT.
*/
public final int sequenceLength;
/**
* Occurrences of each letter up to this sequence block.
*/
public final int[] occurrences;
/**
* Sequence for this segment.
*/
public final byte[] sequence;
/**
* Create a new block within this BWT.
* @param sequenceStart Starting position of this sequence within the BWT.
* @param sequenceLength Length of this sequence.
* @param occurrences How many of each base has been seen before this sequence began.
* @param sequence The actual sequence from the BWT.
*/
SequenceBlock( int sequenceStart, int sequenceLength, int[] occurrences, byte[] sequence ) {
this.sequenceStart = sequenceStart;
this.sequenceLength = sequenceLength;
this.occurrences = occurrences;
this.sequence = sequence;
}

View File

@ -39,20 +39,35 @@ public class BWTReader {
BasePackedInputStream basePackedInputStream = new BasePackedInputStream<Integer>(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
int inverseSA0;
int[] occurrences;
byte[] bwt;
int[] count;
SequenceBlock[] sequenceBlocks;
try {
inverseSA0 = intPackedInputStream.read();
occurrences = new int[PackUtils.ALPHABET_SIZE];
intPackedInputStream.read(occurrences);
bwt = basePackedInputStream.read(occurrences[PackUtils.ALPHABET_SIZE-1]);
count = new int[PackUtils.ALPHABET_SIZE];
intPackedInputStream.read(count);
int bwtSize = count[PackUtils.ALPHABET_SIZE-1];
sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)];
for( int block = 0; block < sequenceBlocks.length; block++ ) {
int sequenceStart = block*BWT.SEQUENCE_BLOCK_SIZE;
int sequenceLength = Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart);
int[] occurrences = new int[PackUtils.ALPHABET_SIZE];
byte[] bwt = new byte[sequenceLength];
intPackedInputStream.read(occurrences);
basePackedInputStream.read(bwt);
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,occurrences,bwt);
}
}
catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex);
}
return new BWT(inverseSA0, occurrences, bwt);
return new BWT(inverseSA0, count, sequenceBlocks);
}
/**

View File

@ -15,7 +15,7 @@ public class BWTWriter {
/**
* Input stream from which to read BWT data.
*/
private OutputStream outputStream;
private final OutputStream outputStream;
/**
* Create a new BWT reader.
@ -40,8 +40,22 @@ public class BWTWriter {
try {
intPackedOutputStream.write(bwt.inverseSA0);
intPackedOutputStream.write(bwt.occurrences);
basePackedOutputStream.write(bwt.sequence);
intPackedOutputStream.write(bwt.count);
for( SequenceBlock block: bwt.sequenceBlocks ) {
intPackedOutputStream.write(block.occurrences);
basePackedOutputStream.write(block.sequence);
}
// The last block is the last set of counts in the structure.
int[] finalCount = new int[bwt.count.length];
for( int i = 0; i < finalCount.length; i++ ) {
if(i != 0)
finalCount[i] = bwt.count[i]-bwt.count[i-1];
else
finalCount[i] = bwt.count[i];
}
intPackedOutputStream.write(finalCount);
}
catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex);

View File

@ -43,14 +43,23 @@ public class BasePackedInputStream<T> {
/**
* Read the entire contents of the input stream.
* @param length number of bases to read from the stream.
* @param bwt array into which bases should be read.
* @return a byte array of the given length.
* @throws IOException if an I/O error occurs.
*/
public byte[] read( int length ) throws IOException {
byte[] bwt = new byte[length];
int packedWord = 0;
public void read(byte[] bwt) throws IOException {
read(bwt,0,bwt.length);
}
/**
* Read the next <code>length</code> bases into the bwt array, starting at the given offset.
* @param bwt array holding the given data.
* @param offset target position in the bases array into which bytes should be written.
* @param length number of bases to read from the stream.
* @throws IOException if an I/O error occurs.
*/
public void read( byte[] bwt, int offset, int length ) throws IOException {
int packedWord = 0;
final int basesPerEntry = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE;
for( int i = 0; i < length; i++ ) {
if( i % basesPerEntry == 0 ) {
@ -60,10 +69,7 @@ public class BasePackedInputStream<T> {
}
int position = basesPerEntry - i % basesPerEntry - 1;
bwt[i] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3));
bwt[i+offset] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3));
}
return bwt;
}
}

View File

@ -65,11 +65,22 @@ public class BasePackedOutputStream<T> {
* @throws IOException if an I/O error occurs.
*/
public void write( byte[] bases ) throws IOException {
write(bases,0,bases.length);
}
/**
* Writes a subset of the array of bases to the output stream.
* @param bases List of bases to write.
* @param offset site at which to start writing.
* @param length number of bases to write.
* @throws IOException if an I/O error occurs.
*/
public void write( byte[] bases, int offset, int length ) throws IOException {
int packedBases = 0;
int positionInPack = 0;
for(byte base: bases) {
packedBases = packBase(base, packedBases, positionInPack);
for( int base = offset; base < offset+length; base++ ) {
packedBases = packBase(bases[base], packedBases, positionInPack);
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
positionInPack = ++positionInPack % basesPerType;

View File

@ -52,6 +52,13 @@ public class CreateBWTFromReference {
return StringUtil.bytesToString(sequence.getBases());
}
private String loadReverseReference( File inputFile ) {
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
ReferenceSequence sequence = reference.nextSequence();
PackUtils.reverse(sequence.getBases());
return StringUtil.bytesToString(sequence.getBases());
}
private int[] countOccurrences( String sequence ) {
int occurrences[] = new int[ALPHABET_SIZE];
for( char base: sequence.toCharArray() )
@ -118,8 +125,8 @@ public class CreateBWTFromReference {
}
public static void main( String argv[] ) throws IOException {
if( argv.length != 3 ) {
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output sa>");
if( argv.length != 5 ) {
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output rbwt> <output sa> <output rsa>");
return;
}
@ -129,12 +136,19 @@ public class CreateBWTFromReference {
String bwtFileName = argv[1];
File bwtFile = new File(bwtFileName);
String saFileName = argv[2];
String rbwtFileName = argv[2];
File rbwtFile = new File(rbwtFileName);
String saFileName = argv[3];
File saFile = new File(saFileName);
String rsaFileName = argv[4];
File rsaFile = new File(rsaFileName);
CreateBWTFromReference creator = new CreateBWTFromReference();
String sequence = creator.loadReference(inputFile);
String reverseSequence = creator.loadReverseReference(inputFile);
// Count the occurences of each given base.
int[] occurrences = creator.countOccurrences(sequence);
@ -142,17 +156,23 @@ public class CreateBWTFromReference {
// Generate the suffix array and print diagnostics.
int[] suffixArrayData = creator.createSuffixArray(sequence);
int[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence);
// Invert the suffix array and print diagnostics.
int[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
int[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData);
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData );
/*
for( int i = 0; i < 8; i++ )
System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length())));
for( int i = 0; i < 8; i++ )
System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length())));
*/
/*
// Create the data structure for the compressed suffix array and print diagnostics.
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
int reconstructedInverseSA = compressedSuffixArray[0];
@ -166,30 +186,40 @@ public class CreateBWTFromReference {
for( int i = 0; i < 8; i++ ) {
System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
}
*/
// Create the BWT.
BWT bwt = new BWT( inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence) );
BWT reverseBWT = new BWT( reverseInverseSuffixArray[0], occurrences, creator.createBWT(reverseSequence, reverseSuffixArray.sequence));
String bwtAsString = new String(bwt.sequence);
System.out.printf("BWT: %s...%n", bwtAsString.substring(0,80));
byte[] bwtSequence = bwt.getSequence();
System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length());
BWTWriter bwtWriter = new BWTWriter(bwtFile);
bwtWriter.write(bwt);
bwtWriter.close();
BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile);
reverseBWTWriter.write(reverseBWT);
reverseBWTWriter.close();
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
saWriter.write(suffixArray);
saWriter.close();
SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
reverseSAWriter.write(reverseSuffixArray);
reverseSAWriter.close();
File existingBWTFile = new File(inputFileName+".bwt");
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
BWT existingBWT = existingBWTReader.read();
String existingBwtAsString = new String(existingBWT.sequence);
System.out.printf("Existing BWT: %s...%n",existingBwtAsString.substring(0,80));
byte[] existingBWTSequence = existingBWT.getSequence();
System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,80),existingBWT.length());
for( int i = 0; i < bwt.sequence.length; i++ ) {
if( bwt.sequence[i] != existingBWT.sequence[i] )
for( int i = 0; i < bwt.length(); i++ ) {
if( bwtSequence[i] != existingBWTSequence[i] )
throw new StingException("BWT mismatch at " + i);
}

View File

@ -41,8 +41,8 @@ import java.nio.ByteOrder;
public class CreatePACFromReference {
public static void main( String argv[] ) throws IOException {
if( argv.length != 2 ) {
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output>");
if( argv.length != 3 ) {
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output pac> <output rpac>");
return;
}
@ -53,13 +53,22 @@ public class CreatePACFromReference {
ReferenceSequence sequence = reference.nextSequence();
// Target file for output
File outputFile = new File(argv[1]);
writeSequence( new File(argv[1]), sequence.getBases() );
// Reverse the bases in the reference
PackUtils.reverse(sequence.getBases());
// Target file for output
writeSequence( new File(argv[2]), sequence.getBases() );
}
private static void writeSequence( File outputFile, byte[] bases ) throws IOException {
OutputStream outputStream = new FileOutputStream(outputFile);
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
basePackedOutputStream.write(sequence.getBases());
basePackedOutputStream.write(bases);
outputStream.write(sequence.getBases().length%PackUtils.ALPHABET_SIZE);
outputStream.write(bases.length%PackUtils.ALPHABET_SIZE);
outputStream.close();
}

View File

@ -29,7 +29,7 @@ public class PackUtils {
* @param type Type to test.
* @return Number of bits that the given type can hold.
*/
public static final int bitsInType( Class<?> type ) {
public static int bitsInType( Class<?> type ) {
try {
long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1;
long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1;
@ -65,6 +65,11 @@ public class PackUtils {
}
}
/**
* Converts a two-bit representation of a base into an ASCII representation of a base.
* @param pack Byte from 0-3 indicating which base is represented.
* @return An ASCII value representing the packed base.
*/
public static byte unpackBase(byte pack) {
switch( pack ) {
case 0:
@ -79,4 +84,28 @@ public class PackUtils {
throw new StingException("Unknown pack type: " + pack);
}
}
/**
* Reverses an unpacked sequence of bases.
* @param bases bases to reverse.
*/
public static void reverse( byte[] bases ) {
for( int i = 0, j = bases.length-1; i < j; i++, j-- ) {
byte temp = bases[j];
bases[j] = bases[i];
bases[i] = temp;
}
}
/**
* Given a structure of size <code>size</code> that should be split
* into <code>partitionSize</code> partitions, how many partitions should
* be created? Size of last partition will be <= partitionSize.
* @param size Total size of the data structure.
* @param partitionSize Size of an individual partition.
* @return Number of partitions that would be created.
*/
public static int numberOfPartitions( int size, int partitionSize ) {
return (size + partitionSize - 1)/partitionSize;
}
}