Higher-level readers and writers for BWTs and suffix arrays.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1573 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-09-09 22:45:32 +00:00
parent 0093482c62
commit 6de54dcd2a
10 changed files with 383 additions and 61 deletions

View File

@ -0,0 +1,25 @@
package org.broadinstitute.sting.bwa;
/**
* Represents the Burrows-Wheeler Transform of a reference sequence.
*
* @author mhanna
* @version 0.1
*/
public class BWT {
public final int inverseSA0;
public final int[] occurrences;
public final byte[] sequence;
/**
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
* @param sequence The full BWT sequence, sans the '$'.
*/
public BWT( int inverseSA0, int[] occurrences, byte[] sequence ) {
this.inverseSA0 = inverseSA0;
this.occurrences = occurrences;
this.sequence = sequence;
}
}

View File

@ -0,0 +1,69 @@
package org.broadinstitute.sting.bwa;
import org.broadinstitute.sting.utils.StingException;
import java.io.*;
import java.nio.ByteOrder;
/**
* Reads a BWT from a given file.
*
* @author mhanna
* @version 0.1
*/
public class BWTReader {
/**
* Input stream from which to read BWT data.
*/
private InputStream inputStream;
/**
* Create a new BWT reader.
* @param inputFile File in which the BWT is stored.
*/
public BWTReader( File inputFile ) {
try {
this.inputStream = new BufferedInputStream(new FileInputStream(inputFile));
}
catch( FileNotFoundException ex ) {
throw new StingException("Unable to open input file", ex);
}
}
/**
* Read a BWT from the input stream.
* @return The BWT stored in the input stream.
*/
public BWT read() {
IntPackedInputStream intPackedInputStream = new IntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
BasePackedInputStream basePackedInputStream = new BasePackedInputStream<Integer>(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
int inverseSA0;
int[] occurrences;
byte[] bwt;
try {
inverseSA0 = intPackedInputStream.read();
occurrences = new int[PackUtils.ALPHABET_SIZE];
intPackedInputStream.read(occurrences);
bwt = basePackedInputStream.read(occurrences[PackUtils.ALPHABET_SIZE-1]);
}
catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex);
}
return new BWT(inverseSA0, occurrences, bwt);
}
/**
* Close the input stream.
*/
public void close() {
try {
inputStream.close();
}
catch( IOException ex ) {
throw new StingException("Unable to close input file", ex);
}
}
}

View File

@ -0,0 +1,62 @@
package org.broadinstitute.sting.bwa;
import org.broadinstitute.sting.utils.StingException;
import java.io.*;
import java.nio.ByteOrder;
/**
* Writes an in-memory BWT to an outputstream.
*
* @author mhanna
* @version 0.1
*/
public class BWTWriter {
/**
* Input stream from which to read BWT data.
*/
private OutputStream outputStream;
/**
* Create a new BWT reader.
* @param inputFile File in which the BWT is stored.
*/
public BWTWriter( File inputFile ) {
try {
this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile));
}
catch( FileNotFoundException ex ) {
throw new StingException("Unable to open output file", ex);
}
}
/**
* Write a BWT to the output stream.
* @param bwt Transform to be written to the output stream.
*/
public void write( BWT bwt ) {
IntPackedOutputStream intPackedOutputStream = new IntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream<Integer>(Integer.class, outputStream, ByteOrder.LITTLE_ENDIAN);
try {
intPackedOutputStream.write(bwt.inverseSA0);
intPackedOutputStream.write(bwt.occurrences);
basePackedOutputStream.write(bwt.sequence);
}
catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex);
}
}
/**
* Close the input stream.
*/
public void close() {
try {
outputStream.close();
}
catch( IOException ex ) {
throw new StingException("Unable to close input file", ex);
}
}
}

View File

@ -32,7 +32,7 @@ public class BasePackedInputStream<T> {
this(type,new BufferedInputStream(new FileInputStream(inputFile)),byteOrder);
}
public BasePackedInputStream( Class<T> type, InputStream inputStream, ByteOrder byteOrder ) throws FileNotFoundException {
public BasePackedInputStream( Class<T> type, InputStream inputStream, ByteOrder byteOrder ) {
if( type != Integer.class )
throw new StingException("Only bases packed into 32-bit words are currently supported by this input stream. Type specified: " + type.getName());
@ -44,6 +44,7 @@ public class BasePackedInputStream<T> {
/**
* Read the entire contents of the input stream.
* @param length number of bases to read from the stream.
* @return a byte array of the given length.
* @throws IOException if an I/O error occurs.
*/
public byte[] read( int length ) throws IOException {

View File

@ -33,8 +33,8 @@ import net.sf.samtools.util.StringUtil;
import java.io.*;
import java.util.TreeSet;
import java.util.Comparator;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.broadinstitute.sting.utils.StingException;
/**
* Create a suffix array data structure.
@ -136,18 +136,25 @@ public class CreateBWTFromReference {
String sequence = creator.loadReference(inputFile);
// Count the occurences of each given base.
int[] occurrences = creator.countOccurrences(sequence);
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]);
// Generate the suffix array and print diagnostics.
int[] suffixArray = creator.createSuffixArray(sequence);
for( int i = 0; i < 8; i++ )
System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray[i], sequence.substring(suffixArray[i],Math.min(suffixArray[i]+100,sequence.length())));
int[] suffixArrayData = creator.createSuffixArray(sequence);
// Invert the suffix array and print diagnostics.
int[] inverseSuffixArray = creator.invertSuffixArray(suffixArray);
int[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
for( int i = 0; i < 8; i++ )
System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length())));
for( int i = 0; i < 8; i++ )
System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length())));
// Create the data structure for the compressed suffix array and print diagnostics.
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray,inverseSuffixArray);
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
int reconstructedInverseSA = compressedSuffixArray[0];
for( int i = 0; i < 8; i++ ) {
System.out.printf("compressedSuffixArray[%d] = %d (SA-1[%d] = %d)%n", i, compressedSuffixArray[i], i, reconstructedInverseSA);
@ -160,64 +167,39 @@ public class CreateBWTFromReference {
System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
}
// Count the occurences of each given base.
int[] occurrences = creator.countOccurrences(sequence);
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]);
// Create the BWT.
byte[] bwt = creator.createBWT(sequence, suffixArray);
BWT bwt = new BWT( inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence) );
String bwtAsString = new String(bwt);
String bwtAsString = new String(bwt.sequence);
System.out.printf("BWT: %s...%n", bwtAsString.substring(0,80));
OutputStream bwtOutputStream = new BufferedOutputStream(new FileOutputStream(bwtFile));
BWTWriter bwtWriter = new BWTWriter(bwtFile);
bwtWriter.write(bwt);
bwtWriter.close();
ByteBuffer buffer = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);
buffer.putInt(inverseSuffixArray[0]);
bwtOutputStream.write(buffer.array());
bwtOutputStream.flush();
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
saWriter.write(suffixArray);
saWriter.close();
IntPackedOutputStream occurrenceWriter = new IntPackedOutputStream(bwtOutputStream,ByteOrder.LITTLE_ENDIAN);
occurrenceWriter.write(occurrences);
occurrenceWriter.flush();
File existingBWTFile = new File(inputFileName+".bwt");
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
BWT existingBWT = existingBWTReader.read();
BasePackedOutputStream<Integer> sequenceOutputStream = new BasePackedOutputStream<Integer>(Integer.class,bwtOutputStream,ByteOrder.LITTLE_ENDIAN);
sequenceOutputStream.write(bwt);
sequenceOutputStream.close();
OutputStream saOutputStream = new BufferedOutputStream(new FileOutputStream(saFile));
IntPackedOutputStream saIntWriter = new IntPackedOutputStream(saOutputStream,ByteOrder.LITTLE_ENDIAN);
// SA file format is 'primary' (= SA-1[0]?), occurrence array, interval, sequence length, SA[]
saIntWriter.write(inverseSuffixArray[0]);
saIntWriter.write(occurrences);
saIntWriter.write(1);
saIntWriter.write(suffixArray.length-1);
saIntWriter.write(suffixArray, 1, suffixArray.length-1);
saIntWriter.close();
File existingBwtFile = new File(inputFileName+".bwt");
InputStream existingBwtStream = new BufferedInputStream(new FileInputStream(existingBwtFile));
IntPackedInputStream existingIntReader = new IntPackedInputStream(existingBwtStream,ByteOrder.LITTLE_ENDIAN);
int existingFirstInverseSA = existingIntReader.read();
int[] existingOccurrences = new int[4];
existingIntReader.read(existingOccurrences);
BasePackedInputStream inputStream = new BasePackedInputStream<Integer>(Integer.class,existingBwtStream,ByteOrder.LITTLE_ENDIAN);
byte[] existingBwt = inputStream.read(existingOccurrences[3]);
String existingBwtAsString = new String(existingBwt);
String existingBwtAsString = new String(existingBWT.sequence);
System.out.printf("Existing BWT: %s...%n",existingBwtAsString.substring(0,80));
for( int i = 0; i < bwt.length; i++ ) {
if( bwt[i] != existingBwt[i] ) {
System.out.printf("First bwt mismatch: %d%n",i);
break;
}
for( int i = 0; i < bwt.sequence.length; i++ ) {
if( bwt.sequence[i] != existingBWT.sequence[i] )
throw new StingException("BWT mismatch at " + i);
}
File existingSAFile = new File(inputFileName+".sa");
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile);
SuffixArray existingSuffixArray = existingSuffixArrayReader.read();
for( int i = 0; i < suffixArray.sequence.length; i++ ) {
if( suffixArray.sequence[i] != existingSuffixArray.sequence[i] )
throw new StingException("Suffix array mismatch at " + i);
}
}

View File

@ -35,9 +35,8 @@ public class IntPackedInputStream {
* Read ints from the given InputStream.
* @param inputStream Input stream from which to read ints.
* @param byteOrder Endianness to use when writing a list of integers.
* @throws IOException if an I/O error occurs.
*/
public IntPackedInputStream(InputStream inputStream, ByteOrder byteOrder) throws IOException {
public IntPackedInputStream(InputStream inputStream, ByteOrder byteOrder) {
this.targetInputStream = inputStream;
this.buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder);
}
@ -59,7 +58,18 @@ public class IntPackedInputStream {
* @throws IOException if an I/O error occurs.
*/
public void read( int[] data ) throws IOException {
for(int i = 0; i < data.length; i++) {
read( data, 0, data.length );
}
/**
* Read the data from the input stream, starting at the given offset.
* @param data placeholder for input data.
* @param offset place in the array to start reading in data.
* @param length number of ints to read in.
* @throws IOException if an I/O error occurs.
*/
public void read( int[] data, int offset, int length ) throws IOException {
for(int i = offset; i < offset+length; i++) {
targetInputStream.read(buffer.array());
data[i] = buffer.getInt();
buffer.rewind();

View File

@ -62,7 +62,7 @@ public class IntPackedOutputStream {
* @param byteOrder Endianness to use when writing a list of integers.
* @throws IOException if an I/O error occurs.
*/
public IntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) throws IOException {
public IntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) {
this.targetOutputStream = outputStream;
buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder);
}

View File

@ -0,0 +1,34 @@
package org.broadinstitute.sting.bwa;
/**
* An in-memory representation of a suffix array.
*
* @author mhanna
* @version 0.1
*/
public class SuffixArray {
public final int inverseSA0;
public final int[] occurrences;
public final int[] sequence;
/**
* Creates a new sequence array with the given inverse SA, occurrences, and values.
* @param inverseSA0 Inverse SA entry for the first element.
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
* @param sequence The full suffix array.
*/
public SuffixArray(int inverseSA0, int[] occurrences, int[] sequence) {
this.inverseSA0 = inverseSA0;
this.occurrences = occurrences;
this.sequence = sequence;
}
/**
* Retrieves the length of the sequence array.
* @return Length of the suffix array.
*/
public int length() {
return sequence.length;
}
}

View File

@ -0,0 +1,73 @@
package org.broadinstitute.sting.bwa;
import org.broadinstitute.sting.utils.StingException;
import java.io.*;
import java.nio.ByteOrder;
/**
* A reader for suffix arrays in permanent storage.
*
* @author mhanna
* @version 0.1
*/
public class SuffixArrayReader {
/**
* Input stream from which to read suffix array data.
*/
private InputStream inputStream;
/**
* Create a new suffix array reader.
* @param inputFile File in which the suffix array is stored.
*/
public SuffixArrayReader( File inputFile ) {
try {
this.inputStream = new BufferedInputStream(new FileInputStream(inputFile));
}
catch( FileNotFoundException ex ) {
throw new StingException("Unable to open input file", ex);
}
}
/**
* Read a suffix array from the input stream.
* @return The suffix array stored in the input stream.
*/
public SuffixArray read() {
IntPackedInputStream intPackedInputStream = new IntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
int inverseSA0;
int[] occurrences;
int[] suffixArray;
try {
inverseSA0 = intPackedInputStream.read();
occurrences = new int[PackUtils.ALPHABET_SIZE];
intPackedInputStream.read(occurrences);
// Throw away the suffix array size in bytes and use the occurrences table directly.
intPackedInputStream.read();
int suffixArraySize = occurrences[PackUtils.ALPHABET_SIZE-1]+1;
suffixArray = new int[suffixArraySize];
intPackedInputStream.read(suffixArray);
}
catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex);
}
return new SuffixArray(inverseSA0, occurrences, suffixArray);
}
/**
* Close the input stream.
*/
public void close() {
try {
inputStream.close();
}
catch( IOException ex ) {
throw new StingException("Unable to close input file", ex);
}
}
}

View File

@ -0,0 +1,66 @@
package org.broadinstitute.sting.bwa;
import org.broadinstitute.sting.utils.StingException;
import java.io.*;
import java.nio.ByteOrder;
/**
* Javadoc goes here.
*
* @author mhanna
* @version 0.1
*/
public class SuffixArrayWriter {
/**
* Input stream from which to read suffix array data.
*/
private OutputStream outputStream;
/**
* Create a new suffix array reader.
* @param inputFile File in which the suffix array is stored.
*/
public SuffixArrayWriter( File inputFile ) {
try {
this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile));
}
catch( FileNotFoundException ex ) {
throw new StingException("Unable to open input file", ex);
}
}
/**
* Write a suffix array to the output stream.
* @param suffixArray suffix array to write.
*/
public void write(SuffixArray suffixArray) {
IntPackedOutputStream intPackedOutputStream = new IntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
try {
intPackedOutputStream.write(suffixArray.inverseSA0);
intPackedOutputStream.write(suffixArray.occurrences);
// How frequently the suffix array entry is placed.
intPackedOutputStream.write(1);
// Length of the suffix array.
intPackedOutputStream.write(suffixArray.length()-1);
intPackedOutputStream.write(suffixArray.sequence, 1, suffixArray.length()-1);
}
catch( IOException ex ) {
throw new StingException("Unable to read BWT from input stream.", ex);
}
}
/**
* Close the input stream.
*/
public void close() {
try {
outputStream.close();
}
catch( IOException ex ) {
throw new StingException("Unable to close input file", ex);
}
}
}