Higher-level readers and writers for BWTs and suffix arrays.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1573 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0093482c62
commit
6de54dcd2a
|
|
@ -0,0 +1,25 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
/**
|
||||
* Represents the Burrows-Wheeler Transform of a reference sequence.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWT {
|
||||
public final int inverseSA0;
|
||||
public final int[] occurrences;
|
||||
public final byte[] sequence;
|
||||
|
||||
/**
|
||||
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
|
||||
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
|
||||
* @param sequence The full BWT sequence, sans the '$'.
|
||||
*/
|
||||
public BWT( int inverseSA0, int[] occurrences, byte[] sequence ) {
|
||||
this.inverseSA0 = inverseSA0;
|
||||
this.occurrences = occurrences;
|
||||
this.sequence = sequence;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Reads a BWT from a given file.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWTReader {
|
||||
/**
|
||||
* Input stream from which to read BWT data.
|
||||
*/
|
||||
private InputStream inputStream;
|
||||
|
||||
/**
|
||||
* Create a new BWT reader.
|
||||
* @param inputFile File in which the BWT is stored.
|
||||
*/
|
||||
public BWTReader( File inputFile ) {
|
||||
try {
|
||||
this.inputStream = new BufferedInputStream(new FileInputStream(inputFile));
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new StingException("Unable to open input file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a BWT from the input stream.
|
||||
* @return The BWT stored in the input stream.
|
||||
*/
|
||||
public BWT read() {
|
||||
IntPackedInputStream intPackedInputStream = new IntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
BasePackedInputStream basePackedInputStream = new BasePackedInputStream<Integer>(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
int inverseSA0;
|
||||
int[] occurrences;
|
||||
byte[] bwt;
|
||||
|
||||
try {
|
||||
inverseSA0 = intPackedInputStream.read();
|
||||
occurrences = new int[PackUtils.ALPHABET_SIZE];
|
||||
intPackedInputStream.read(occurrences);
|
||||
bwt = basePackedInputStream.read(occurrences[PackUtils.ALPHABET_SIZE-1]);
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
|
||||
return new BWT(inverseSA0, occurrences, bwt);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the input stream.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
inputStream.close();
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to close input file", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Writes an in-memory BWT to an outputstream.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BWTWriter {
|
||||
/**
|
||||
* Input stream from which to read BWT data.
|
||||
*/
|
||||
private OutputStream outputStream;
|
||||
|
||||
/**
|
||||
* Create a new BWT reader.
|
||||
* @param inputFile File in which the BWT is stored.
|
||||
*/
|
||||
public BWTWriter( File inputFile ) {
|
||||
try {
|
||||
this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile));
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new StingException("Unable to open output file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a BWT to the output stream.
|
||||
* @param bwt Transform to be written to the output stream.
|
||||
*/
|
||||
public void write( BWT bwt ) {
|
||||
IntPackedOutputStream intPackedOutputStream = new IntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream<Integer>(Integer.class, outputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
try {
|
||||
intPackedOutputStream.write(bwt.inverseSA0);
|
||||
intPackedOutputStream.write(bwt.occurrences);
|
||||
basePackedOutputStream.write(bwt.sequence);
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the input stream.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
outputStream.close();
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to close input file", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -32,7 +32,7 @@ public class BasePackedInputStream<T> {
|
|||
this(type,new BufferedInputStream(new FileInputStream(inputFile)),byteOrder);
|
||||
}
|
||||
|
||||
public BasePackedInputStream( Class<T> type, InputStream inputStream, ByteOrder byteOrder ) throws FileNotFoundException {
|
||||
public BasePackedInputStream( Class<T> type, InputStream inputStream, ByteOrder byteOrder ) {
|
||||
if( type != Integer.class )
|
||||
throw new StingException("Only bases packed into 32-bit words are currently supported by this input stream. Type specified: " + type.getName());
|
||||
|
||||
|
|
@ -44,6 +44,7 @@ public class BasePackedInputStream<T> {
|
|||
/**
|
||||
* Read the entire contents of the input stream.
|
||||
* @param length number of bases to read from the stream.
|
||||
* @return a byte array of the given length.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public byte[] read( int length ) throws IOException {
|
||||
|
|
|
|||
|
|
@ -33,8 +33,8 @@ import net.sf.samtools.util.StringUtil;
|
|||
import java.io.*;
|
||||
import java.util.TreeSet;
|
||||
import java.util.Comparator;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/**
|
||||
* Create a suffix array data structure.
|
||||
|
|
@ -136,18 +136,25 @@ public class CreateBWTFromReference {
|
|||
|
||||
String sequence = creator.loadReference(inputFile);
|
||||
|
||||
// Count the occurences of each given base.
|
||||
int[] occurrences = creator.countOccurrences(sequence);
|
||||
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]);
|
||||
|
||||
// Generate the suffix array and print diagnostics.
|
||||
int[] suffixArray = creator.createSuffixArray(sequence);
|
||||
for( int i = 0; i < 8; i++ )
|
||||
System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray[i], sequence.substring(suffixArray[i],Math.min(suffixArray[i]+100,sequence.length())));
|
||||
int[] suffixArrayData = creator.createSuffixArray(sequence);
|
||||
|
||||
// Invert the suffix array and print diagnostics.
|
||||
int[] inverseSuffixArray = creator.invertSuffixArray(suffixArray);
|
||||
int[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
|
||||
|
||||
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
|
||||
|
||||
for( int i = 0; i < 8; i++ )
|
||||
System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length())));
|
||||
for( int i = 0; i < 8; i++ )
|
||||
System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length())));
|
||||
|
||||
// Create the data structure for the compressed suffix array and print diagnostics.
|
||||
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray,inverseSuffixArray);
|
||||
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
|
||||
int reconstructedInverseSA = compressedSuffixArray[0];
|
||||
for( int i = 0; i < 8; i++ ) {
|
||||
System.out.printf("compressedSuffixArray[%d] = %d (SA-1[%d] = %d)%n", i, compressedSuffixArray[i], i, reconstructedInverseSA);
|
||||
|
|
@ -160,64 +167,39 @@ public class CreateBWTFromReference {
|
|||
System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
|
||||
}
|
||||
|
||||
// Count the occurences of each given base.
|
||||
int[] occurrences = creator.countOccurrences(sequence);
|
||||
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]);
|
||||
|
||||
// Create the BWT.
|
||||
byte[] bwt = creator.createBWT(sequence, suffixArray);
|
||||
BWT bwt = new BWT( inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence) );
|
||||
|
||||
String bwtAsString = new String(bwt);
|
||||
String bwtAsString = new String(bwt.sequence);
|
||||
System.out.printf("BWT: %s...%n", bwtAsString.substring(0,80));
|
||||
|
||||
OutputStream bwtOutputStream = new BufferedOutputStream(new FileOutputStream(bwtFile));
|
||||
BWTWriter bwtWriter = new BWTWriter(bwtFile);
|
||||
bwtWriter.write(bwt);
|
||||
bwtWriter.close();
|
||||
|
||||
ByteBuffer buffer = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);
|
||||
buffer.putInt(inverseSuffixArray[0]);
|
||||
bwtOutputStream.write(buffer.array());
|
||||
bwtOutputStream.flush();
|
||||
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
|
||||
saWriter.write(suffixArray);
|
||||
saWriter.close();
|
||||
|
||||
IntPackedOutputStream occurrenceWriter = new IntPackedOutputStream(bwtOutputStream,ByteOrder.LITTLE_ENDIAN);
|
||||
occurrenceWriter.write(occurrences);
|
||||
occurrenceWriter.flush();
|
||||
File existingBWTFile = new File(inputFileName+".bwt");
|
||||
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
|
||||
BWT existingBWT = existingBWTReader.read();
|
||||
|
||||
BasePackedOutputStream<Integer> sequenceOutputStream = new BasePackedOutputStream<Integer>(Integer.class,bwtOutputStream,ByteOrder.LITTLE_ENDIAN);
|
||||
sequenceOutputStream.write(bwt);
|
||||
sequenceOutputStream.close();
|
||||
|
||||
OutputStream saOutputStream = new BufferedOutputStream(new FileOutputStream(saFile));
|
||||
IntPackedOutputStream saIntWriter = new IntPackedOutputStream(saOutputStream,ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
// SA file format is 'primary' (= SA-1[0]?), occurrence array, interval, sequence length, SA[]
|
||||
saIntWriter.write(inverseSuffixArray[0]);
|
||||
saIntWriter.write(occurrences);
|
||||
saIntWriter.write(1);
|
||||
saIntWriter.write(suffixArray.length-1);
|
||||
saIntWriter.write(suffixArray, 1, suffixArray.length-1);
|
||||
|
||||
saIntWriter.close();
|
||||
|
||||
File existingBwtFile = new File(inputFileName+".bwt");
|
||||
InputStream existingBwtStream = new BufferedInputStream(new FileInputStream(existingBwtFile));
|
||||
|
||||
IntPackedInputStream existingIntReader = new IntPackedInputStream(existingBwtStream,ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
int existingFirstInverseSA = existingIntReader.read();
|
||||
|
||||
int[] existingOccurrences = new int[4];
|
||||
existingIntReader.read(existingOccurrences);
|
||||
|
||||
BasePackedInputStream inputStream = new BasePackedInputStream<Integer>(Integer.class,existingBwtStream,ByteOrder.LITTLE_ENDIAN);
|
||||
byte[] existingBwt = inputStream.read(existingOccurrences[3]);
|
||||
|
||||
String existingBwtAsString = new String(existingBwt);
|
||||
String existingBwtAsString = new String(existingBWT.sequence);
|
||||
System.out.printf("Existing BWT: %s...%n",existingBwtAsString.substring(0,80));
|
||||
|
||||
for( int i = 0; i < bwt.length; i++ ) {
|
||||
if( bwt[i] != existingBwt[i] ) {
|
||||
System.out.printf("First bwt mismatch: %d%n",i);
|
||||
break;
|
||||
}
|
||||
for( int i = 0; i < bwt.sequence.length; i++ ) {
|
||||
if( bwt.sequence[i] != existingBWT.sequence[i] )
|
||||
throw new StingException("BWT mismatch at " + i);
|
||||
}
|
||||
|
||||
File existingSAFile = new File(inputFileName+".sa");
|
||||
SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile);
|
||||
SuffixArray existingSuffixArray = existingSuffixArrayReader.read();
|
||||
|
||||
for( int i = 0; i < suffixArray.sequence.length; i++ ) {
|
||||
if( suffixArray.sequence[i] != existingSuffixArray.sequence[i] )
|
||||
throw new StingException("Suffix array mismatch at " + i);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -35,9 +35,8 @@ public class IntPackedInputStream {
|
|||
* Read ints from the given InputStream.
|
||||
* @param inputStream Input stream from which to read ints.
|
||||
* @param byteOrder Endianness to use when writing a list of integers.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public IntPackedInputStream(InputStream inputStream, ByteOrder byteOrder) throws IOException {
|
||||
public IntPackedInputStream(InputStream inputStream, ByteOrder byteOrder) {
|
||||
this.targetInputStream = inputStream;
|
||||
this.buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder);
|
||||
}
|
||||
|
|
@ -59,7 +58,18 @@ public class IntPackedInputStream {
|
|||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void read( int[] data ) throws IOException {
|
||||
for(int i = 0; i < data.length; i++) {
|
||||
read( data, 0, data.length );
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the data from the input stream, starting at the given offset.
|
||||
* @param data placeholder for input data.
|
||||
* @param offset place in the array to start reading in data.
|
||||
* @param length number of ints to read in.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void read( int[] data, int offset, int length ) throws IOException {
|
||||
for(int i = offset; i < offset+length; i++) {
|
||||
targetInputStream.read(buffer.array());
|
||||
data[i] = buffer.getInt();
|
||||
buffer.rewind();
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ public class IntPackedOutputStream {
|
|||
* @param byteOrder Endianness to use when writing a list of integers.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public IntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) throws IOException {
|
||||
public IntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) {
|
||||
this.targetOutputStream = outputStream;
|
||||
buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,34 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
/**
|
||||
* An in-memory representation of a suffix array.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SuffixArray {
|
||||
public final int inverseSA0;
|
||||
public final int[] occurrences;
|
||||
public final int[] sequence;
|
||||
|
||||
/**
|
||||
* Creates a new sequence array with the given inverse SA, occurrences, and values.
|
||||
* @param inverseSA0 Inverse SA entry for the first element.
|
||||
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
|
||||
* @param sequence The full suffix array.
|
||||
*/
|
||||
public SuffixArray(int inverseSA0, int[] occurrences, int[] sequence) {
|
||||
this.inverseSA0 = inverseSA0;
|
||||
this.occurrences = occurrences;
|
||||
this.sequence = sequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the length of the sequence array.
|
||||
* @return Length of the suffix array.
|
||||
*/
|
||||
public int length() {
|
||||
return sequence.length;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* A reader for suffix arrays in permanent storage.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SuffixArrayReader {
|
||||
/**
|
||||
* Input stream from which to read suffix array data.
|
||||
*/
|
||||
private InputStream inputStream;
|
||||
|
||||
/**
|
||||
* Create a new suffix array reader.
|
||||
* @param inputFile File in which the suffix array is stored.
|
||||
*/
|
||||
public SuffixArrayReader( File inputFile ) {
|
||||
try {
|
||||
this.inputStream = new BufferedInputStream(new FileInputStream(inputFile));
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new StingException("Unable to open input file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a suffix array from the input stream.
|
||||
* @return The suffix array stored in the input stream.
|
||||
*/
|
||||
public SuffixArray read() {
|
||||
IntPackedInputStream intPackedInputStream = new IntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
int inverseSA0;
|
||||
int[] occurrences;
|
||||
int[] suffixArray;
|
||||
|
||||
try {
|
||||
inverseSA0 = intPackedInputStream.read();
|
||||
occurrences = new int[PackUtils.ALPHABET_SIZE];
|
||||
intPackedInputStream.read(occurrences);
|
||||
// Throw away the suffix array size in bytes and use the occurrences table directly.
|
||||
intPackedInputStream.read();
|
||||
int suffixArraySize = occurrences[PackUtils.ALPHABET_SIZE-1]+1;
|
||||
suffixArray = new int[suffixArraySize];
|
||||
intPackedInputStream.read(suffixArray);
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
|
||||
return new SuffixArray(inverseSA0, occurrences, suffixArray);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Close the input stream.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
inputStream.close();
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to close input file", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Javadoc goes here.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class SuffixArrayWriter {
|
||||
/**
|
||||
* Input stream from which to read suffix array data.
|
||||
*/
|
||||
private OutputStream outputStream;
|
||||
|
||||
/**
|
||||
* Create a new suffix array reader.
|
||||
* @param inputFile File in which the suffix array is stored.
|
||||
*/
|
||||
public SuffixArrayWriter( File inputFile ) {
|
||||
try {
|
||||
this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile));
|
||||
}
|
||||
catch( FileNotFoundException ex ) {
|
||||
throw new StingException("Unable to open input file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a suffix array to the output stream.
|
||||
* @param suffixArray suffix array to write.
|
||||
*/
|
||||
public void write(SuffixArray suffixArray) {
|
||||
IntPackedOutputStream intPackedOutputStream = new IntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN);
|
||||
|
||||
try {
|
||||
intPackedOutputStream.write(suffixArray.inverseSA0);
|
||||
intPackedOutputStream.write(suffixArray.occurrences);
|
||||
// How frequently the suffix array entry is placed.
|
||||
intPackedOutputStream.write(1);
|
||||
// Length of the suffix array.
|
||||
intPackedOutputStream.write(suffixArray.length()-1);
|
||||
intPackedOutputStream.write(suffixArray.sequence, 1, suffixArray.length()-1);
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Close the input stream.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
outputStream.close();
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new StingException("Unable to close input file", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue