From 6de54dcd2a32f7172950788ebf8952315d547e2d Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 9 Sep 2009 22:45:32 +0000 Subject: [PATCH] Higher-level readers and writers for BWTs and suffix arrays. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1573 348d0f76-0448-11de-a6fe-93d51630548a --- .../src/org/broadinstitute/sting/bwa/BWT.java | 25 +++++ .../broadinstitute/sting/bwa/BWTReader.java | 69 ++++++++++++++ .../broadinstitute/sting/bwa/BWTWriter.java | 62 ++++++++++++ .../sting/bwa/BasePackedInputStream.java | 3 +- .../sting/bwa/CreateBWTFromReference.java | 94 ++++++++----------- .../sting/bwa/IntPackedInputStream.java | 16 +++- .../sting/bwa/IntPackedOutputStream.java | 2 +- .../broadinstitute/sting/bwa/SuffixArray.java | 34 +++++++ .../sting/bwa/SuffixArrayReader.java | 73 ++++++++++++++ .../sting/bwa/SuffixArrayWriter.java | 66 +++++++++++++ 10 files changed, 383 insertions(+), 61 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/bwa/BWT.java create mode 100644 java/src/org/broadinstitute/sting/bwa/BWTReader.java create mode 100644 java/src/org/broadinstitute/sting/bwa/BWTWriter.java create mode 100644 java/src/org/broadinstitute/sting/bwa/SuffixArray.java create mode 100644 java/src/org/broadinstitute/sting/bwa/SuffixArrayReader.java create mode 100644 java/src/org/broadinstitute/sting/bwa/SuffixArrayWriter.java diff --git a/java/src/org/broadinstitute/sting/bwa/BWT.java b/java/src/org/broadinstitute/sting/bwa/BWT.java new file mode 100644 index 000000000..3ff871d3e --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/BWT.java @@ -0,0 +1,25 @@ +package org.broadinstitute.sting.bwa; + +/** + * Represents the Burrows-Wheeler Transform of a reference sequence. + * + * @author mhanna + * @version 0.1 + */ +public class BWT { + public final int inverseSA0; + public final int[] occurrences; + public final byte[] sequence; + + /** + * Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII). + * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence. + * @param occurrences Cumulative number of occurrences of A,C,G,T, in order. + * @param sequence The full BWT sequence, sans the '$'. + */ + public BWT( int inverseSA0, int[] occurrences, byte[] sequence ) { + this.inverseSA0 = inverseSA0; + this.occurrences = occurrences; + this.sequence = sequence; + } +} diff --git a/java/src/org/broadinstitute/sting/bwa/BWTReader.java b/java/src/org/broadinstitute/sting/bwa/BWTReader.java new file mode 100644 index 000000000..33c0781c1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/BWTReader.java @@ -0,0 +1,69 @@ +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.nio.ByteOrder; + +/** + * Reads a BWT from a given file. + * + * @author mhanna + * @version 0.1 + */ +public class BWTReader { + /** + * Input stream from which to read BWT data. + */ + private InputStream inputStream; + + /** + * Create a new BWT reader. + * @param inputFile File in which the BWT is stored. + */ + public BWTReader( File inputFile ) { + try { + this.inputStream = new BufferedInputStream(new FileInputStream(inputFile)); + } + catch( FileNotFoundException ex ) { + throw new StingException("Unable to open input file", ex); + } + } + + /** + * Read a BWT from the input stream. + * @return The BWT stored in the input stream. + */ + public BWT read() { + IntPackedInputStream intPackedInputStream = new IntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN); + BasePackedInputStream basePackedInputStream = new BasePackedInputStream(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN); + + int inverseSA0; + int[] occurrences; + byte[] bwt; + + try { + inverseSA0 = intPackedInputStream.read(); + occurrences = new int[PackUtils.ALPHABET_SIZE]; + intPackedInputStream.read(occurrences); + bwt = basePackedInputStream.read(occurrences[PackUtils.ALPHABET_SIZE-1]); + } + catch( IOException ex ) { + throw new StingException("Unable to read BWT from input stream.", ex); + } + + return new BWT(inverseSA0, occurrences, bwt); + } + + /** + * Close the input stream. + */ + public void close() { + try { + inputStream.close(); + } + catch( IOException ex ) { + throw new StingException("Unable to close input file", ex); + } + } +} diff --git a/java/src/org/broadinstitute/sting/bwa/BWTWriter.java b/java/src/org/broadinstitute/sting/bwa/BWTWriter.java new file mode 100644 index 000000000..210977716 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/BWTWriter.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.nio.ByteOrder; + +/** + * Writes an in-memory BWT to an outputstream. + * + * @author mhanna + * @version 0.1 + */ +public class BWTWriter { + /** + * Input stream from which to read BWT data. + */ + private OutputStream outputStream; + + /** + * Create a new BWT reader. + * @param inputFile File in which the BWT is stored. + */ + public BWTWriter( File inputFile ) { + try { + this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile)); + } + catch( FileNotFoundException ex ) { + throw new StingException("Unable to open output file", ex); + } + } + + /** + * Write a BWT to the output stream. + * @param bwt Transform to be written to the output stream. + */ + public void write( BWT bwt ) { + IntPackedOutputStream intPackedOutputStream = new IntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN); + BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream(Integer.class, outputStream, ByteOrder.LITTLE_ENDIAN); + + try { + intPackedOutputStream.write(bwt.inverseSA0); + intPackedOutputStream.write(bwt.occurrences); + basePackedOutputStream.write(bwt.sequence); + } + catch( IOException ex ) { + throw new StingException("Unable to read BWT from input stream.", ex); + } + } + + /** + * Close the input stream. + */ + public void close() { + try { + outputStream.close(); + } + catch( IOException ex ) { + throw new StingException("Unable to close input file", ex); + } + } +} diff --git a/java/src/org/broadinstitute/sting/bwa/BasePackedInputStream.java b/java/src/org/broadinstitute/sting/bwa/BasePackedInputStream.java index fe47af206..a03328082 100644 --- a/java/src/org/broadinstitute/sting/bwa/BasePackedInputStream.java +++ b/java/src/org/broadinstitute/sting/bwa/BasePackedInputStream.java @@ -32,7 +32,7 @@ public class BasePackedInputStream { this(type,new BufferedInputStream(new FileInputStream(inputFile)),byteOrder); } - public BasePackedInputStream( Class type, InputStream inputStream, ByteOrder byteOrder ) throws FileNotFoundException { + public BasePackedInputStream( Class type, InputStream inputStream, ByteOrder byteOrder ) { if( type != Integer.class ) throw new StingException("Only bases packed into 32-bit words are currently supported by this input stream. Type specified: " + type.getName()); @@ -44,6 +44,7 @@ public class BasePackedInputStream { /** * Read the entire contents of the input stream. * @param length number of bases to read from the stream. + * @return a byte array of the given length. * @throws IOException if an I/O error occurs. */ public byte[] read( int length ) throws IOException { diff --git a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java index adc3442e1..7f05d871f 100755 --- a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java +++ b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java @@ -33,8 +33,8 @@ import net.sf.samtools.util.StringUtil; import java.io.*; import java.util.TreeSet; import java.util.Comparator; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; + +import org.broadinstitute.sting.utils.StingException; /** * Create a suffix array data structure. @@ -136,18 +136,25 @@ public class CreateBWTFromReference { String sequence = creator.loadReference(inputFile); + // Count the occurences of each given base. + int[] occurrences = creator.countOccurrences(sequence); + System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]); + // Generate the suffix array and print diagnostics. - int[] suffixArray = creator.createSuffixArray(sequence); - for( int i = 0; i < 8; i++ ) - System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray[i], sequence.substring(suffixArray[i],Math.min(suffixArray[i]+100,sequence.length()))); + int[] suffixArrayData = creator.createSuffixArray(sequence); // Invert the suffix array and print diagnostics. - int[] inverseSuffixArray = creator.invertSuffixArray(suffixArray); + int[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData); + + SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData ); + + for( int i = 0; i < 8; i++ ) + System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length()))); for( int i = 0; i < 8; i++ ) System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length()))); // Create the data structure for the compressed suffix array and print diagnostics. - int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray,inverseSuffixArray); + int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray); int reconstructedInverseSA = compressedSuffixArray[0]; for( int i = 0; i < 8; i++ ) { System.out.printf("compressedSuffixArray[%d] = %d (SA-1[%d] = %d)%n", i, compressedSuffixArray[i], i, reconstructedInverseSA); @@ -160,64 +167,39 @@ public class CreateBWTFromReference { System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]); } - // Count the occurences of each given base. - int[] occurrences = creator.countOccurrences(sequence); - System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]); - // Create the BWT. - byte[] bwt = creator.createBWT(sequence, suffixArray); + BWT bwt = new BWT( inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence) ); - String bwtAsString = new String(bwt); + String bwtAsString = new String(bwt.sequence); System.out.printf("BWT: %s...%n", bwtAsString.substring(0,80)); - OutputStream bwtOutputStream = new BufferedOutputStream(new FileOutputStream(bwtFile)); + BWTWriter bwtWriter = new BWTWriter(bwtFile); + bwtWriter.write(bwt); + bwtWriter.close(); - ByteBuffer buffer = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN); - buffer.putInt(inverseSuffixArray[0]); - bwtOutputStream.write(buffer.array()); - bwtOutputStream.flush(); + SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile); + saWriter.write(suffixArray); + saWriter.close(); - IntPackedOutputStream occurrenceWriter = new IntPackedOutputStream(bwtOutputStream,ByteOrder.LITTLE_ENDIAN); - occurrenceWriter.write(occurrences); - occurrenceWriter.flush(); + File existingBWTFile = new File(inputFileName+".bwt"); + BWTReader existingBWTReader = new BWTReader(existingBWTFile); + BWT existingBWT = existingBWTReader.read(); - BasePackedOutputStream sequenceOutputStream = new BasePackedOutputStream(Integer.class,bwtOutputStream,ByteOrder.LITTLE_ENDIAN); - sequenceOutputStream.write(bwt); - sequenceOutputStream.close(); - - OutputStream saOutputStream = new BufferedOutputStream(new FileOutputStream(saFile)); - IntPackedOutputStream saIntWriter = new IntPackedOutputStream(saOutputStream,ByteOrder.LITTLE_ENDIAN); - - // SA file format is 'primary' (= SA-1[0]?), occurrence array, interval, sequence length, SA[] - saIntWriter.write(inverseSuffixArray[0]); - saIntWriter.write(occurrences); - saIntWriter.write(1); - saIntWriter.write(suffixArray.length-1); - saIntWriter.write(suffixArray, 1, suffixArray.length-1); - - saIntWriter.close(); - - File existingBwtFile = new File(inputFileName+".bwt"); - InputStream existingBwtStream = new BufferedInputStream(new FileInputStream(existingBwtFile)); - - IntPackedInputStream existingIntReader = new IntPackedInputStream(existingBwtStream,ByteOrder.LITTLE_ENDIAN); - - int existingFirstInverseSA = existingIntReader.read(); - - int[] existingOccurrences = new int[4]; - existingIntReader.read(existingOccurrences); - - BasePackedInputStream inputStream = new BasePackedInputStream(Integer.class,existingBwtStream,ByteOrder.LITTLE_ENDIAN); - byte[] existingBwt = inputStream.read(existingOccurrences[3]); - - String existingBwtAsString = new String(existingBwt); + String existingBwtAsString = new String(existingBWT.sequence); System.out.printf("Existing BWT: %s...%n",existingBwtAsString.substring(0,80)); - for( int i = 0; i < bwt.length; i++ ) { - if( bwt[i] != existingBwt[i] ) { - System.out.printf("First bwt mismatch: %d%n",i); - break; - } + for( int i = 0; i < bwt.sequence.length; i++ ) { + if( bwt.sequence[i] != existingBWT.sequence[i] ) + throw new StingException("BWT mismatch at " + i); + } + + File existingSAFile = new File(inputFileName+".sa"); + SuffixArrayReader existingSuffixArrayReader = new SuffixArrayReader(existingSAFile); + SuffixArray existingSuffixArray = existingSuffixArrayReader.read(); + + for( int i = 0; i < suffixArray.sequence.length; i++ ) { + if( suffixArray.sequence[i] != existingSuffixArray.sequence[i] ) + throw new StingException("Suffix array mismatch at " + i); } } diff --git a/java/src/org/broadinstitute/sting/bwa/IntPackedInputStream.java b/java/src/org/broadinstitute/sting/bwa/IntPackedInputStream.java index 47e4ad71d..8de0ca8d6 100644 --- a/java/src/org/broadinstitute/sting/bwa/IntPackedInputStream.java +++ b/java/src/org/broadinstitute/sting/bwa/IntPackedInputStream.java @@ -35,9 +35,8 @@ public class IntPackedInputStream { * Read ints from the given InputStream. * @param inputStream Input stream from which to read ints. * @param byteOrder Endianness to use when writing a list of integers. - * @throws IOException if an I/O error occurs. */ - public IntPackedInputStream(InputStream inputStream, ByteOrder byteOrder) throws IOException { + public IntPackedInputStream(InputStream inputStream, ByteOrder byteOrder) { this.targetInputStream = inputStream; this.buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder); } @@ -59,7 +58,18 @@ public class IntPackedInputStream { * @throws IOException if an I/O error occurs. */ public void read( int[] data ) throws IOException { - for(int i = 0; i < data.length; i++) { + read( data, 0, data.length ); + } + + /** + * Read the data from the input stream, starting at the given offset. + * @param data placeholder for input data. + * @param offset place in the array to start reading in data. + * @param length number of ints to read in. + * @throws IOException if an I/O error occurs. + */ + public void read( int[] data, int offset, int length ) throws IOException { + for(int i = offset; i < offset+length; i++) { targetInputStream.read(buffer.array()); data[i] = buffer.getInt(); buffer.rewind(); diff --git a/java/src/org/broadinstitute/sting/bwa/IntPackedOutputStream.java b/java/src/org/broadinstitute/sting/bwa/IntPackedOutputStream.java index 315976b10..e9386de37 100755 --- a/java/src/org/broadinstitute/sting/bwa/IntPackedOutputStream.java +++ b/java/src/org/broadinstitute/sting/bwa/IntPackedOutputStream.java @@ -62,7 +62,7 @@ public class IntPackedOutputStream { * @param byteOrder Endianness to use when writing a list of integers. * @throws IOException if an I/O error occurs. */ - public IntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) throws IOException { + public IntPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) { this.targetOutputStream = outputStream; buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder); } diff --git a/java/src/org/broadinstitute/sting/bwa/SuffixArray.java b/java/src/org/broadinstitute/sting/bwa/SuffixArray.java new file mode 100644 index 000000000..240c60e34 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/SuffixArray.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.bwa; + +/** + * An in-memory representation of a suffix array. + * + * @author mhanna + * @version 0.1 + */ +public class SuffixArray { + public final int inverseSA0; + public final int[] occurrences; + public final int[] sequence; + + /** + * Creates a new sequence array with the given inverse SA, occurrences, and values. + * @param inverseSA0 Inverse SA entry for the first element. + * @param occurrences Cumulative number of occurrences of A,C,G,T, in order. + * @param sequence The full suffix array. + */ + public SuffixArray(int inverseSA0, int[] occurrences, int[] sequence) { + this.inverseSA0 = inverseSA0; + this.occurrences = occurrences; + this.sequence = sequence; + } + + /** + * Retrieves the length of the sequence array. + * @return Length of the suffix array. + */ + public int length() { + return sequence.length; + } + +} diff --git a/java/src/org/broadinstitute/sting/bwa/SuffixArrayReader.java b/java/src/org/broadinstitute/sting/bwa/SuffixArrayReader.java new file mode 100644 index 000000000..fe07b0a03 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/SuffixArrayReader.java @@ -0,0 +1,73 @@ +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.nio.ByteOrder; + +/** + * A reader for suffix arrays in permanent storage. + * + * @author mhanna + * @version 0.1 + */ +public class SuffixArrayReader { + /** + * Input stream from which to read suffix array data. + */ + private InputStream inputStream; + + /** + * Create a new suffix array reader. + * @param inputFile File in which the suffix array is stored. + */ + public SuffixArrayReader( File inputFile ) { + try { + this.inputStream = new BufferedInputStream(new FileInputStream(inputFile)); + } + catch( FileNotFoundException ex ) { + throw new StingException("Unable to open input file", ex); + } + } + + /** + * Read a suffix array from the input stream. + * @return The suffix array stored in the input stream. + */ + public SuffixArray read() { + IntPackedInputStream intPackedInputStream = new IntPackedInputStream(inputStream, ByteOrder.LITTLE_ENDIAN); + + int inverseSA0; + int[] occurrences; + int[] suffixArray; + + try { + inverseSA0 = intPackedInputStream.read(); + occurrences = new int[PackUtils.ALPHABET_SIZE]; + intPackedInputStream.read(occurrences); + // Throw away the suffix array size in bytes and use the occurrences table directly. + intPackedInputStream.read(); + int suffixArraySize = occurrences[PackUtils.ALPHABET_SIZE-1]+1; + suffixArray = new int[suffixArraySize]; + intPackedInputStream.read(suffixArray); + } + catch( IOException ex ) { + throw new StingException("Unable to read BWT from input stream.", ex); + } + + return new SuffixArray(inverseSA0, occurrences, suffixArray); + } + + + /** + * Close the input stream. + */ + public void close() { + try { + inputStream.close(); + } + catch( IOException ex ) { + throw new StingException("Unable to close input file", ex); + } + } +} diff --git a/java/src/org/broadinstitute/sting/bwa/SuffixArrayWriter.java b/java/src/org/broadinstitute/sting/bwa/SuffixArrayWriter.java new file mode 100644 index 000000000..47d40f7ad --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/SuffixArrayWriter.java @@ -0,0 +1,66 @@ +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.nio.ByteOrder; + +/** + * Javadoc goes here. + * + * @author mhanna + * @version 0.1 + */ +public class SuffixArrayWriter { + /** + * Input stream from which to read suffix array data. + */ + private OutputStream outputStream; + + /** + * Create a new suffix array reader. + * @param inputFile File in which the suffix array is stored. + */ + public SuffixArrayWriter( File inputFile ) { + try { + this.outputStream = new BufferedOutputStream(new FileOutputStream(inputFile)); + } + catch( FileNotFoundException ex ) { + throw new StingException("Unable to open input file", ex); + } + } + + /** + * Write a suffix array to the output stream. + * @param suffixArray suffix array to write. + */ + public void write(SuffixArray suffixArray) { + IntPackedOutputStream intPackedOutputStream = new IntPackedOutputStream(outputStream, ByteOrder.LITTLE_ENDIAN); + + try { + intPackedOutputStream.write(suffixArray.inverseSA0); + intPackedOutputStream.write(suffixArray.occurrences); + // How frequently the suffix array entry is placed. + intPackedOutputStream.write(1); + // Length of the suffix array. + intPackedOutputStream.write(suffixArray.length()-1); + intPackedOutputStream.write(suffixArray.sequence, 1, suffixArray.length()-1); + } + catch( IOException ex ) { + throw new StingException("Unable to read BWT from input stream.", ex); + } + } + + + /** + * Close the input stream. + */ + public void close() { + try { + outputStream.close(); + } + catch( IOException ex ) { + throw new StingException("Unable to close input file", ex); + } + } +}