From 0f3049652a72e2abebc14a79140fe66d200045f0 Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 11 Sep 2009 18:23:15 +0000 Subject: [PATCH] Start to build BWT abstractions, so we can present a reasonable facsimile of the BWT to the user no matter how it's represented on disk. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1589 348d0f76-0448-11de-a6fe-93d51630548a --- .../src/org/broadinstitute/sting/bwa/BWT.java | 114 +++++++++++++++++- .../broadinstitute/sting/bwa/BWTReader.java | 27 ++++- .../broadinstitute/sting/bwa/BWTWriter.java | 20 ++- .../sting/bwa/BasePackedInputStream.java | 22 ++-- .../sting/bwa/BasePackedOutputStream.java | 15 ++- .../sting/bwa/CreateBWTFromReference.java | 48 ++++++-- .../sting/bwa/CreatePACFromReference.java | 19 ++- .../broadinstitute/sting/bwa/PackUtils.java | 31 ++++- 8 files changed, 257 insertions(+), 39 deletions(-) diff --git a/java/src/org/broadinstitute/sting/bwa/BWT.java b/java/src/org/broadinstitute/sting/bwa/BWT.java index 3ff871d3e..0d2b7300f 100644 --- a/java/src/org/broadinstitute/sting/bwa/BWT.java +++ b/java/src/org/broadinstitute/sting/bwa/BWT.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.bwa; +import java.util.Arrays; + /** * Represents the Burrows-Wheeler Transform of a reference sequence. * @@ -7,18 +9,120 @@ package org.broadinstitute.sting.bwa; * @version 0.1 */ public class BWT { + /** + * Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases. + * For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0 + */ + public static final int SEQUENCE_BLOCK_SIZE = 128; + public final int inverseSA0; - public final int[] occurrences; - public final byte[] sequence; + public final int[] count; + public final SequenceBlock[] sequenceBlocks; + + /** + * Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII). + * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence. + * @param count Cumulative count of bases, in A,C,G,T order. + * @param sequenceBlocks The full BWT sequence, sans the '$'. + */ + public BWT( int inverseSA0, int[] count, SequenceBlock[] sequenceBlocks ) { + this.inverseSA0 = inverseSA0; + this.count = count; + this.sequenceBlocks = sequenceBlocks; + } /** * Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII). * @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence. - * @param occurrences Cumulative number of occurrences of A,C,G,T, in order. + * @param count Cumulative count of bases, in A,C,G,T order. * @param sequence The full BWT sequence, sans the '$'. */ - public BWT( int inverseSA0, int[] occurrences, byte[] sequence ) { - this.inverseSA0 = inverseSA0; + public BWT( int inverseSA0, int[] count, byte[] sequence ) { + this(inverseSA0,count,generateSequenceBlocks(sequence)); + } + + /** + * Extract the full sequence from the list of block. + * @return The full BWT string as a byte array. + */ + public byte[] getSequence() { + byte[] sequence = new byte[count[count.length-1]]; + for( SequenceBlock block: sequenceBlocks ) + System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength); + return sequence; + } + + /** + * The number of bases in the BWT as a whole. + * @return Number of bases. + */ + public int length() { + return count[count.length-1]; + } + + /** + * Create a set of sequence blocks from one long sequence. + * @param sequence Sequence from which to derive blocks. + * @return Array of sequence blocks containing data from the sequence. + */ + private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) { + int[] occurrences = {0,0,0,0}; + + int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE); + SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks]; + + for( int block = 0; block < numSequenceBlocks; block++ ) { + int blockStart = block*SEQUENCE_BLOCK_SIZE; + int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart); + byte[] subsequence = new byte[blockLength]; + + System.arraycopy(sequence,blockStart,subsequence,0,blockLength); + + sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,Arrays.copyOf(occurrences,occurrences.length),subsequence); + + for( byte base: subsequence ) + occurrences[PackUtils.packBase(base)]++; + } + + return sequenceBlocks; + } +} + +/** + * Models a block of bases within the BWT. + */ +class SequenceBlock { + /** + * Start position of this sequence within the BWT. + */ + public final int sequenceStart; + + /** + * Length of this sequence within the BWT. + */ + public final int sequenceLength; + + + /** + * Occurrences of each letter up to this sequence block. + */ + public final int[] occurrences; + + /** + * Sequence for this segment. + */ + public final byte[] sequence; + + /** + * Create a new block within this BWT. + * @param sequenceStart Starting position of this sequence within the BWT. + * @param sequenceLength Length of this sequence. + * @param occurrences How many of each base has been seen before this sequence began. + * @param sequence The actual sequence from the BWT. + */ + SequenceBlock( int sequenceStart, int sequenceLength, int[] occurrences, byte[] sequence ) { + this.sequenceStart = sequenceStart; + this.sequenceLength = sequenceLength; this.occurrences = occurrences; this.sequence = sequence; } diff --git a/java/src/org/broadinstitute/sting/bwa/BWTReader.java b/java/src/org/broadinstitute/sting/bwa/BWTReader.java index 33c0781c1..06437e72c 100644 --- a/java/src/org/broadinstitute/sting/bwa/BWTReader.java +++ b/java/src/org/broadinstitute/sting/bwa/BWTReader.java @@ -39,20 +39,35 @@ public class BWTReader { BasePackedInputStream basePackedInputStream = new BasePackedInputStream(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN); int inverseSA0; - int[] occurrences; - byte[] bwt; + int[] count; + SequenceBlock[] sequenceBlocks; try { inverseSA0 = intPackedInputStream.read(); - occurrences = new int[PackUtils.ALPHABET_SIZE]; - intPackedInputStream.read(occurrences); - bwt = basePackedInputStream.read(occurrences[PackUtils.ALPHABET_SIZE-1]); + count = new int[PackUtils.ALPHABET_SIZE]; + intPackedInputStream.read(count); + + int bwtSize = count[PackUtils.ALPHABET_SIZE-1]; + sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)]; + + for( int block = 0; block < sequenceBlocks.length; block++ ) { + int sequenceStart = block*BWT.SEQUENCE_BLOCK_SIZE; + int sequenceLength = Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart); + + int[] occurrences = new int[PackUtils.ALPHABET_SIZE]; + byte[] bwt = new byte[sequenceLength]; + + intPackedInputStream.read(occurrences); + basePackedInputStream.read(bwt); + + sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,occurrences,bwt); + } } catch( IOException ex ) { throw new StingException("Unable to read BWT from input stream.", ex); } - return new BWT(inverseSA0, occurrences, bwt); + return new BWT(inverseSA0, count, sequenceBlocks); } /** diff --git a/java/src/org/broadinstitute/sting/bwa/BWTWriter.java b/java/src/org/broadinstitute/sting/bwa/BWTWriter.java index 210977716..7cc32faf0 100644 --- a/java/src/org/broadinstitute/sting/bwa/BWTWriter.java +++ b/java/src/org/broadinstitute/sting/bwa/BWTWriter.java @@ -15,7 +15,7 @@ public class BWTWriter { /** * Input stream from which to read BWT data. */ - private OutputStream outputStream; + private final OutputStream outputStream; /** * Create a new BWT reader. @@ -40,8 +40,22 @@ public class BWTWriter { try { intPackedOutputStream.write(bwt.inverseSA0); - intPackedOutputStream.write(bwt.occurrences); - basePackedOutputStream.write(bwt.sequence); + intPackedOutputStream.write(bwt.count); + + for( SequenceBlock block: bwt.sequenceBlocks ) { + intPackedOutputStream.write(block.occurrences); + basePackedOutputStream.write(block.sequence); + } + + // The last block is the last set of counts in the structure. + int[] finalCount = new int[bwt.count.length]; + for( int i = 0; i < finalCount.length; i++ ) { + if(i != 0) + finalCount[i] = bwt.count[i]-bwt.count[i-1]; + else + finalCount[i] = bwt.count[i]; + } + intPackedOutputStream.write(finalCount); } catch( IOException ex ) { throw new StingException("Unable to read BWT from input stream.", ex); diff --git a/java/src/org/broadinstitute/sting/bwa/BasePackedInputStream.java b/java/src/org/broadinstitute/sting/bwa/BasePackedInputStream.java index a03328082..9a6abda94 100644 --- a/java/src/org/broadinstitute/sting/bwa/BasePackedInputStream.java +++ b/java/src/org/broadinstitute/sting/bwa/BasePackedInputStream.java @@ -43,14 +43,23 @@ public class BasePackedInputStream { /** * Read the entire contents of the input stream. - * @param length number of bases to read from the stream. + * @param bwt array into which bases should be read. * @return a byte array of the given length. * @throws IOException if an I/O error occurs. */ - public byte[] read( int length ) throws IOException { - byte[] bwt = new byte[length]; - int packedWord = 0; + public void read(byte[] bwt) throws IOException { + read(bwt,0,bwt.length); + } + /** + * Read the next length bases into the bwt array, starting at the given offset. + * @param bwt array holding the given data. + * @param offset target position in the bases array into which bytes should be written. + * @param length number of bases to read from the stream. + * @throws IOException if an I/O error occurs. + */ + public void read( byte[] bwt, int offset, int length ) throws IOException { + int packedWord = 0; final int basesPerEntry = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE; for( int i = 0; i < length; i++ ) { if( i % basesPerEntry == 0 ) { @@ -60,10 +69,7 @@ public class BasePackedInputStream { } int position = basesPerEntry - i % basesPerEntry - 1; - bwt[i] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3)); + bwt[i+offset] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3)); } - - return bwt; } - } diff --git a/java/src/org/broadinstitute/sting/bwa/BasePackedOutputStream.java b/java/src/org/broadinstitute/sting/bwa/BasePackedOutputStream.java index a84cd6c00..0ba98693b 100644 --- a/java/src/org/broadinstitute/sting/bwa/BasePackedOutputStream.java +++ b/java/src/org/broadinstitute/sting/bwa/BasePackedOutputStream.java @@ -65,11 +65,22 @@ public class BasePackedOutputStream { * @throws IOException if an I/O error occurs. */ public void write( byte[] bases ) throws IOException { + write(bases,0,bases.length); + } + + /** + * Writes a subset of the array of bases to the output stream. + * @param bases List of bases to write. + * @param offset site at which to start writing. + * @param length number of bases to write. + * @throws IOException if an I/O error occurs. + */ + public void write( byte[] bases, int offset, int length ) throws IOException { int packedBases = 0; int positionInPack = 0; - for(byte base: bases) { - packedBases = packBase(base, packedBases, positionInPack); + for( int base = offset; base < offset+length; base++ ) { + packedBases = packBase(bases[base], packedBases, positionInPack); // Increment the packed counter. If all possible bases have been squeezed into this byte, write it out. positionInPack = ++positionInPack % basesPerType; diff --git a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java index 7f05d871f..9582ed626 100755 --- a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java +++ b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java @@ -52,6 +52,13 @@ public class CreateBWTFromReference { return StringUtil.bytesToString(sequence.getBases()); } + private String loadReverseReference( File inputFile ) { + ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile); + ReferenceSequence sequence = reference.nextSequence(); + PackUtils.reverse(sequence.getBases()); + return StringUtil.bytesToString(sequence.getBases()); + } + private int[] countOccurrences( String sequence ) { int occurrences[] = new int[ALPHABET_SIZE]; for( char base: sequence.toCharArray() ) @@ -118,8 +125,8 @@ public class CreateBWTFromReference { } public static void main( String argv[] ) throws IOException { - if( argv.length != 3 ) { - System.out.println("USAGE: CreateBWTFromReference .fasta "); + if( argv.length != 5 ) { + System.out.println("USAGE: CreateBWTFromReference .fasta "); return; } @@ -129,12 +136,19 @@ public class CreateBWTFromReference { String bwtFileName = argv[1]; File bwtFile = new File(bwtFileName); - String saFileName = argv[2]; + String rbwtFileName = argv[2]; + File rbwtFile = new File(rbwtFileName); + + String saFileName = argv[3]; File saFile = new File(saFileName); + String rsaFileName = argv[4]; + File rsaFile = new File(rsaFileName); + CreateBWTFromReference creator = new CreateBWTFromReference(); String sequence = creator.loadReference(inputFile); + String reverseSequence = creator.loadReverseReference(inputFile); // Count the occurences of each given base. int[] occurrences = creator.countOccurrences(sequence); @@ -142,17 +156,23 @@ public class CreateBWTFromReference { // Generate the suffix array and print diagnostics. int[] suffixArrayData = creator.createSuffixArray(sequence); + int[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence); // Invert the suffix array and print diagnostics. int[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData); + int[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData); SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData ); + SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData ); + /* for( int i = 0; i < 8; i++ ) System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length()))); for( int i = 0; i < 8; i++ ) System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length()))); + */ + /* // Create the data structure for the compressed suffix array and print diagnostics. int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray); int reconstructedInverseSA = compressedSuffixArray[0]; @@ -166,30 +186,40 @@ public class CreateBWTFromReference { for( int i = 0; i < 8; i++ ) { System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]); } + */ // Create the BWT. BWT bwt = new BWT( inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence) ); + BWT reverseBWT = new BWT( reverseInverseSuffixArray[0], occurrences, creator.createBWT(reverseSequence, reverseSuffixArray.sequence)); - String bwtAsString = new String(bwt.sequence); - System.out.printf("BWT: %s...%n", bwtAsString.substring(0,80)); + byte[] bwtSequence = bwt.getSequence(); + System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length()); BWTWriter bwtWriter = new BWTWriter(bwtFile); bwtWriter.write(bwt); bwtWriter.close(); + BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile); + reverseBWTWriter.write(reverseBWT); + reverseBWTWriter.close(); + SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile); saWriter.write(suffixArray); saWriter.close(); + SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile); + reverseSAWriter.write(reverseSuffixArray); + reverseSAWriter.close(); + File existingBWTFile = new File(inputFileName+".bwt"); BWTReader existingBWTReader = new BWTReader(existingBWTFile); BWT existingBWT = existingBWTReader.read(); - String existingBwtAsString = new String(existingBWT.sequence); - System.out.printf("Existing BWT: %s...%n",existingBwtAsString.substring(0,80)); + byte[] existingBWTSequence = existingBWT.getSequence(); + System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,80),existingBWT.length()); - for( int i = 0; i < bwt.sequence.length; i++ ) { - if( bwt.sequence[i] != existingBWT.sequence[i] ) + for( int i = 0; i < bwt.length(); i++ ) { + if( bwtSequence[i] != existingBWTSequence[i] ) throw new StingException("BWT mismatch at " + i); } diff --git a/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java b/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java index e2eefbcb1..1ce2925d3 100755 --- a/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java +++ b/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java @@ -41,8 +41,8 @@ import java.nio.ByteOrder; public class CreatePACFromReference { public static void main( String argv[] ) throws IOException { - if( argv.length != 2 ) { - System.out.println("USAGE: CreatePACFromReference .fasta "); + if( argv.length != 3 ) { + System.out.println("USAGE: CreatePACFromReference .fasta "); return; } @@ -53,13 +53,22 @@ public class CreatePACFromReference { ReferenceSequence sequence = reference.nextSequence(); // Target file for output - File outputFile = new File(argv[1]); + writeSequence( new File(argv[1]), sequence.getBases() ); + + // Reverse the bases in the reference + PackUtils.reverse(sequence.getBases()); + + // Target file for output + writeSequence( new File(argv[2]), sequence.getBases() ); + } + + private static void writeSequence( File outputFile, byte[] bases ) throws IOException { OutputStream outputStream = new FileOutputStream(outputFile); BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream(Byte.class, outputStream, ByteOrder.BIG_ENDIAN); - basePackedOutputStream.write(sequence.getBases()); + basePackedOutputStream.write(bases); - outputStream.write(sequence.getBases().length%PackUtils.ALPHABET_SIZE); + outputStream.write(bases.length%PackUtils.ALPHABET_SIZE); outputStream.close(); } diff --git a/java/src/org/broadinstitute/sting/bwa/PackUtils.java b/java/src/org/broadinstitute/sting/bwa/PackUtils.java index 33b420f4b..c3313f9e4 100644 --- a/java/src/org/broadinstitute/sting/bwa/PackUtils.java +++ b/java/src/org/broadinstitute/sting/bwa/PackUtils.java @@ -29,7 +29,7 @@ public class PackUtils { * @param type Type to test. * @return Number of bits that the given type can hold. */ - public static final int bitsInType( Class type ) { + public static int bitsInType( Class type ) { try { long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1; long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1; @@ -65,6 +65,11 @@ public class PackUtils { } } + /** + * Converts a two-bit representation of a base into an ASCII representation of a base. + * @param pack Byte from 0-3 indicating which base is represented. + * @return An ASCII value representing the packed base. + */ public static byte unpackBase(byte pack) { switch( pack ) { case 0: @@ -79,4 +84,28 @@ public class PackUtils { throw new StingException("Unknown pack type: " + pack); } } + + /** + * Reverses an unpacked sequence of bases. + * @param bases bases to reverse. + */ + public static void reverse( byte[] bases ) { + for( int i = 0, j = bases.length-1; i < j; i++, j-- ) { + byte temp = bases[j]; + bases[j] = bases[i]; + bases[i] = temp; + } + } + + /** + * Given a structure of size size that should be split + * into partitionSize partitions, how many partitions should + * be created? Size of last partition will be <= partitionSize. + * @param size Total size of the data structure. + * @param partitionSize Size of an individual partition. + * @return Number of partitions that would be created. + */ + public static int numberOfPartitions( int size, int partitionSize ) { + return (size + partitionSize - 1)/partitionSize; + } }