From e2a79c5cd923b602fe82874d8da83b2f1b97b90d Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 2 Sep 2009 22:18:17 +0000 Subject: [PATCH] Checkpoint. The BWT that we generate now matches the first 16% of the BWT that BWT-SW generates. Cleaned up output streams to separate the byte packing / word packing from the data structure generation. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1508 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/bwa/BytePackedOutputStream.java | 142 ++++++++++++++++++ .../sting/bwa/CreateBWTFromReference.java | 54 +++++-- .../sting/bwa/CreatePACFromReference.java | 58 +------ .../sting/bwa/OccurrenceOutputStream.java | 99 ++++++++++++ .../sting/bwa/WordPackedOutputStream.java | 131 ++++++++++++++++ 5 files changed, 418 insertions(+), 66 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java create mode 100755 java/src/org/broadinstitute/sting/bwa/OccurrenceOutputStream.java create mode 100755 java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java diff --git a/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java b/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java new file mode 100755 index 000000000..2b35ec062 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; + +/** + * Write packed bases to an output stream. Pack each base into 2 bits. + * + * @author mhanna + * @version 0.1 + */ +public class BytePackedOutputStream { + /** + * How many possible bases can be encoded? + */ + public static final int ALPHABET_SIZE = 4; + + /** + * Ultimate target for the packed bases. + */ + private final OutputStream targetOutputStream; + + /** + * The next byte to write to the output stream. Will be added + * to the output stream when enough bases are accumulated, or when + * the file is closed. + */ + private byte packedBases; + + /** + * Where will the next base be embedded into packedBases? + */ + private int positionInPack = 0; + + public BytePackedOutputStream( File outputFile ) throws FileNotFoundException { + this(new BufferedOutputStream(new FileOutputStream(outputFile))); + } + + /** + * Write packed bases to the given output stream. + * @param outputStream Output stream to which to write packed bases. + */ + public BytePackedOutputStream( OutputStream outputStream ) { + this.targetOutputStream = outputStream; + } + + /** + * Write a given base to the output stream. + * @param base Base to write. + * @throws IOException if an I/O error occurs. + */ + public void write( byte base ) throws IOException { + packedBases |= (getPackedRepresentation(base) << 2*(ALPHABET_SIZE-positionInPack-1)); + + // Increment the packed counter. If all possible bases have been squeezed into this byte, write it out. + positionInPack = ++positionInPack % ALPHABET_SIZE; + if( positionInPack == 0 ) { + targetOutputStream.write(packedBases); + packedBases = 0; + } + } + + /** + * Writes an array of bases to the target output stream. + * @param bases List of bases to write. + * @throws IOException if an I/O error occurs. + */ + public void write( byte[] bases ) throws IOException { + for(byte base: bases) write(base); + } + + /** + * Flush the contents of the OutputStream to disk. + * @throws IOException if an I/O error occurs. + */ + public void flush() throws IOException { + targetOutputStream.flush(); + } + + /** + * Closes the given output stream. + * @throws IOException if an I/O error occurs. + */ + public void close() throws IOException { + // Write (incomplete) block in file, and number of bases in that last byte. + if( positionInPack > 0 ) { + targetOutputStream.write(packedBases); + targetOutputStream.write(positionInPack); + } + else + targetOutputStream.write(ALPHABET_SIZE); + + targetOutputStream.close(); + } + + /** + * Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b. + * @param base ASCII value for the base to pack. + * @return A byte from 0-3 indicating the base's packed value. + */ + public static byte getPackedRepresentation(byte base) { + switch( base ) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + default: + throw new StingException("Unknown base type: " + base); + } + } + +} diff --git a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java index 86e96c658..93a1588a8 100755 --- a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java +++ b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java @@ -30,10 +30,11 @@ import net.sf.picard.reference.ReferenceSequenceFileFactory; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.util.StringUtil; -import java.io.IOException; -import java.io.File; +import java.io.*; import java.util.TreeSet; import java.util.Comparator; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; /** * Create a suffix array data structure. @@ -54,7 +55,7 @@ public class CreateBWTFromReference { private int[] countOccurrences( String sequence ) { int occurrences[] = new int[ALPHABET_SIZE]; for( char base: sequence.toCharArray() ) - occurrences[ CreatePACFromReference.getPackedRepresentation((byte)base) ]++; + occurrences[ BytePackedOutputStream.getPackedRepresentation((byte)base) ]++; // Make occurrences cumulative for( int i = 1; i < ALPHABET_SIZE; i++ ) @@ -98,34 +99,33 @@ public class CreateBWTFromReference { return compressedSuffixArray; } - private char[] createBWT( String sequence, int[] suffixArray ) { - char[] bwt = new char[suffixArray.length]; + private byte[] createBWT( String sequence, int[] suffixArray ) { + byte[] bwt = new byte[suffixArray.length]; for( int i = 0; i < suffixArray.length; i++ ) { // Find the first character after the current character in the rotation. If the character is past the end // (in other words, '$'), back up to the previous character. int sequenceEnd = Math.min((suffixArray[i]+suffixArray.length-1)%suffixArray.length, sequence.length()-1 ); - bwt[i] = sequence.charAt(sequenceEnd); + bwt[i] = (byte)sequence.charAt(sequenceEnd); } return bwt; } public static void main( String argv[] ) throws IOException { - if( argv.length != 1 ) { - System.out.println("No reference"); + if( argv.length != 2 ) { + System.out.println("USAGE: CreateBWTFromReference .fasta "); return; } String inputFileName = argv[0]; File inputFile = new File(inputFileName); + String outputFileName = argv[1]; + File outputFile = new File(outputFileName); + CreateBWTFromReference creator = new CreateBWTFromReference(); String sequence = creator.loadReference(inputFile); - // Count the occurences of each given base. - int[] occurrences = creator.countOccurrences(sequence); - System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]); - // Generate the suffix array and print diagnostics. int[] suffixArray = creator.createSuffixArray(sequence); for( int i = 0; i < 8; i++ ) @@ -144,9 +144,35 @@ public class CreateBWTFromReference { reconstructedInverseSA = compressedSuffixArray[reconstructedInverseSA]; } + // Count the occurences of each given base. + int[] occurrences = creator.countOccurrences(sequence); + System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]); + // Create the BWT. - char[] bwt = creator.createBWT(sequence, suffixArray); - System.out.printf("BWT: %s%n", new String(bwt)); + byte[] bwt = creator.createBWT(sequence, suffixArray); + + String bwtAsString = new String(bwt); + System.out.printf("BWT:%n"); + while( bwtAsString.length() > 0 ) { + int end = Math.min( 80, bwtAsString.length() ); + //System.out.printf("%s%n", bwtAsString.substring(0,end)); + bwtAsString = bwtAsString.substring(end); + } + + OutputStream outputStream = new BufferedOutputStream(new FileOutputStream(outputFile)); + + ByteBuffer buffer = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN); + buffer.putInt(inverseSuffixArray[0]); + outputStream.write(buffer.array()); + outputStream.flush(); + + OccurrenceOutputStream occurrenceWriter = new OccurrenceOutputStream(outputStream); + occurrenceWriter.write(occurrences); + occurrenceWriter.flush(); + + WordPackedOutputStream bwtOutputStream = new WordPackedOutputStream(outputStream,ByteOrder.LITTLE_ENDIAN); + bwtOutputStream.write(bwt); + bwtOutputStream.close(); } /** diff --git a/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java b/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java index 5d071632e..fd26881d1 100755 --- a/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java +++ b/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java @@ -31,8 +31,6 @@ import net.sf.picard.reference.ReferenceSequence; import java.io.*; -import org.broadinstitute.sting.utils.StingException; - /** * Generate a .PAC file from a given reference. * @@ -41,11 +39,9 @@ import org.broadinstitute.sting.utils.StingException; */ public class CreatePACFromReference { - private static final int ALPHABET_SIZE = 4; - - public static void main( String argv[] ) throws FileNotFoundException, IOException { - if( argv.length != 1 ) { - System.out.println("No reference"); + public static void main( String argv[] ) throws IOException { + if( argv.length != 2 ) { + System.out.println("USAGE: CreatePACFromReference .fasta "); return; } @@ -56,52 +52,10 @@ public class CreatePACFromReference { ReferenceSequence sequence = reference.nextSequence(); // Target file for output - File outputFile = new File(inputFileName + ".mypac"); - BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(outputFile)); + File outputFile = new File(argv[1]); + BytePackedOutputStream outputStream = new BytePackedOutputStream(outputFile); - // Number of bytes, rounded up, plus one extra byte indicating how many bases the last byte holds. - byte packed = 0; - int positionInPack = 0; - - for( byte base: sequence.getBases() ) { - // Pack base into the appropriate bits of the byte. - packed |= (getPackedRepresentation(base) << 2*(ALPHABET_SIZE-positionInPack-1)); - - // Increment the packed counter. If all possible bases have been squeezed into this byte, write it out. - positionInPack = ++positionInPack % 4; - if( positionInPack == 0 ) { - outputStream.write(packed); - packed = 0; - } - } - - // Last (incomplete) block in file. - if( positionInPack > 0 ) - outputStream.write(packed); - - // Last character of a .pac file is how many bases are in the last byte. - outputStream.write(positionInPack); - + outputStream.write(sequence.getBases()); outputStream.close(); } - - /** - * Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b. - * @param base ASCII value for the base to pack. - * @return A byte from 0-3 indicating the base's packed value. - */ - public static byte getPackedRepresentation(byte base) { - switch( base ) { - case 'A': - return 0; - case 'C': - return 1; - case 'G': - return 2; - case 'T': - return 3; - default: - throw new StingException("Unknown base type: " + base); - } - } } diff --git a/java/src/org/broadinstitute/sting/bwa/OccurrenceOutputStream.java b/java/src/org/broadinstitute/sting/bwa/OccurrenceOutputStream.java new file mode 100755 index 000000000..9b3bb2d7e --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/OccurrenceOutputStream.java @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * Writes an occurrence array to the output file. + * + * @author mhanna + * @version 0.1 + */ +public class OccurrenceOutputStream { + /** + * How many bytes does it take to hold an integer in Java? + */ + private static final int INT_SIZE_IN_BYTES = 4; + + /** + * Ultimate target for the occurrence array. + */ + private final OutputStream targetOutputStream; + + /** + * Create a new OccurrenceArrayOutputStream, writing to the given target file. + * @param outputFile target output file. + * @throws IOException if an I/O error occurs. + */ + public OccurrenceOutputStream( File outputFile ) throws IOException { + this(new FileOutputStream(outputFile)); + } + + /** + * Write occurrence array to the given OutputStream. + * @param outputStream Output stream to which to write packed bases. + * @throws IOException if an I/O error occurs. + */ + public OccurrenceOutputStream( OutputStream outputStream ) throws IOException { + this.targetOutputStream = outputStream; + } + + /** + * Write the cumulative occurrences to the output stream. + * @param occurrences occurrences to write. occurrences.length must match alphabet size. + * @throws IOException if an I/O error occurs. + */ + public void write( int[] occurrences ) throws IOException { + if( occurrences.length > BytePackedOutputStream.ALPHABET_SIZE ) + throw new StingException("Wrong number of occurrence data points; expected " + BytePackedOutputStream.ALPHABET_SIZE); + ByteBuffer buffer = ByteBuffer.allocate(INT_SIZE_IN_BYTES*occurrences.length).order(ByteOrder.LITTLE_ENDIAN); + for(int occurrence: occurrences) + buffer.putInt(occurrence); + targetOutputStream.write(buffer.array()); + } + + /** + * Flush the contents of the OutputStream to disk. + * @throws IOException if an I/O error occurs. + */ + public void flush() throws IOException { + targetOutputStream.flush(); + } + + /** + * Closes the given output stream. + * @throws IOException if an I/O error occurs. + */ + public void close() throws IOException { + targetOutputStream.close(); + } + +} diff --git a/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java b/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java new file mode 100755 index 000000000..af3e9488b --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.bwa; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * Word-pack bases into the output stream. Bytes are stored as + * little-endian unsigned ints. + * + * @author mhanna + * @version 0.1 + */ +public class WordPackedOutputStream { + /** + * How many bases can be stored in the given word? + */ + private static final int BASES_PER_WORD = 16; + + /** + * Ultimate target for the packed bases. + */ + private final OutputStream targetOutputStream; + + /** + * The next byte to write to the output stream. Will be added + * to the output stream when enough bases are accumulated, or when + * the file is closed. + */ + private int packedBases; + + /** + * Where will the next base be embedded into packedBases? + */ + private int positionInPack = 0; + + /** + * A fixed-size buffer for word-packed data. + */ + private final ByteBuffer buffer; + + public WordPackedOutputStream( File outputFile, ByteOrder byteOrder ) throws FileNotFoundException { + this(new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder); + } + + /** + * Write packed bases to the given output stream. + * @param outputStream Output stream to which to write packed bases. + * @param byteOrder Switch between big endian / little endian when reading / writing files. + */ + public WordPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) { + this.targetOutputStream = outputStream; + this.buffer = ByteBuffer.allocate(BASES_PER_WORD/BytePackedOutputStream.ALPHABET_SIZE).order(byteOrder); + } + + /** + * Write a given base to the output stream. + * @param base Base to write. + * @throws IOException if an I/O error occurs. + */ + public void write( byte base ) throws IOException { + packedBases |= (BytePackedOutputStream.getPackedRepresentation(base) << 2*(BASES_PER_WORD-positionInPack-1)); + + // Increment the packed counter. If all possible bases have been squeezed into this byte, write it out. + positionInPack = ++positionInPack % BASES_PER_WORD; + if( positionInPack == 0 ) { + buffer.rewind(); + buffer.putInt(packedBases); + targetOutputStream.write(buffer.array()); + packedBases = 0; + } + } + + /** + * Writes an array of bases to the target output stream. + * @param bases List of bases to write. + * @throws IOException if an I/O error occurs. + */ + public void write( byte[] bases ) throws IOException { + for(byte base: bases) write(base); + } + + /** + * Flush the contents of the OutputStream to disk. + * @throws IOException if an I/O error occurs. + */ + public void flush() throws IOException { + targetOutputStream.flush(); + } + + /** + * Closes the given output stream. + * @throws IOException if an I/O error occurs. + */ + public void close() throws IOException { + // Write (incomplete) block in file, and number of bases in that last byte. + if( positionInPack > 0 ) { + buffer.rewind(); + buffer.putInt(packedBases); + targetOutputStream.write(buffer.array()); + } + targetOutputStream.close(); + } + +} +