diff --git a/java/src/org/broadinstitute/sting/bwa/BasePackedOutputStream.java b/java/src/org/broadinstitute/sting/bwa/BasePackedOutputStream.java new file mode 100644 index 000000000..16f9947a3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/BasePackedOutputStream.java @@ -0,0 +1,120 @@ +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.StingException; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +/** + * A general-purpose stream for writing packed bases. + * + * @author mhanna + * @version 0.1 + */ +public class BasePackedOutputStream { + /** + * Type of object to pack. + */ + private final Class type; + + /** + * How many bases can be stored in the given data structure? + */ + private final int basesPerType; + + /** + * Ultimate target for the packed bases. + */ + private final OutputStream targetOutputStream; + + /** + * A fixed-size buffer for word-packed data. + */ + private final ByteBuffer buffer; + + public BasePackedOutputStream( Class type, File outputFile, ByteOrder byteOrder ) throws FileNotFoundException { + this(type,new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder); + } + + /** + * Write packed bases to the given output stream. + * @param type Type of data to pack bases into. + * @param outputStream Output stream to which to write packed bases. + * @param byteOrder Switch between big endian / little endian when reading / writing files. + */ + public BasePackedOutputStream( Class type, OutputStream outputStream, ByteOrder byteOrder) { + this.targetOutputStream = outputStream; + this.type = type; + basesPerType = PackUtils.bitsInType(type)/PackUtils.BITS_PER_BASE; + this.buffer = ByteBuffer.allocate(basesPerType/PackUtils.ALPHABET_SIZE).order(byteOrder); + } + + /** + * Writes an array of bases to the target output stream. + * @param bases List of bases to write. + * @throws IOException if an I/O error occurs. + */ + public void write( byte[] bases ) throws IOException { + int packedBases = 0; + int positionInPack = 0; + + for(byte base: bases) { + packedBases = packBase(base, packedBases, positionInPack); + + // Increment the packed counter. If all possible bases have been squeezed into this byte, write it out. + positionInPack = ++positionInPack % basesPerType; + if( positionInPack == 0 ) { + writePackedBases(packedBases); + packedBases = 0; + } + } + + if( positionInPack > 0 ) + writePackedBases(packedBases); + } + + /** + * Flush the contents of the OutputStream to disk. + * @throws IOException if an I/O error occurs. + */ + public void flush() throws IOException { + targetOutputStream.flush(); + } + + /** + * Closes the given output stream. + * @throws IOException if an I/O error occurs. + */ + public void close() throws IOException { + targetOutputStream.close(); + } + + /** + * Pack the given base into the basepack. + * @param base The base to pack. + * @param basePack Target for the pack operation. + * @param position Position within the pack to which to add the base. + * @return The packed integer. + */ + private int packBase( byte base, int basePack, int position ) { + basePack |= (PackUtils.packBase(base) << 2*(basesPerType-position-1)); + return basePack; + } + + /** + * Write the given packed base structure to the output file. + * @param packedBases Packed bases to write. + * @throws IOException on error writing to the file. + */ + private void writePackedBases(int packedBases) throws IOException { + buffer.rewind(); + if( type == Integer.class ) + buffer.putInt(packedBases); + else if( type == Byte.class ) + buffer.put((byte)packedBases); + else + throw new StingException("Cannot pack bases into type " + type.getName()); + targetOutputStream.write(buffer.array()); + } +} diff --git a/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java b/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java deleted file mode 100755 index f33e15fad..000000000 --- a/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.bwa; - -import org.broadinstitute.sting.utils.StingException; - -import java.io.*; - -/** - * Write packed bases to an output stream. Pack each base into 2 bits. - * - * @author mhanna - * @version 0.1 - */ -public class BytePackedOutputStream { - /** - * How many possible bases can be encoded? - */ - public static final int ALPHABET_SIZE = 4; - - /** - * Ultimate target for the packed bases. - */ - private final OutputStream targetOutputStream; - - /** - * The next byte to write to the output stream. Will be added - * to the output stream when enough bases are accumulated, or when - * the file is closed. - */ - private byte packedBases; - - /** - * Where will the next base be embedded into packedBases? - */ - private int positionInPack = 0; - - public BytePackedOutputStream( File outputFile ) throws FileNotFoundException { - this(new BufferedOutputStream(new FileOutputStream(outputFile))); - } - - /** - * Write packed bases to the given output stream. - * @param outputStream Output stream to which to write packed bases. - */ - public BytePackedOutputStream( OutputStream outputStream ) { - this.targetOutputStream = outputStream; - } - - /** - * Write a given base to the output stream. - * @param base Base to write. - * @throws IOException if an I/O error occurs. - */ - public void write( byte base ) throws IOException { - packedBases |= (getPackedRepresentation(base) << 2*(ALPHABET_SIZE-positionInPack-1)); - - // Increment the packed counter. If all possible bases have been squeezed into this byte, write it out. - positionInPack = ++positionInPack % ALPHABET_SIZE; - if( positionInPack == 0 ) { - targetOutputStream.write(packedBases); - packedBases = 0; - } - } - - /** - * Writes an array of bases to the target output stream. - * @param bases List of bases to write. - * @throws IOException if an I/O error occurs. - */ - public void write( byte[] bases ) throws IOException { - for(byte base: bases) write(base); - } - - /** - * Flush the contents of the OutputStream to disk. - * @throws IOException if an I/O error occurs. - */ - public void flush() throws IOException { - targetOutputStream.flush(); - } - - /** - * Closes the given output stream. - * @throws IOException if an I/O error occurs. - */ - public void close() throws IOException { - // Write (incomplete) block in file, and number of bases in that last byte. - if( positionInPack > 0 ) { - targetOutputStream.write(packedBases); - targetOutputStream.write(positionInPack); - } - else - targetOutputStream.write(ALPHABET_SIZE); - - targetOutputStream.close(); - } - - /** - * Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b. - * @param base ASCII value for the base to pack. - * @return A byte from 0-3 indicating the base's packed value. - */ - public static byte getPackedRepresentation(byte base) { - switch( base ) { - case 'A': - return 0; - case 'C': - return 1; - case 'G': - return 2; - case 'T': - return 3; - default: - throw new StingException("Unknown base type: " + base); - } - } - - public static byte decodePackedRepresentation(byte pack) { - switch( pack ) { - case 0: - return 'A'; - case 1: - return 'C'; - case 2: - return 'G'; - case 3: - return 'T'; - default: - throw new StingException("Unknown pack type: " + pack); - } - } - -} diff --git a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java index 254c92a9b..cc7173236 100755 --- a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java +++ b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java @@ -33,7 +33,6 @@ import net.sf.samtools.util.StringUtil; import java.io.*; import java.util.TreeSet; import java.util.Comparator; -import java.util.Arrays; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -56,7 +55,7 @@ public class CreateBWTFromReference { private int[] countOccurrences( String sequence ) { int occurrences[] = new int[ALPHABET_SIZE]; for( char base: sequence.toCharArray() ) - occurrences[ BytePackedOutputStream.getPackedRepresentation((byte)base) ]++; + occurrences[PackUtils.packBase((byte)base)]++; // Make occurrences cumulative for( int i = 1; i < ALPHABET_SIZE; i++ ) @@ -182,7 +181,7 @@ public class CreateBWTFromReference { occurrenceWriter.write(occurrences); occurrenceWriter.flush(); - WordPackedOutputStream sequenceOutputStream = new WordPackedOutputStream(bwtOutputStream,ByteOrder.LITTLE_ENDIAN); + BasePackedOutputStream sequenceOutputStream = new BasePackedOutputStream(Integer.class,bwtOutputStream,ByteOrder.LITTLE_ENDIAN); sequenceOutputStream.write(bwt); sequenceOutputStream.close(); diff --git a/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java b/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java index fd26881d1..e2eefbcb1 100755 --- a/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java +++ b/java/src/org/broadinstitute/sting/bwa/CreatePACFromReference.java @@ -30,6 +30,7 @@ import net.sf.picard.reference.ReferenceSequenceFileFactory; import net.sf.picard.reference.ReferenceSequence; import java.io.*; +import java.nio.ByteOrder; /** * Generate a .PAC file from a given reference. @@ -53,9 +54,13 @@ public class CreatePACFromReference { // Target file for output File outputFile = new File(argv[1]); - BytePackedOutputStream outputStream = new BytePackedOutputStream(outputFile); + OutputStream outputStream = new FileOutputStream(outputFile); + + BasePackedOutputStream basePackedOutputStream = new BasePackedOutputStream(Byte.class, outputStream, ByteOrder.BIG_ENDIAN); + basePackedOutputStream.write(sequence.getBases()); + + outputStream.write(sequence.getBases().length%PackUtils.ALPHABET_SIZE); - outputStream.write(sequence.getBases()); outputStream.close(); } } diff --git a/java/src/org/broadinstitute/sting/bwa/PackUtils.java b/java/src/org/broadinstitute/sting/bwa/PackUtils.java new file mode 100644 index 000000000..33b420f4b --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/PackUtils.java @@ -0,0 +1,82 @@ +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.StingException; + +/** + * Utilities designed for packing / unpacking bases. + * + * @author mhanna + * @version 0.1 + */ +public class PackUtils { + /** + * How many possible bases can be encoded? + */ + public static final int ALPHABET_SIZE = 4; + + /** + * How many bits does it take to store a single base? + */ + public static final int BITS_PER_BASE = (int)(Math.log(ALPHABET_SIZE)/Math.log(2)); + + /** + * How many bits fit into a single byte? + */ + public static final int BITS_PER_BYTE = 8; + + /** + * How many bits can a given type hold? + * @param type Type to test. + * @return Number of bits that the given type can hold. + */ + public static final int bitsInType( Class type ) { + try { + long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1; + long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1; + if( typeSize > intTypeSize ) + throw new StingException("Cannot determine number of bits available in type: " + type.getName()); + return (int)(Math.log(typeSize)/Math.log(2)); + } + catch( NoSuchFieldException ex ) { + throw new StingException("Cannot determine number of bits available in type: " + type.getName(),ex); + } + catch( IllegalAccessException ex ) { + throw new StingException("Cannot determine number of bits available in type: " + type.getName(),ex); + } + } + + /** + * Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b. + * @param base ASCII value for the base to pack. + * @return A byte from 0-3 indicating the base's packed value. + */ + public static byte packBase(byte base) { + switch( base ) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + default: + throw new StingException("Unknown base type: " + base); + } + } + + public static byte unpackBase(byte pack) { + switch( pack ) { + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + throw new StingException("Unknown pack type: " + pack); + } + } +} diff --git a/java/src/org/broadinstitute/sting/bwa/WordPackedInputStream.java b/java/src/org/broadinstitute/sting/bwa/WordPackedInputStream.java index c777cce11..0abe4bdda 100644 --- a/java/src/org/broadinstitute/sting/bwa/WordPackedInputStream.java +++ b/java/src/org/broadinstitute/sting/bwa/WordPackedInputStream.java @@ -26,7 +26,7 @@ public class WordPackedInputStream { public WordPackedInputStream( File inputFile, ByteOrder byteOrder ) throws FileNotFoundException { this.targetInputStream = new BufferedInputStream(new FileInputStream(inputFile)); - this.buffer = ByteBuffer.allocate(WordPackedOutputStream.BASES_PER_WORD/BytePackedOutputStream.ALPHABET_SIZE).order(byteOrder); + this.buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder); } /** @@ -44,9 +44,9 @@ public class WordPackedInputStream { List bwtList = new ArrayList(); while(targetInputStream.read(buffer.array()) > 0) { int packedWord = buffer.getInt(); - for( int i = WordPackedOutputStream.BASES_PER_WORD-1; i >= 0; i-- ) { + for( int i = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE - 1; i >= 0; i-- ) { byte packedByte = (byte)((packedWord >> i*2) & 0x3); - bwtList.add(BytePackedOutputStream.decodePackedRepresentation(packedByte)); + bwtList.add(PackUtils.unpackBase(packedByte)); } buffer.rewind(); } diff --git a/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java b/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java deleted file mode 100755 index 03eae9c17..000000000 --- a/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.bwa; - -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; - -/** - * Word-pack bases into the output stream. Bytes are stored as - * little-endian unsigned ints. - * - * @author mhanna - * @version 0.1 - */ -public class WordPackedOutputStream { - /** - * How many bases can be stored in the given word? - */ - public static final int BASES_PER_WORD = 16; - - /** - * Ultimate target for the packed bases. - */ - private final OutputStream targetOutputStream; - - /** - * The next byte to write to the output stream. Will be added - * to the output stream when enough bases are accumulated, or when - * the file is closed. - */ - private int packedBases; - - /** - * Where will the next base be embedded into packedBases? - */ - private int positionInPack = 0; - - /** - * A fixed-size buffer for word-packed data. - */ - private final ByteBuffer buffer; - - public WordPackedOutputStream( File outputFile, ByteOrder byteOrder ) throws FileNotFoundException { - this(new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder); - } - - /** - * Write packed bases to the given output stream. - * @param outputStream Output stream to which to write packed bases. - * @param byteOrder Switch between big endian / little endian when reading / writing files. - */ - public WordPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) { - this.targetOutputStream = outputStream; - this.buffer = ByteBuffer.allocate(BASES_PER_WORD/BytePackedOutputStream.ALPHABET_SIZE).order(byteOrder); - } - - /** - * Write a given base to the output stream. - * @param base Base to write. - * @throws IOException if an I/O error occurs. - */ - public void write( byte base ) throws IOException { - packedBases |= (BytePackedOutputStream.getPackedRepresentation(base) << 2*(BASES_PER_WORD-positionInPack-1)); - - // Increment the packed counter. If all possible bases have been squeezed into this byte, write it out. - positionInPack = ++positionInPack % BASES_PER_WORD; - if( positionInPack == 0 ) { - buffer.rewind(); - buffer.putInt(packedBases); - targetOutputStream.write(buffer.array()); - packedBases = 0; - } - } - - /** - * Writes an array of bases to the target output stream. - * @param bases List of bases to write. - * @throws IOException if an I/O error occurs. - */ - public void write( byte[] bases ) throws IOException { - for(byte base: bases) write(base); - } - - /** - * Flush the contents of the OutputStream to disk. - * @throws IOException if an I/O error occurs. - */ - public void flush() throws IOException { - targetOutputStream.flush(); - } - - /** - * Closes the given output stream. - * @throws IOException if an I/O error occurs. - */ - public void close() throws IOException { - // Write (incomplete) block in file, and number of bases in that last byte. - if( positionInPack > 0 ) { - buffer.rewind(); - buffer.putInt(packedBases); - targetOutputStream.write(buffer.array()); - } - targetOutputStream.close(); - } - -} -