Cleanup. Separate common packing functionality into utils class. Make base packing utility as generic as possible.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1566 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3b1e966b4c
commit
43d1c6741c
|
|
@ -0,0 +1,120 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* A general-purpose stream for writing packed bases.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BasePackedOutputStream<T> {
|
||||
/**
|
||||
* Type of object to pack.
|
||||
*/
|
||||
private final Class<T> type;
|
||||
|
||||
/**
|
||||
* How many bases can be stored in the given data structure?
|
||||
*/
|
||||
private final int basesPerType;
|
||||
|
||||
/**
|
||||
* Ultimate target for the packed bases.
|
||||
*/
|
||||
private final OutputStream targetOutputStream;
|
||||
|
||||
/**
|
||||
* A fixed-size buffer for word-packed data.
|
||||
*/
|
||||
private final ByteBuffer buffer;
|
||||
|
||||
public BasePackedOutputStream( Class<T> type, File outputFile, ByteOrder byteOrder ) throws FileNotFoundException {
|
||||
this(type,new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write packed bases to the given output stream.
|
||||
* @param type Type of data to pack bases into.
|
||||
* @param outputStream Output stream to which to write packed bases.
|
||||
* @param byteOrder Switch between big endian / little endian when reading / writing files.
|
||||
*/
|
||||
public BasePackedOutputStream( Class<T> type, OutputStream outputStream, ByteOrder byteOrder) {
|
||||
this.targetOutputStream = outputStream;
|
||||
this.type = type;
|
||||
basesPerType = PackUtils.bitsInType(type)/PackUtils.BITS_PER_BASE;
|
||||
this.buffer = ByteBuffer.allocate(basesPerType/PackUtils.ALPHABET_SIZE).order(byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes an array of bases to the target output stream.
|
||||
* @param bases List of bases to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( byte[] bases ) throws IOException {
|
||||
int packedBases = 0;
|
||||
int positionInPack = 0;
|
||||
|
||||
for(byte base: bases) {
|
||||
packedBases = packBase(base, packedBases, positionInPack);
|
||||
|
||||
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
|
||||
positionInPack = ++positionInPack % basesPerType;
|
||||
if( positionInPack == 0 ) {
|
||||
writePackedBases(packedBases);
|
||||
packedBases = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if( positionInPack > 0 )
|
||||
writePackedBases(packedBases);
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush the contents of the OutputStream to disk.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void flush() throws IOException {
|
||||
targetOutputStream.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the given output stream.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
targetOutputStream.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Pack the given base into the basepack.
|
||||
* @param base The base to pack.
|
||||
* @param basePack Target for the pack operation.
|
||||
* @param position Position within the pack to which to add the base.
|
||||
* @return The packed integer.
|
||||
*/
|
||||
private int packBase( byte base, int basePack, int position ) {
|
||||
basePack |= (PackUtils.packBase(base) << 2*(basesPerType-position-1));
|
||||
return basePack;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the given packed base structure to the output file.
|
||||
* @param packedBases Packed bases to write.
|
||||
* @throws IOException on error writing to the file.
|
||||
*/
|
||||
private void writePackedBases(int packedBases) throws IOException {
|
||||
buffer.rewind();
|
||||
if( type == Integer.class )
|
||||
buffer.putInt(packedBases);
|
||||
else if( type == Byte.class )
|
||||
buffer.put((byte)packedBases);
|
||||
else
|
||||
throw new StingException("Cannot pack bases into type " + type.getName());
|
||||
targetOutputStream.write(buffer.array());
|
||||
}
|
||||
}
|
||||
|
|
@ -1,157 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Write packed bases to an output stream. Pack each base into 2 bits.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BytePackedOutputStream {
|
||||
/**
|
||||
* How many possible bases can be encoded?
|
||||
*/
|
||||
public static final int ALPHABET_SIZE = 4;
|
||||
|
||||
/**
|
||||
* Ultimate target for the packed bases.
|
||||
*/
|
||||
private final OutputStream targetOutputStream;
|
||||
|
||||
/**
|
||||
* The next byte to write to the output stream. Will be added
|
||||
* to the output stream when enough bases are accumulated, or when
|
||||
* the file is closed.
|
||||
*/
|
||||
private byte packedBases;
|
||||
|
||||
/**
|
||||
* Where will the next base be embedded into packedBases?
|
||||
*/
|
||||
private int positionInPack = 0;
|
||||
|
||||
public BytePackedOutputStream( File outputFile ) throws FileNotFoundException {
|
||||
this(new BufferedOutputStream(new FileOutputStream(outputFile)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Write packed bases to the given output stream.
|
||||
* @param outputStream Output stream to which to write packed bases.
|
||||
*/
|
||||
public BytePackedOutputStream( OutputStream outputStream ) {
|
||||
this.targetOutputStream = outputStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a given base to the output stream.
|
||||
* @param base Base to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( byte base ) throws IOException {
|
||||
packedBases |= (getPackedRepresentation(base) << 2*(ALPHABET_SIZE-positionInPack-1));
|
||||
|
||||
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
|
||||
positionInPack = ++positionInPack % ALPHABET_SIZE;
|
||||
if( positionInPack == 0 ) {
|
||||
targetOutputStream.write(packedBases);
|
||||
packedBases = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes an array of bases to the target output stream.
|
||||
* @param bases List of bases to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( byte[] bases ) throws IOException {
|
||||
for(byte base: bases) write(base);
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush the contents of the OutputStream to disk.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void flush() throws IOException {
|
||||
targetOutputStream.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the given output stream.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
// Write (incomplete) block in file, and number of bases in that last byte.
|
||||
if( positionInPack > 0 ) {
|
||||
targetOutputStream.write(packedBases);
|
||||
targetOutputStream.write(positionInPack);
|
||||
}
|
||||
else
|
||||
targetOutputStream.write(ALPHABET_SIZE);
|
||||
|
||||
targetOutputStream.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b.
|
||||
* @param base ASCII value for the base to pack.
|
||||
* @return A byte from 0-3 indicating the base's packed value.
|
||||
*/
|
||||
public static byte getPackedRepresentation(byte base) {
|
||||
switch( base ) {
|
||||
case 'A':
|
||||
return 0;
|
||||
case 'C':
|
||||
return 1;
|
||||
case 'G':
|
||||
return 2;
|
||||
case 'T':
|
||||
return 3;
|
||||
default:
|
||||
throw new StingException("Unknown base type: " + base);
|
||||
}
|
||||
}
|
||||
|
||||
public static byte decodePackedRepresentation(byte pack) {
|
||||
switch( pack ) {
|
||||
case 0:
|
||||
return 'A';
|
||||
case 1:
|
||||
return 'C';
|
||||
case 2:
|
||||
return 'G';
|
||||
case 3:
|
||||
return 'T';
|
||||
default:
|
||||
throw new StingException("Unknown pack type: " + pack);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -33,7 +33,6 @@ import net.sf.samtools.util.StringUtil;
|
|||
import java.io.*;
|
||||
import java.util.TreeSet;
|
||||
import java.util.Comparator;
|
||||
import java.util.Arrays;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
|
|
@ -56,7 +55,7 @@ public class CreateBWTFromReference {
|
|||
private int[] countOccurrences( String sequence ) {
|
||||
int occurrences[] = new int[ALPHABET_SIZE];
|
||||
for( char base: sequence.toCharArray() )
|
||||
occurrences[ BytePackedOutputStream.getPackedRepresentation((byte)base) ]++;
|
||||
occurrences[PackUtils.packBase((byte)base)]++;
|
||||
|
||||
// Make occurrences cumulative
|
||||
for( int i = 1; i < ALPHABET_SIZE; i++ )
|
||||
|
|
@ -182,7 +181,7 @@ public class CreateBWTFromReference {
|
|||
occurrenceWriter.write(occurrences);
|
||||
occurrenceWriter.flush();
|
||||
|
||||
WordPackedOutputStream sequenceOutputStream = new WordPackedOutputStream(bwtOutputStream,ByteOrder.LITTLE_ENDIAN);
|
||||
BasePackedOutputStream<Integer> sequenceOutputStream = new BasePackedOutputStream<Integer>(Integer.class,bwtOutputStream,ByteOrder.LITTLE_ENDIAN);
|
||||
sequenceOutputStream.write(bwt);
|
||||
sequenceOutputStream.close();
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
|||
import net.sf.picard.reference.ReferenceSequence;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Generate a .PAC file from a given reference.
|
||||
|
|
@ -53,9 +54,13 @@ public class CreatePACFromReference {
|
|||
|
||||
// Target file for output
|
||||
File outputFile = new File(argv[1]);
|
||||
BytePackedOutputStream outputStream = new BytePackedOutputStream(outputFile);
|
||||
OutputStream outputStream = new FileOutputStream(outputFile);
|
||||
|
||||
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
|
||||
basePackedOutputStream.write(sequence.getBases());
|
||||
|
||||
outputStream.write(sequence.getBases().length%PackUtils.ALPHABET_SIZE);
|
||||
|
||||
outputStream.write(sequence.getBases());
|
||||
outputStream.close();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,82 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/**
|
||||
* Utilities designed for packing / unpacking bases.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class PackUtils {
|
||||
/**
|
||||
* How many possible bases can be encoded?
|
||||
*/
|
||||
public static final int ALPHABET_SIZE = 4;
|
||||
|
||||
/**
|
||||
* How many bits does it take to store a single base?
|
||||
*/
|
||||
public static final int BITS_PER_BASE = (int)(Math.log(ALPHABET_SIZE)/Math.log(2));
|
||||
|
||||
/**
|
||||
* How many bits fit into a single byte?
|
||||
*/
|
||||
public static final int BITS_PER_BYTE = 8;
|
||||
|
||||
/**
|
||||
* How many bits can a given type hold?
|
||||
* @param type Type to test.
|
||||
* @return Number of bits that the given type can hold.
|
||||
*/
|
||||
public static final int bitsInType( Class<?> type ) {
|
||||
try {
|
||||
long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1;
|
||||
long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1;
|
||||
if( typeSize > intTypeSize )
|
||||
throw new StingException("Cannot determine number of bits available in type: " + type.getName());
|
||||
return (int)(Math.log(typeSize)/Math.log(2));
|
||||
}
|
||||
catch( NoSuchFieldException ex ) {
|
||||
throw new StingException("Cannot determine number of bits available in type: " + type.getName(),ex);
|
||||
}
|
||||
catch( IllegalAccessException ex ) {
|
||||
throw new StingException("Cannot determine number of bits available in type: " + type.getName(),ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b.
|
||||
* @param base ASCII value for the base to pack.
|
||||
* @return A byte from 0-3 indicating the base's packed value.
|
||||
*/
|
||||
public static byte packBase(byte base) {
|
||||
switch( base ) {
|
||||
case 'A':
|
||||
return 0;
|
||||
case 'C':
|
||||
return 1;
|
||||
case 'G':
|
||||
return 2;
|
||||
case 'T':
|
||||
return 3;
|
||||
default:
|
||||
throw new StingException("Unknown base type: " + base);
|
||||
}
|
||||
}
|
||||
|
||||
public static byte unpackBase(byte pack) {
|
||||
switch( pack ) {
|
||||
case 0:
|
||||
return 'A';
|
||||
case 1:
|
||||
return 'C';
|
||||
case 2:
|
||||
return 'G';
|
||||
case 3:
|
||||
return 'T';
|
||||
default:
|
||||
throw new StingException("Unknown pack type: " + pack);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -26,7 +26,7 @@ public class WordPackedInputStream {
|
|||
|
||||
public WordPackedInputStream( File inputFile, ByteOrder byteOrder ) throws FileNotFoundException {
|
||||
this.targetInputStream = new BufferedInputStream(new FileInputStream(inputFile));
|
||||
this.buffer = ByteBuffer.allocate(WordPackedOutputStream.BASES_PER_WORD/BytePackedOutputStream.ALPHABET_SIZE).order(byteOrder);
|
||||
this.buffer = ByteBuffer.allocate(PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BYTE).order(byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -44,9 +44,9 @@ public class WordPackedInputStream {
|
|||
List<Byte> bwtList = new ArrayList<Byte>();
|
||||
while(targetInputStream.read(buffer.array()) > 0) {
|
||||
int packedWord = buffer.getInt();
|
||||
for( int i = WordPackedOutputStream.BASES_PER_WORD-1; i >= 0; i-- ) {
|
||||
for( int i = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE - 1; i >= 0; i-- ) {
|
||||
byte packedByte = (byte)((packedWord >> i*2) & 0x3);
|
||||
bwtList.add(BytePackedOutputStream.decodePackedRepresentation(packedByte));
|
||||
bwtList.add(PackUtils.unpackBase(packedByte));
|
||||
}
|
||||
buffer.rewind();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,131 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
/**
|
||||
* Word-pack bases into the output stream. Bytes are stored as
|
||||
* little-endian unsigned ints.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class WordPackedOutputStream {
|
||||
/**
|
||||
* How many bases can be stored in the given word?
|
||||
*/
|
||||
public static final int BASES_PER_WORD = 16;
|
||||
|
||||
/**
|
||||
* Ultimate target for the packed bases.
|
||||
*/
|
||||
private final OutputStream targetOutputStream;
|
||||
|
||||
/**
|
||||
* The next byte to write to the output stream. Will be added
|
||||
* to the output stream when enough bases are accumulated, or when
|
||||
* the file is closed.
|
||||
*/
|
||||
private int packedBases;
|
||||
|
||||
/**
|
||||
* Where will the next base be embedded into packedBases?
|
||||
*/
|
||||
private int positionInPack = 0;
|
||||
|
||||
/**
|
||||
* A fixed-size buffer for word-packed data.
|
||||
*/
|
||||
private final ByteBuffer buffer;
|
||||
|
||||
public WordPackedOutputStream( File outputFile, ByteOrder byteOrder ) throws FileNotFoundException {
|
||||
this(new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write packed bases to the given output stream.
|
||||
* @param outputStream Output stream to which to write packed bases.
|
||||
* @param byteOrder Switch between big endian / little endian when reading / writing files.
|
||||
*/
|
||||
public WordPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) {
|
||||
this.targetOutputStream = outputStream;
|
||||
this.buffer = ByteBuffer.allocate(BASES_PER_WORD/BytePackedOutputStream.ALPHABET_SIZE).order(byteOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a given base to the output stream.
|
||||
* @param base Base to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( byte base ) throws IOException {
|
||||
packedBases |= (BytePackedOutputStream.getPackedRepresentation(base) << 2*(BASES_PER_WORD-positionInPack-1));
|
||||
|
||||
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
|
||||
positionInPack = ++positionInPack % BASES_PER_WORD;
|
||||
if( positionInPack == 0 ) {
|
||||
buffer.rewind();
|
||||
buffer.putInt(packedBases);
|
||||
targetOutputStream.write(buffer.array());
|
||||
packedBases = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes an array of bases to the target output stream.
|
||||
* @param bases List of bases to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write( byte[] bases ) throws IOException {
|
||||
for(byte base: bases) write(base);
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush the contents of the OutputStream to disk.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void flush() throws IOException {
|
||||
targetOutputStream.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the given output stream.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
// Write (incomplete) block in file, and number of bases in that last byte.
|
||||
if( positionInPack > 0 ) {
|
||||
buffer.rewind();
|
||||
buffer.putInt(packedBases);
|
||||
targetOutputStream.write(buffer.array());
|
||||
}
|
||||
targetOutputStream.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue