Checkpoint. The BWT that we generate now matches the first 16% of the BWT that BWT-SW generates. Cleaned up output streams to separate the byte packing / word packing from the data structure generation.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1508 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-09-02 22:18:17 +00:00
parent 3dfc77dc89
commit e2a79c5cd9
5 changed files with 418 additions and 66 deletions

View File

@ -0,0 +1,142 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.bwa;
import org.broadinstitute.sting.utils.StingException;
import java.io.*;
/**
* Write packed bases to an output stream. Pack each base into 2 bits.
*
* @author mhanna
* @version 0.1
*/
public class BytePackedOutputStream {
/**
* How many possible bases can be encoded?
*/
public static final int ALPHABET_SIZE = 4;
/**
* Ultimate target for the packed bases.
*/
private final OutputStream targetOutputStream;
/**
* The next byte to write to the output stream. Will be added
* to the output stream when enough bases are accumulated, or when
* the file is closed.
*/
private byte packedBases;
/**
* Where will the next base be embedded into packedBases?
*/
private int positionInPack = 0;
public BytePackedOutputStream( File outputFile ) throws FileNotFoundException {
this(new BufferedOutputStream(new FileOutputStream(outputFile)));
}
/**
* Write packed bases to the given output stream.
* @param outputStream Output stream to which to write packed bases.
*/
public BytePackedOutputStream( OutputStream outputStream ) {
this.targetOutputStream = outputStream;
}
/**
* Write a given base to the output stream.
* @param base Base to write.
* @throws IOException if an I/O error occurs.
*/
public void write( byte base ) throws IOException {
packedBases |= (getPackedRepresentation(base) << 2*(ALPHABET_SIZE-positionInPack-1));
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
positionInPack = ++positionInPack % ALPHABET_SIZE;
if( positionInPack == 0 ) {
targetOutputStream.write(packedBases);
packedBases = 0;
}
}
/**
* Writes an array of bases to the target output stream.
* @param bases List of bases to write.
* @throws IOException if an I/O error occurs.
*/
public void write( byte[] bases ) throws IOException {
for(byte base: bases) write(base);
}
/**
* Flush the contents of the OutputStream to disk.
* @throws IOException if an I/O error occurs.
*/
public void flush() throws IOException {
targetOutputStream.flush();
}
/**
* Closes the given output stream.
* @throws IOException if an I/O error occurs.
*/
public void close() throws IOException {
// Write (incomplete) block in file, and number of bases in that last byte.
if( positionInPack > 0 ) {
targetOutputStream.write(packedBases);
targetOutputStream.write(positionInPack);
}
else
targetOutputStream.write(ALPHABET_SIZE);
targetOutputStream.close();
}
/**
* Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b.
* @param base ASCII value for the base to pack.
* @return A byte from 0-3 indicating the base's packed value.
*/
public static byte getPackedRepresentation(byte base) {
switch( base ) {
case 'A':
return 0;
case 'C':
return 1;
case 'G':
return 2;
case 'T':
return 3;
default:
throw new StingException("Unknown base type: " + base);
}
}
}

View File

@ -30,10 +30,11 @@ import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.util.StringUtil;
import java.io.IOException;
import java.io.File;
import java.io.*;
import java.util.TreeSet;
import java.util.Comparator;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
* Create a suffix array data structure.
@ -54,7 +55,7 @@ public class CreateBWTFromReference {
private int[] countOccurrences( String sequence ) {
int occurrences[] = new int[ALPHABET_SIZE];
for( char base: sequence.toCharArray() )
occurrences[ CreatePACFromReference.getPackedRepresentation((byte)base) ]++;
occurrences[ BytePackedOutputStream.getPackedRepresentation((byte)base) ]++;
// Make occurrences cumulative
for( int i = 1; i < ALPHABET_SIZE; i++ )
@ -98,34 +99,33 @@ public class CreateBWTFromReference {
return compressedSuffixArray;
}
private char[] createBWT( String sequence, int[] suffixArray ) {
char[] bwt = new char[suffixArray.length];
private byte[] createBWT( String sequence, int[] suffixArray ) {
byte[] bwt = new byte[suffixArray.length];
for( int i = 0; i < suffixArray.length; i++ ) {
// Find the first character after the current character in the rotation. If the character is past the end
// (in other words, '$'), back up to the previous character.
int sequenceEnd = Math.min((suffixArray[i]+suffixArray.length-1)%suffixArray.length, sequence.length()-1 );
bwt[i] = sequence.charAt(sequenceEnd);
bwt[i] = (byte)sequence.charAt(sequenceEnd);
}
return bwt;
}
public static void main( String argv[] ) throws IOException {
if( argv.length != 1 ) {
System.out.println("No reference");
if( argv.length != 2 ) {
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output>");
return;
}
String inputFileName = argv[0];
File inputFile = new File(inputFileName);
String outputFileName = argv[1];
File outputFile = new File(outputFileName);
CreateBWTFromReference creator = new CreateBWTFromReference();
String sequence = creator.loadReference(inputFile);
// Count the occurences of each given base.
int[] occurrences = creator.countOccurrences(sequence);
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]);
// Generate the suffix array and print diagnostics.
int[] suffixArray = creator.createSuffixArray(sequence);
for( int i = 0; i < 8; i++ )
@ -144,9 +144,35 @@ public class CreateBWTFromReference {
reconstructedInverseSA = compressedSuffixArray[reconstructedInverseSA];
}
// Count the occurences of each given base.
int[] occurrences = creator.countOccurrences(sequence);
System.out.printf("Occurrences: a=%d, c=%d, g=%d, t=%d%n",occurrences[0],occurrences[1],occurrences[2],occurrences[3]);
// Create the BWT.
char[] bwt = creator.createBWT(sequence, suffixArray);
System.out.printf("BWT: %s%n", new String(bwt));
byte[] bwt = creator.createBWT(sequence, suffixArray);
String bwtAsString = new String(bwt);
System.out.printf("BWT:%n");
while( bwtAsString.length() > 0 ) {
int end = Math.min( 80, bwtAsString.length() );
//System.out.printf("%s%n", bwtAsString.substring(0,end));
bwtAsString = bwtAsString.substring(end);
}
OutputStream outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
ByteBuffer buffer = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);
buffer.putInt(inverseSuffixArray[0]);
outputStream.write(buffer.array());
outputStream.flush();
OccurrenceOutputStream occurrenceWriter = new OccurrenceOutputStream(outputStream);
occurrenceWriter.write(occurrences);
occurrenceWriter.flush();
WordPackedOutputStream bwtOutputStream = new WordPackedOutputStream(outputStream,ByteOrder.LITTLE_ENDIAN);
bwtOutputStream.write(bwt);
bwtOutputStream.close();
}
/**

View File

@ -31,8 +31,6 @@ import net.sf.picard.reference.ReferenceSequence;
import java.io.*;
import org.broadinstitute.sting.utils.StingException;
/**
* Generate a .PAC file from a given reference.
*
@ -41,11 +39,9 @@ import org.broadinstitute.sting.utils.StingException;
*/
public class CreatePACFromReference {
private static final int ALPHABET_SIZE = 4;
public static void main( String argv[] ) throws FileNotFoundException, IOException {
if( argv.length != 1 ) {
System.out.println("No reference");
public static void main( String argv[] ) throws IOException {
if( argv.length != 2 ) {
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output>");
return;
}
@ -56,52 +52,10 @@ public class CreatePACFromReference {
ReferenceSequence sequence = reference.nextSequence();
// Target file for output
File outputFile = new File(inputFileName + ".mypac");
BufferedOutputStream outputStream = new BufferedOutputStream(new FileOutputStream(outputFile));
File outputFile = new File(argv[1]);
BytePackedOutputStream outputStream = new BytePackedOutputStream(outputFile);
// Number of bytes, rounded up, plus one extra byte indicating how many bases the last byte holds.
byte packed = 0;
int positionInPack = 0;
for( byte base: sequence.getBases() ) {
// Pack base into the appropriate bits of the byte.
packed |= (getPackedRepresentation(base) << 2*(ALPHABET_SIZE-positionInPack-1));
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
positionInPack = ++positionInPack % 4;
if( positionInPack == 0 ) {
outputStream.write(packed);
packed = 0;
}
}
// Last (incomplete) block in file.
if( positionInPack > 0 )
outputStream.write(packed);
// Last character of a .pac file is how many bases are in the last byte.
outputStream.write(positionInPack);
outputStream.write(sequence.getBases());
outputStream.close();
}
/**
* Gets the two-bit representation of a base. A=00b, C=01b, G=10b, T=11b.
* @param base ASCII value for the base to pack.
* @return A byte from 0-3 indicating the base's packed value.
*/
public static byte getPackedRepresentation(byte base) {
switch( base ) {
case 'A':
return 0;
case 'C':
return 1;
case 'G':
return 2;
case 'T':
return 3;
default:
throw new StingException("Unknown base type: " + base);
}
}
}

View File

@ -0,0 +1,99 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.bwa;
import org.broadinstitute.sting.utils.StingException;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
* Writes an occurrence array to the output file.
*
* @author mhanna
* @version 0.1
*/
public class OccurrenceOutputStream {
/**
* How many bytes does it take to hold an integer in Java?
*/
private static final int INT_SIZE_IN_BYTES = 4;
/**
* Ultimate target for the occurrence array.
*/
private final OutputStream targetOutputStream;
/**
* Create a new OccurrenceArrayOutputStream, writing to the given target file.
* @param outputFile target output file.
* @throws IOException if an I/O error occurs.
*/
public OccurrenceOutputStream( File outputFile ) throws IOException {
this(new FileOutputStream(outputFile));
}
/**
* Write occurrence array to the given OutputStream.
* @param outputStream Output stream to which to write packed bases.
* @throws IOException if an I/O error occurs.
*/
public OccurrenceOutputStream( OutputStream outputStream ) throws IOException {
this.targetOutputStream = outputStream;
}
/**
* Write the cumulative occurrences to the output stream.
* @param occurrences occurrences to write. occurrences.length must match alphabet size.
* @throws IOException if an I/O error occurs.
*/
public void write( int[] occurrences ) throws IOException {
if( occurrences.length > BytePackedOutputStream.ALPHABET_SIZE )
throw new StingException("Wrong number of occurrence data points; expected " + BytePackedOutputStream.ALPHABET_SIZE);
ByteBuffer buffer = ByteBuffer.allocate(INT_SIZE_IN_BYTES*occurrences.length).order(ByteOrder.LITTLE_ENDIAN);
for(int occurrence: occurrences)
buffer.putInt(occurrence);
targetOutputStream.write(buffer.array());
}
/**
* Flush the contents of the OutputStream to disk.
* @throws IOException if an I/O error occurs.
*/
public void flush() throws IOException {
targetOutputStream.flush();
}
/**
* Closes the given output stream.
* @throws IOException if an I/O error occurs.
*/
public void close() throws IOException {
targetOutputStream.close();
}
}

View File

@ -0,0 +1,131 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.bwa;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
/**
* Word-pack bases into the output stream. Bytes are stored as
* little-endian unsigned ints.
*
* @author mhanna
* @version 0.1
*/
public class WordPackedOutputStream {
/**
* How many bases can be stored in the given word?
*/
private static final int BASES_PER_WORD = 16;
/**
* Ultimate target for the packed bases.
*/
private final OutputStream targetOutputStream;
/**
* The next byte to write to the output stream. Will be added
* to the output stream when enough bases are accumulated, or when
* the file is closed.
*/
private int packedBases;
/**
* Where will the next base be embedded into packedBases?
*/
private int positionInPack = 0;
/**
* A fixed-size buffer for word-packed data.
*/
private final ByteBuffer buffer;
public WordPackedOutputStream( File outputFile, ByteOrder byteOrder ) throws FileNotFoundException {
this(new BufferedOutputStream(new FileOutputStream(outputFile)),byteOrder);
}
/**
* Write packed bases to the given output stream.
* @param outputStream Output stream to which to write packed bases.
* @param byteOrder Switch between big endian / little endian when reading / writing files.
*/
public WordPackedOutputStream(OutputStream outputStream, ByteOrder byteOrder) {
this.targetOutputStream = outputStream;
this.buffer = ByteBuffer.allocate(BASES_PER_WORD/BytePackedOutputStream.ALPHABET_SIZE).order(byteOrder);
}
/**
* Write a given base to the output stream.
* @param base Base to write.
* @throws IOException if an I/O error occurs.
*/
public void write( byte base ) throws IOException {
packedBases |= (BytePackedOutputStream.getPackedRepresentation(base) << 2*(BASES_PER_WORD-positionInPack-1));
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
positionInPack = ++positionInPack % BASES_PER_WORD;
if( positionInPack == 0 ) {
buffer.rewind();
buffer.putInt(packedBases);
targetOutputStream.write(buffer.array());
packedBases = 0;
}
}
/**
* Writes an array of bases to the target output stream.
* @param bases List of bases to write.
* @throws IOException if an I/O error occurs.
*/
public void write( byte[] bases ) throws IOException {
for(byte base: bases) write(base);
}
/**
* Flush the contents of the OutputStream to disk.
* @throws IOException if an I/O error occurs.
*/
public void flush() throws IOException {
targetOutputStream.flush();
}
/**
* Closes the given output stream.
* @throws IOException if an I/O error occurs.
*/
public void close() throws IOException {
// Write (incomplete) block in file, and number of bases in that last byte.
if( positionInPack > 0 ) {
buffer.rewind();
buffer.putInt(packedBases);
targetOutputStream.write(buffer.array());
}
targetOutputStream.close();
}
}