Start to build BWT abstractions, so we can present a reasonable facsimile of the BWT to the user no matter how it's represented on disk.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1589 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c3f77acd5e
commit
0f3049652a
|
|
@ -1,5 +1,7 @@
|
||||||
package org.broadinstitute.sting.bwa;
|
package org.broadinstitute.sting.bwa;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Represents the Burrows-Wheeler Transform of a reference sequence.
|
* Represents the Burrows-Wheeler Transform of a reference sequence.
|
||||||
*
|
*
|
||||||
|
|
@ -7,18 +9,120 @@ package org.broadinstitute.sting.bwa;
|
||||||
* @version 0.1
|
* @version 0.1
|
||||||
*/
|
*/
|
||||||
public class BWT {
|
public class BWT {
|
||||||
|
/**
|
||||||
|
* Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
|
||||||
|
* For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
|
||||||
|
*/
|
||||||
|
public static final int SEQUENCE_BLOCK_SIZE = 128;
|
||||||
|
|
||||||
public final int inverseSA0;
|
public final int inverseSA0;
|
||||||
public final int[] occurrences;
|
public final int[] count;
|
||||||
public final byte[] sequence;
|
public final SequenceBlock[] sequenceBlocks;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
|
||||||
|
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||||
|
* @param count Cumulative count of bases, in A,C,G,T order.
|
||||||
|
* @param sequenceBlocks The full BWT sequence, sans the '$'.
|
||||||
|
*/
|
||||||
|
public BWT( int inverseSA0, int[] count, SequenceBlock[] sequenceBlocks ) {
|
||||||
|
this.inverseSA0 = inverseSA0;
|
||||||
|
this.count = count;
|
||||||
|
this.sequenceBlocks = sequenceBlocks;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
|
* Creates a new BWT with the given inverse SA, occurrences, and sequence (in ASCII).
|
||||||
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
* @param inverseSA0 Inverse SA entry for the first element. Will be missing from the BWT sequence.
|
||||||
* @param occurrences Cumulative number of occurrences of A,C,G,T, in order.
|
* @param count Cumulative count of bases, in A,C,G,T order.
|
||||||
* @param sequence The full BWT sequence, sans the '$'.
|
* @param sequence The full BWT sequence, sans the '$'.
|
||||||
*/
|
*/
|
||||||
public BWT( int inverseSA0, int[] occurrences, byte[] sequence ) {
|
public BWT( int inverseSA0, int[] count, byte[] sequence ) {
|
||||||
this.inverseSA0 = inverseSA0;
|
this(inverseSA0,count,generateSequenceBlocks(sequence));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract the full sequence from the list of block.
|
||||||
|
* @return The full BWT string as a byte array.
|
||||||
|
*/
|
||||||
|
public byte[] getSequence() {
|
||||||
|
byte[] sequence = new byte[count[count.length-1]];
|
||||||
|
for( SequenceBlock block: sequenceBlocks )
|
||||||
|
System.arraycopy(block.sequence,0,sequence,block.sequenceStart,block.sequenceLength);
|
||||||
|
return sequence;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The number of bases in the BWT as a whole.
|
||||||
|
* @return Number of bases.
|
||||||
|
*/
|
||||||
|
public int length() {
|
||||||
|
return count[count.length-1];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a set of sequence blocks from one long sequence.
|
||||||
|
* @param sequence Sequence from which to derive blocks.
|
||||||
|
* @return Array of sequence blocks containing data from the sequence.
|
||||||
|
*/
|
||||||
|
private static SequenceBlock[] generateSequenceBlocks( byte[] sequence ) {
|
||||||
|
int[] occurrences = {0,0,0,0};
|
||||||
|
|
||||||
|
int numSequenceBlocks = PackUtils.numberOfPartitions(sequence.length,SEQUENCE_BLOCK_SIZE);
|
||||||
|
SequenceBlock[] sequenceBlocks = new SequenceBlock[numSequenceBlocks];
|
||||||
|
|
||||||
|
for( int block = 0; block < numSequenceBlocks; block++ ) {
|
||||||
|
int blockStart = block*SEQUENCE_BLOCK_SIZE;
|
||||||
|
int blockLength = Math.min(SEQUENCE_BLOCK_SIZE, sequence.length-blockStart);
|
||||||
|
byte[] subsequence = new byte[blockLength];
|
||||||
|
|
||||||
|
System.arraycopy(sequence,blockStart,subsequence,0,blockLength);
|
||||||
|
|
||||||
|
sequenceBlocks[block] = new SequenceBlock(blockStart,blockLength,Arrays.copyOf(occurrences,occurrences.length),subsequence);
|
||||||
|
|
||||||
|
for( byte base: subsequence )
|
||||||
|
occurrences[PackUtils.packBase(base)]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return sequenceBlocks;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Models a block of bases within the BWT.
|
||||||
|
*/
|
||||||
|
class SequenceBlock {
|
||||||
|
/**
|
||||||
|
* Start position of this sequence within the BWT.
|
||||||
|
*/
|
||||||
|
public final int sequenceStart;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Length of this sequence within the BWT.
|
||||||
|
*/
|
||||||
|
public final int sequenceLength;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Occurrences of each letter up to this sequence block.
|
||||||
|
*/
|
||||||
|
public final int[] occurrences;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sequence for this segment.
|
||||||
|
*/
|
||||||
|
public final byte[] sequence;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new block within this BWT.
|
||||||
|
* @param sequenceStart Starting position of this sequence within the BWT.
|
||||||
|
* @param sequenceLength Length of this sequence.
|
||||||
|
* @param occurrences How many of each base has been seen before this sequence began.
|
||||||
|
* @param sequence The actual sequence from the BWT.
|
||||||
|
*/
|
||||||
|
SequenceBlock( int sequenceStart, int sequenceLength, int[] occurrences, byte[] sequence ) {
|
||||||
|
this.sequenceStart = sequenceStart;
|
||||||
|
this.sequenceLength = sequenceLength;
|
||||||
this.occurrences = occurrences;
|
this.occurrences = occurrences;
|
||||||
this.sequence = sequence;
|
this.sequence = sequence;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -39,20 +39,35 @@ public class BWTReader {
|
||||||
BasePackedInputStream basePackedInputStream = new BasePackedInputStream<Integer>(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
|
BasePackedInputStream basePackedInputStream = new BasePackedInputStream<Integer>(Integer.class, inputStream, ByteOrder.LITTLE_ENDIAN);
|
||||||
|
|
||||||
int inverseSA0;
|
int inverseSA0;
|
||||||
int[] occurrences;
|
int[] count;
|
||||||
byte[] bwt;
|
SequenceBlock[] sequenceBlocks;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
inverseSA0 = intPackedInputStream.read();
|
inverseSA0 = intPackedInputStream.read();
|
||||||
occurrences = new int[PackUtils.ALPHABET_SIZE];
|
count = new int[PackUtils.ALPHABET_SIZE];
|
||||||
intPackedInputStream.read(occurrences);
|
intPackedInputStream.read(count);
|
||||||
bwt = basePackedInputStream.read(occurrences[PackUtils.ALPHABET_SIZE-1]);
|
|
||||||
|
int bwtSize = count[PackUtils.ALPHABET_SIZE-1];
|
||||||
|
sequenceBlocks = new SequenceBlock[PackUtils.numberOfPartitions(bwtSize,BWT.SEQUENCE_BLOCK_SIZE)];
|
||||||
|
|
||||||
|
for( int block = 0; block < sequenceBlocks.length; block++ ) {
|
||||||
|
int sequenceStart = block*BWT.SEQUENCE_BLOCK_SIZE;
|
||||||
|
int sequenceLength = Math.min(BWT.SEQUENCE_BLOCK_SIZE,bwtSize-sequenceStart);
|
||||||
|
|
||||||
|
int[] occurrences = new int[PackUtils.ALPHABET_SIZE];
|
||||||
|
byte[] bwt = new byte[sequenceLength];
|
||||||
|
|
||||||
|
intPackedInputStream.read(occurrences);
|
||||||
|
basePackedInputStream.read(bwt);
|
||||||
|
|
||||||
|
sequenceBlocks[block] = new SequenceBlock(sequenceStart,sequenceLength,occurrences,bwt);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch( IOException ex ) {
|
catch( IOException ex ) {
|
||||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new BWT(inverseSA0, occurrences, bwt);
|
return new BWT(inverseSA0, count, sequenceBlocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ public class BWTWriter {
|
||||||
/**
|
/**
|
||||||
* Input stream from which to read BWT data.
|
* Input stream from which to read BWT data.
|
||||||
*/
|
*/
|
||||||
private OutputStream outputStream;
|
private final OutputStream outputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new BWT reader.
|
* Create a new BWT reader.
|
||||||
|
|
@ -40,8 +40,22 @@ public class BWTWriter {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
intPackedOutputStream.write(bwt.inverseSA0);
|
intPackedOutputStream.write(bwt.inverseSA0);
|
||||||
intPackedOutputStream.write(bwt.occurrences);
|
intPackedOutputStream.write(bwt.count);
|
||||||
basePackedOutputStream.write(bwt.sequence);
|
|
||||||
|
for( SequenceBlock block: bwt.sequenceBlocks ) {
|
||||||
|
intPackedOutputStream.write(block.occurrences);
|
||||||
|
basePackedOutputStream.write(block.sequence);
|
||||||
|
}
|
||||||
|
|
||||||
|
// The last block is the last set of counts in the structure.
|
||||||
|
int[] finalCount = new int[bwt.count.length];
|
||||||
|
for( int i = 0; i < finalCount.length; i++ ) {
|
||||||
|
if(i != 0)
|
||||||
|
finalCount[i] = bwt.count[i]-bwt.count[i-1];
|
||||||
|
else
|
||||||
|
finalCount[i] = bwt.count[i];
|
||||||
|
}
|
||||||
|
intPackedOutputStream.write(finalCount);
|
||||||
}
|
}
|
||||||
catch( IOException ex ) {
|
catch( IOException ex ) {
|
||||||
throw new StingException("Unable to read BWT from input stream.", ex);
|
throw new StingException("Unable to read BWT from input stream.", ex);
|
||||||
|
|
|
||||||
|
|
@ -43,14 +43,23 @@ public class BasePackedInputStream<T> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read the entire contents of the input stream.
|
* Read the entire contents of the input stream.
|
||||||
* @param length number of bases to read from the stream.
|
* @param bwt array into which bases should be read.
|
||||||
* @return a byte array of the given length.
|
* @return a byte array of the given length.
|
||||||
* @throws IOException if an I/O error occurs.
|
* @throws IOException if an I/O error occurs.
|
||||||
*/
|
*/
|
||||||
public byte[] read( int length ) throws IOException {
|
public void read(byte[] bwt) throws IOException {
|
||||||
byte[] bwt = new byte[length];
|
read(bwt,0,bwt.length);
|
||||||
int packedWord = 0;
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the next <code>length</code> bases into the bwt array, starting at the given offset.
|
||||||
|
* @param bwt array holding the given data.
|
||||||
|
* @param offset target position in the bases array into which bytes should be written.
|
||||||
|
* @param length number of bases to read from the stream.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void read( byte[] bwt, int offset, int length ) throws IOException {
|
||||||
|
int packedWord = 0;
|
||||||
final int basesPerEntry = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE;
|
final int basesPerEntry = PackUtils.bitsInType(Integer.class)/PackUtils.BITS_PER_BASE;
|
||||||
for( int i = 0; i < length; i++ ) {
|
for( int i = 0; i < length; i++ ) {
|
||||||
if( i % basesPerEntry == 0 ) {
|
if( i % basesPerEntry == 0 ) {
|
||||||
|
|
@ -60,10 +69,7 @@ public class BasePackedInputStream<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
int position = basesPerEntry - i % basesPerEntry - 1;
|
int position = basesPerEntry - i % basesPerEntry - 1;
|
||||||
bwt[i] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3));
|
bwt[i+offset] = PackUtils.unpackBase((byte)((packedWord >> position*PackUtils.BITS_PER_BASE) & 0x3));
|
||||||
}
|
}
|
||||||
|
|
||||||
return bwt;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -65,11 +65,22 @@ public class BasePackedOutputStream<T> {
|
||||||
* @throws IOException if an I/O error occurs.
|
* @throws IOException if an I/O error occurs.
|
||||||
*/
|
*/
|
||||||
public void write( byte[] bases ) throws IOException {
|
public void write( byte[] bases ) throws IOException {
|
||||||
|
write(bases,0,bases.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes a subset of the array of bases to the output stream.
|
||||||
|
* @param bases List of bases to write.
|
||||||
|
* @param offset site at which to start writing.
|
||||||
|
* @param length number of bases to write.
|
||||||
|
* @throws IOException if an I/O error occurs.
|
||||||
|
*/
|
||||||
|
public void write( byte[] bases, int offset, int length ) throws IOException {
|
||||||
int packedBases = 0;
|
int packedBases = 0;
|
||||||
int positionInPack = 0;
|
int positionInPack = 0;
|
||||||
|
|
||||||
for(byte base: bases) {
|
for( int base = offset; base < offset+length; base++ ) {
|
||||||
packedBases = packBase(base, packedBases, positionInPack);
|
packedBases = packBase(bases[base], packedBases, positionInPack);
|
||||||
|
|
||||||
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
|
// Increment the packed counter. If all possible bases have been squeezed into this byte, write it out.
|
||||||
positionInPack = ++positionInPack % basesPerType;
|
positionInPack = ++positionInPack % basesPerType;
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,13 @@ public class CreateBWTFromReference {
|
||||||
return StringUtil.bytesToString(sequence.getBases());
|
return StringUtil.bytesToString(sequence.getBases());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String loadReverseReference( File inputFile ) {
|
||||||
|
ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(inputFile);
|
||||||
|
ReferenceSequence sequence = reference.nextSequence();
|
||||||
|
PackUtils.reverse(sequence.getBases());
|
||||||
|
return StringUtil.bytesToString(sequence.getBases());
|
||||||
|
}
|
||||||
|
|
||||||
private int[] countOccurrences( String sequence ) {
|
private int[] countOccurrences( String sequence ) {
|
||||||
int occurrences[] = new int[ALPHABET_SIZE];
|
int occurrences[] = new int[ALPHABET_SIZE];
|
||||||
for( char base: sequence.toCharArray() )
|
for( char base: sequence.toCharArray() )
|
||||||
|
|
@ -118,8 +125,8 @@ public class CreateBWTFromReference {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main( String argv[] ) throws IOException {
|
public static void main( String argv[] ) throws IOException {
|
||||||
if( argv.length != 3 ) {
|
if( argv.length != 5 ) {
|
||||||
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output sa>");
|
System.out.println("USAGE: CreateBWTFromReference <input>.fasta <output bwt> <output rbwt> <output sa> <output rsa>");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -129,12 +136,19 @@ public class CreateBWTFromReference {
|
||||||
String bwtFileName = argv[1];
|
String bwtFileName = argv[1];
|
||||||
File bwtFile = new File(bwtFileName);
|
File bwtFile = new File(bwtFileName);
|
||||||
|
|
||||||
String saFileName = argv[2];
|
String rbwtFileName = argv[2];
|
||||||
|
File rbwtFile = new File(rbwtFileName);
|
||||||
|
|
||||||
|
String saFileName = argv[3];
|
||||||
File saFile = new File(saFileName);
|
File saFile = new File(saFileName);
|
||||||
|
|
||||||
|
String rsaFileName = argv[4];
|
||||||
|
File rsaFile = new File(rsaFileName);
|
||||||
|
|
||||||
CreateBWTFromReference creator = new CreateBWTFromReference();
|
CreateBWTFromReference creator = new CreateBWTFromReference();
|
||||||
|
|
||||||
String sequence = creator.loadReference(inputFile);
|
String sequence = creator.loadReference(inputFile);
|
||||||
|
String reverseSequence = creator.loadReverseReference(inputFile);
|
||||||
|
|
||||||
// Count the occurences of each given base.
|
// Count the occurences of each given base.
|
||||||
int[] occurrences = creator.countOccurrences(sequence);
|
int[] occurrences = creator.countOccurrences(sequence);
|
||||||
|
|
@ -142,17 +156,23 @@ public class CreateBWTFromReference {
|
||||||
|
|
||||||
// Generate the suffix array and print diagnostics.
|
// Generate the suffix array and print diagnostics.
|
||||||
int[] suffixArrayData = creator.createSuffixArray(sequence);
|
int[] suffixArrayData = creator.createSuffixArray(sequence);
|
||||||
|
int[] reverseSuffixArrayData = creator.createSuffixArray(reverseSequence);
|
||||||
|
|
||||||
// Invert the suffix array and print diagnostics.
|
// Invert the suffix array and print diagnostics.
|
||||||
int[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
|
int[] inverseSuffixArray = creator.invertSuffixArray(suffixArrayData);
|
||||||
|
int[] reverseInverseSuffixArray = creator.invertSuffixArray(reverseSuffixArrayData);
|
||||||
|
|
||||||
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
|
SuffixArray suffixArray = new SuffixArray( inverseSuffixArray[0], occurrences, suffixArrayData );
|
||||||
|
SuffixArray reverseSuffixArray = new SuffixArray( reverseInverseSuffixArray[0], occurrences, reverseSuffixArrayData );
|
||||||
|
|
||||||
|
/*
|
||||||
for( int i = 0; i < 8; i++ )
|
for( int i = 0; i < 8; i++ )
|
||||||
System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length())));
|
System.out.printf("suffixArray[%d] = %d (%s...)%n", i, suffixArray.sequence[i], sequence.substring(suffixArray.sequence[i],Math.min(suffixArray.sequence[i]+100,sequence.length())));
|
||||||
for( int i = 0; i < 8; i++ )
|
for( int i = 0; i < 8; i++ )
|
||||||
System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length())));
|
System.out.printf("inverseSuffixArray[%d] = %d (%s...)%n", i, inverseSuffixArray[i], sequence.substring(i,Math.min(i+100,sequence.length())));
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
// Create the data structure for the compressed suffix array and print diagnostics.
|
// Create the data structure for the compressed suffix array and print diagnostics.
|
||||||
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
|
int[] compressedSuffixArray = creator.createCompressedSuffixArray(suffixArray.sequence,inverseSuffixArray);
|
||||||
int reconstructedInverseSA = compressedSuffixArray[0];
|
int reconstructedInverseSA = compressedSuffixArray[0];
|
||||||
|
|
@ -166,30 +186,40 @@ public class CreateBWTFromReference {
|
||||||
for( int i = 0; i < 8; i++ ) {
|
for( int i = 0; i < 8; i++ ) {
|
||||||
System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
|
System.out.printf("inverseCompressedSuffixArray[%d] = %d%n", i, inverseCompressedSuffixArray[i]);
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
// Create the BWT.
|
// Create the BWT.
|
||||||
BWT bwt = new BWT( inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence) );
|
BWT bwt = new BWT( inverseSuffixArray[0], occurrences, creator.createBWT(sequence, suffixArray.sequence) );
|
||||||
|
BWT reverseBWT = new BWT( reverseInverseSuffixArray[0], occurrences, creator.createBWT(reverseSequence, reverseSuffixArray.sequence));
|
||||||
|
|
||||||
String bwtAsString = new String(bwt.sequence);
|
byte[] bwtSequence = bwt.getSequence();
|
||||||
System.out.printf("BWT: %s...%n", bwtAsString.substring(0,80));
|
System.out.printf("BWT: %s... (length = %d)%n", new String(bwtSequence,0,80),bwt.length());
|
||||||
|
|
||||||
BWTWriter bwtWriter = new BWTWriter(bwtFile);
|
BWTWriter bwtWriter = new BWTWriter(bwtFile);
|
||||||
bwtWriter.write(bwt);
|
bwtWriter.write(bwt);
|
||||||
bwtWriter.close();
|
bwtWriter.close();
|
||||||
|
|
||||||
|
BWTWriter reverseBWTWriter = new BWTWriter(rbwtFile);
|
||||||
|
reverseBWTWriter.write(reverseBWT);
|
||||||
|
reverseBWTWriter.close();
|
||||||
|
|
||||||
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
|
SuffixArrayWriter saWriter = new SuffixArrayWriter(saFile);
|
||||||
saWriter.write(suffixArray);
|
saWriter.write(suffixArray);
|
||||||
saWriter.close();
|
saWriter.close();
|
||||||
|
|
||||||
|
SuffixArrayWriter reverseSAWriter = new SuffixArrayWriter(rsaFile);
|
||||||
|
reverseSAWriter.write(reverseSuffixArray);
|
||||||
|
reverseSAWriter.close();
|
||||||
|
|
||||||
File existingBWTFile = new File(inputFileName+".bwt");
|
File existingBWTFile = new File(inputFileName+".bwt");
|
||||||
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
|
BWTReader existingBWTReader = new BWTReader(existingBWTFile);
|
||||||
BWT existingBWT = existingBWTReader.read();
|
BWT existingBWT = existingBWTReader.read();
|
||||||
|
|
||||||
String existingBwtAsString = new String(existingBWT.sequence);
|
byte[] existingBWTSequence = existingBWT.getSequence();
|
||||||
System.out.printf("Existing BWT: %s...%n",existingBwtAsString.substring(0,80));
|
System.out.printf("Existing BWT: %s... (length = %d)%n",new String(existingBWTSequence,0,80),existingBWT.length());
|
||||||
|
|
||||||
for( int i = 0; i < bwt.sequence.length; i++ ) {
|
for( int i = 0; i < bwt.length(); i++ ) {
|
||||||
if( bwt.sequence[i] != existingBWT.sequence[i] )
|
if( bwtSequence[i] != existingBWTSequence[i] )
|
||||||
throw new StingException("BWT mismatch at " + i);
|
throw new StingException("BWT mismatch at " + i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,8 +41,8 @@ import java.nio.ByteOrder;
|
||||||
|
|
||||||
public class CreatePACFromReference {
|
public class CreatePACFromReference {
|
||||||
public static void main( String argv[] ) throws IOException {
|
public static void main( String argv[] ) throws IOException {
|
||||||
if( argv.length != 2 ) {
|
if( argv.length != 3 ) {
|
||||||
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output>");
|
System.out.println("USAGE: CreatePACFromReference <input>.fasta <output pac> <output rpac>");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -53,13 +53,22 @@ public class CreatePACFromReference {
|
||||||
ReferenceSequence sequence = reference.nextSequence();
|
ReferenceSequence sequence = reference.nextSequence();
|
||||||
|
|
||||||
// Target file for output
|
// Target file for output
|
||||||
File outputFile = new File(argv[1]);
|
writeSequence( new File(argv[1]), sequence.getBases() );
|
||||||
|
|
||||||
|
// Reverse the bases in the reference
|
||||||
|
PackUtils.reverse(sequence.getBases());
|
||||||
|
|
||||||
|
// Target file for output
|
||||||
|
writeSequence( new File(argv[2]), sequence.getBases() );
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void writeSequence( File outputFile, byte[] bases ) throws IOException {
|
||||||
OutputStream outputStream = new FileOutputStream(outputFile);
|
OutputStream outputStream = new FileOutputStream(outputFile);
|
||||||
|
|
||||||
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
|
BasePackedOutputStream<Byte> basePackedOutputStream = new BasePackedOutputStream<Byte>(Byte.class, outputStream, ByteOrder.BIG_ENDIAN);
|
||||||
basePackedOutputStream.write(sequence.getBases());
|
basePackedOutputStream.write(bases);
|
||||||
|
|
||||||
outputStream.write(sequence.getBases().length%PackUtils.ALPHABET_SIZE);
|
outputStream.write(bases.length%PackUtils.ALPHABET_SIZE);
|
||||||
|
|
||||||
outputStream.close();
|
outputStream.close();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,7 @@ public class PackUtils {
|
||||||
* @param type Type to test.
|
* @param type Type to test.
|
||||||
* @return Number of bits that the given type can hold.
|
* @return Number of bits that the given type can hold.
|
||||||
*/
|
*/
|
||||||
public static final int bitsInType( Class<?> type ) {
|
public static int bitsInType( Class<?> type ) {
|
||||||
try {
|
try {
|
||||||
long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1;
|
long typeSize = type.getField("MAX_VALUE").getLong(null) - type.getField("MIN_VALUE").getLong(null)+1;
|
||||||
long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1;
|
long intTypeSize = (long)Integer.MAX_VALUE - (long)Integer.MIN_VALUE + 1;
|
||||||
|
|
@ -65,6 +65,11 @@ public class PackUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a two-bit representation of a base into an ASCII representation of a base.
|
||||||
|
* @param pack Byte from 0-3 indicating which base is represented.
|
||||||
|
* @return An ASCII value representing the packed base.
|
||||||
|
*/
|
||||||
public static byte unpackBase(byte pack) {
|
public static byte unpackBase(byte pack) {
|
||||||
switch( pack ) {
|
switch( pack ) {
|
||||||
case 0:
|
case 0:
|
||||||
|
|
@ -79,4 +84,28 @@ public class PackUtils {
|
||||||
throw new StingException("Unknown pack type: " + pack);
|
throw new StingException("Unknown pack type: " + pack);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reverses an unpacked sequence of bases.
|
||||||
|
* @param bases bases to reverse.
|
||||||
|
*/
|
||||||
|
public static void reverse( byte[] bases ) {
|
||||||
|
for( int i = 0, j = bases.length-1; i < j; i++, j-- ) {
|
||||||
|
byte temp = bases[j];
|
||||||
|
bases[j] = bases[i];
|
||||||
|
bases[i] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a structure of size <code>size</code> that should be split
|
||||||
|
* into <code>partitionSize</code> partitions, how many partitions should
|
||||||
|
* be created? Size of last partition will be <= partitionSize.
|
||||||
|
* @param size Total size of the data structure.
|
||||||
|
* @param partitionSize Size of an individual partition.
|
||||||
|
* @return Number of partitions that would be created.
|
||||||
|
*/
|
||||||
|
public static int numberOfPartitions( int size, int partitionSize ) {
|
||||||
|
return (size + partitionSize - 1)/partitionSize;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue