My reference implementation is now generating a BWT which matches BWT-SW's.

Note to self: never give project status in an svn log.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1550 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-09-08 22:11:03 +00:00
parent f22f590192
commit adce3bd536
4 changed files with 99 additions and 13 deletions

View File

@ -139,4 +139,19 @@ public class BytePackedOutputStream {
}
}
public static byte decodePackedRepresentation(byte pack) {
switch( pack ) {
case 0:
return 'A';
case 1:
return 'C';
case 2:
return 'G';
case 3:
return 'T';
default:
throw new StingException("Unknown pack type: " + pack);
}
}
}

View File

@ -33,6 +33,7 @@ import net.sf.samtools.util.StringUtil;
import java.io.*;
import java.util.TreeSet;
import java.util.Comparator;
import java.util.Arrays;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
@ -107,12 +108,12 @@ public class CreateBWTFromReference {
}
private byte[] createBWT( String sequence, int[] suffixArray ) {
byte[] bwt = new byte[suffixArray.length];
for( int i = 0; i < suffixArray.length; i++ ) {
// Find the first character after the current character in the rotation. If the character is past the end
// (in other words, '$'), back up to the previous character.
int sequenceEnd = Math.min((suffixArray[i]+suffixArray.length-1)%suffixArray.length, sequence.length()-1 );
bwt[i] = (byte)sequence.charAt(sequenceEnd);
byte[] bwt = new byte[suffixArray.length-1];
int i = 0;
for( int suffixArrayEntry: suffixArray ) {
if( suffixArrayEntry == 0 )
continue;
bwt[i++] = (byte)sequence.charAt(suffixArrayEntry-1);
}
return bwt;
}
@ -168,12 +169,7 @@ public class CreateBWTFromReference {
byte[] bwt = creator.createBWT(sequence, suffixArray);
String bwtAsString = new String(bwt);
//System.out.printf("BWT:%n");
while( bwtAsString.length() > 0 ) {
int end = Math.min( 80, bwtAsString.length() );
//System.out.printf("%s%n", bwtAsString.substring(0,end));
bwtAsString = bwtAsString.substring(end);
}
System.out.printf("BWT: %s...%n", bwtAsString.substring(0,80));
OutputStream bwtOutputStream = new BufferedOutputStream(new FileOutputStream(bwtFile));
@ -201,6 +197,20 @@ public class CreateBWTFromReference {
saIntWriter.write(suffixArray, 1, suffixArray.length-1);
saIntWriter.close();
File existingBwtFile = new File(inputFileName+".bwt");
WordPackedInputStream inputStream = new WordPackedInputStream(existingBwtFile,ByteOrder.LITTLE_ENDIAN);
byte[] existingBwt = inputStream.read();
String existingBwtAsString = new String(existingBwt);
System.out.printf("Existing BWT: %s...%n",existingBwtAsString.substring(0,80));
for( int i = 0; i < bwt.length; i++ ) {
if( bwt[i] != existingBwt[i] ) {
System.out.printf("First bwt mismatch: %d%n",i);
break;
}
}
}
/**

View File

@ -0,0 +1,61 @@
package org.broadinstitute.sting.bwa;
import java.io.*;
import java.nio.ByteOrder;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.ArrayList;
/**
* Reads a word-packed version of the input stream.
*
* @author mhanna
* @version 0.1
*/
public class WordPackedInputStream {
/**
* Ultimate source for packed bases.
*/
private final InputStream targetInputStream;
/**
* A fixed-size buffer for word-packed data.
*/
private final ByteBuffer buffer;
public WordPackedInputStream( File inputFile, ByteOrder byteOrder ) throws FileNotFoundException {
this.targetInputStream = new BufferedInputStream(new FileInputStream(inputFile));
this.buffer = ByteBuffer.allocate(WordPackedOutputStream.BASES_PER_WORD/BytePackedOutputStream.ALPHABET_SIZE).order(byteOrder);
}
/**
* Read the entire contents of the input stream.
* @throws IOException if an I/O error occurs.
*/
public byte[] read() throws IOException {
// Skip over header info.
for( int i = 0; i < 5; i++ ) {
targetInputStream.read(buffer.array());
System.out.println("Skipping over: " + buffer.getInt());
buffer.rewind();
}
List<Byte> bwtList = new ArrayList<Byte>();
while(targetInputStream.read(buffer.array()) > 0) {
int packedWord = buffer.getInt();
for( int i = WordPackedOutputStream.BASES_PER_WORD-1; i >= 0; i-- ) {
byte packedByte = (byte)((packedWord >> i*2) & 0x3);
bwtList.add(BytePackedOutputStream.decodePackedRepresentation(packedByte));
}
buffer.rewind();
}
byte[] bwt = new byte[bwtList.size()];
for(int i = 0; i < bwtList.size(); i++)
bwt[i] = bwtList.get(i);
return bwt;
}
}

View File

@ -40,7 +40,7 @@ public class WordPackedOutputStream {
/**
* How many bases can be stored in the given word?
*/
private static final int BASES_PER_WORD = 16;
public static final int BASES_PER_WORD = 16;
/**
* Ultimate target for the packed bases.