From adce3bd536d9828884c66cfc19b82a1900a58cdf Mon Sep 17 00:00:00 2001 From: hanna Date: Tue, 8 Sep 2009 22:11:03 +0000 Subject: [PATCH] My reference implementation is now generating a BWT which matches BWT-SW's. Note to self: never give project status in an svn log. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1550 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/bwa/BytePackedOutputStream.java | 15 +++++ .../sting/bwa/CreateBWTFromReference.java | 34 +++++++---- .../sting/bwa/WordPackedInputStream.java | 61 +++++++++++++++++++ .../sting/bwa/WordPackedOutputStream.java | 2 +- 4 files changed, 99 insertions(+), 13 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/bwa/WordPackedInputStream.java diff --git a/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java b/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java index 2b35ec062..f33e15fad 100755 --- a/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java +++ b/java/src/org/broadinstitute/sting/bwa/BytePackedOutputStream.java @@ -139,4 +139,19 @@ public class BytePackedOutputStream { } } + public static byte decodePackedRepresentation(byte pack) { + switch( pack ) { + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + throw new StingException("Unknown pack type: " + pack); + } + } + } diff --git a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java index 04d6a730d..254c92a9b 100755 --- a/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java +++ b/java/src/org/broadinstitute/sting/bwa/CreateBWTFromReference.java @@ -33,6 +33,7 @@ import net.sf.samtools.util.StringUtil; import java.io.*; import java.util.TreeSet; import java.util.Comparator; +import java.util.Arrays; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -107,12 +108,12 @@ public class CreateBWTFromReference { } private byte[] createBWT( String sequence, int[] suffixArray ) { - byte[] bwt = new byte[suffixArray.length]; - for( int i = 0; i < suffixArray.length; i++ ) { - // Find the first character after the current character in the rotation. If the character is past the end - // (in other words, '$'), back up to the previous character. - int sequenceEnd = Math.min((suffixArray[i]+suffixArray.length-1)%suffixArray.length, sequence.length()-1 ); - bwt[i] = (byte)sequence.charAt(sequenceEnd); + byte[] bwt = new byte[suffixArray.length-1]; + int i = 0; + for( int suffixArrayEntry: suffixArray ) { + if( suffixArrayEntry == 0 ) + continue; + bwt[i++] = (byte)sequence.charAt(suffixArrayEntry-1); } return bwt; } @@ -168,12 +169,7 @@ public class CreateBWTFromReference { byte[] bwt = creator.createBWT(sequence, suffixArray); String bwtAsString = new String(bwt); - //System.out.printf("BWT:%n"); - while( bwtAsString.length() > 0 ) { - int end = Math.min( 80, bwtAsString.length() ); - //System.out.printf("%s%n", bwtAsString.substring(0,end)); - bwtAsString = bwtAsString.substring(end); - } + System.out.printf("BWT: %s...%n", bwtAsString.substring(0,80)); OutputStream bwtOutputStream = new BufferedOutputStream(new FileOutputStream(bwtFile)); @@ -201,6 +197,20 @@ public class CreateBWTFromReference { saIntWriter.write(suffixArray, 1, suffixArray.length-1); saIntWriter.close(); + + File existingBwtFile = new File(inputFileName+".bwt"); + WordPackedInputStream inputStream = new WordPackedInputStream(existingBwtFile,ByteOrder.LITTLE_ENDIAN); + byte[] existingBwt = inputStream.read(); + + String existingBwtAsString = new String(existingBwt); + System.out.printf("Existing BWT: %s...%n",existingBwtAsString.substring(0,80)); + + for( int i = 0; i < bwt.length; i++ ) { + if( bwt[i] != existingBwt[i] ) { + System.out.printf("First bwt mismatch: %d%n",i); + break; + } + } } /** diff --git a/java/src/org/broadinstitute/sting/bwa/WordPackedInputStream.java b/java/src/org/broadinstitute/sting/bwa/WordPackedInputStream.java new file mode 100644 index 000000000..c777cce11 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/WordPackedInputStream.java @@ -0,0 +1,61 @@ +package org.broadinstitute.sting.bwa; + +import java.io.*; +import java.nio.ByteOrder; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.ArrayList; + +/** + * Reads a word-packed version of the input stream. + * + * @author mhanna + * @version 0.1 + */ +public class WordPackedInputStream { + + /** + * Ultimate source for packed bases. + */ + private final InputStream targetInputStream; + + /** + * A fixed-size buffer for word-packed data. + */ + private final ByteBuffer buffer; + + public WordPackedInputStream( File inputFile, ByteOrder byteOrder ) throws FileNotFoundException { + this.targetInputStream = new BufferedInputStream(new FileInputStream(inputFile)); + this.buffer = ByteBuffer.allocate(WordPackedOutputStream.BASES_PER_WORD/BytePackedOutputStream.ALPHABET_SIZE).order(byteOrder); + } + + /** + * Read the entire contents of the input stream. + * @throws IOException if an I/O error occurs. + */ + public byte[] read() throws IOException { + // Skip over header info. + for( int i = 0; i < 5; i++ ) { + targetInputStream.read(buffer.array()); + System.out.println("Skipping over: " + buffer.getInt()); + buffer.rewind(); + } + + List bwtList = new ArrayList(); + while(targetInputStream.read(buffer.array()) > 0) { + int packedWord = buffer.getInt(); + for( int i = WordPackedOutputStream.BASES_PER_WORD-1; i >= 0; i-- ) { + byte packedByte = (byte)((packedWord >> i*2) & 0x3); + bwtList.add(BytePackedOutputStream.decodePackedRepresentation(packedByte)); + } + buffer.rewind(); + } + + byte[] bwt = new byte[bwtList.size()]; + for(int i = 0; i < bwtList.size(); i++) + bwt[i] = bwtList.get(i); + + return bwt; + } + +} diff --git a/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java b/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java index af3e9488b..03eae9c17 100755 --- a/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java +++ b/java/src/org/broadinstitute/sting/bwa/WordPackedOutputStream.java @@ -40,7 +40,7 @@ public class WordPackedOutputStream { /** * How many bases can be stored in the given word? */ - private static final int BASES_PER_WORD = 16; + public static final int BASES_PER_WORD = 16; /** * Ultimate target for the packed bases.