Proof-of-concept perfect read aligner, implemented as described in sec 2.4 of BWA paper. Has successfully aligned a handful of reads. Requires significant cleanup and refactoring.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1617 348d0f76-0448-11de-a6fe-93d51630548a
2009-09-14 21:54:56 +00:00 · 2009-09-14 21:54:56 +00:00 · 118071cfd8
parent 01e7b39c8d
commit 118071cfd8
2 changed files with 123 additions and 4 deletions
--- a/java/src/org/broadinstitute/sting/bwa/BWT.java
+++ b/java/src/org/broadinstitute/sting/bwa/BWT.java
@ -11,11 +11,22 @@ public class BWT {
     * Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
     * For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
     */
-    public static final int SEQUENCE_BLOCK_SIZE = 128;    
+    public static final int SEQUENCE_BLOCK_SIZE = 128;

-    public final int inverseSA0;
-    public final Counts counts;
-    public final SequenceBlock[] sequenceBlocks;
+    /**
+     * The inverse SA, used as a placeholder for determining where the special EOL character sits.
+     */
+    protected final int inverseSA0;
+
+    /**
+     * Cumulative counts for the entire BWT.
+     */
+    protected final Counts counts;
+
+    /**
+     * The individual sequence blocks, modelling how they appear on disk.
+     */
+    protected final SequenceBlock[] sequenceBlocks;

    /**
     * Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
@ -50,6 +61,38 @@ public class BWT {
        return sequence;
    }

+    /**
+     * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
+     * @param base The base.
+     * @return Total counts for all bases lexicographically smaller than this base.
+     */
+    public int counts(Base base) {
+        if( base.toPack() - 1 >= 0 )
+            return counts.getCumulative(Base.fromPack(base.toPack()-1));
+        else
+            return 0;
+    }
+
+    /**
+     * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
+     * @param base The base.
+     * @param index The position to search within the BWT.
+     * @return Total counts for all bases lexicographically smaller than this base.
+     */
+    public int occurrences(Base base,int index) {
+        // If the index is above the SA-1[0], remap it to the appropriate coordinate space.
+        if( index > inverseSA0 ) index--;
+
+        SequenceBlock block = sequenceBlocks[index/SEQUENCE_BLOCK_SIZE];
+        int position = index % SEQUENCE_BLOCK_SIZE;
+        int accumulator = block.occurrences.get(base);
+        for(int i = 0; i <= position; i++) {
+            if(base == Base.fromASCII(block.sequence[i]))
+                accumulator++;
+        }
+        return accumulator;
+    }
+
    /**
     * The number of bases in the BWT as a whole.
     * @return Number of bases.
--- a/java/src/org/broadinstitute/sting/bwa/PerfectAlignerTestHarness.java
+++ b/java/src/org/broadinstitute/sting/bwa/PerfectAlignerTestHarness.java
@ -0,0 +1,76 @@
+package org.broadinstitute.sting.bwa;
+
+import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
+import org.broadinstitute.sting.utils.StingException;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+
+import net.sf.picard.reference.ReferenceSequence;
+
+/**
+ * A test harness to ensure that the perfect aligner works.
+ *
+ * @author mhanna
+ * @version 0.1
+ */
+public class PerfectAlignerTestHarness {
+    public static final String[] sampleReads = { "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA",
+                                                 "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTAG",
+                                                 "GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGCGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCAGAT",
+                                                 "GCTTTTCATTCTGACTGCAACGGGCAATATGTATCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA",
+                                                 "GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA",
+                                                 "CTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC",
+                                                 "TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTACTGAACT",
+                                                 "TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC",
+                                                 "TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACT",
+                                                 "CATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTT"};
+
+    private static BWT bwt;
+    private static SuffixArray suffixArray;
+
+    public static void main( String argv[] ) throws FileNotFoundException {
+        if( argv.length != 3 ) {
+            System.out.println("PerfectAlignerTestHarness <fasta> <bwt> <sa>");
+            System.exit(1);
+        }
+
+        File referenceFile = new File(argv[0]);
+        IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
+
+        File bwtFile = new File(argv[1]);
+        BWTReader reader = new BWTReader(bwtFile);
+        bwt = reader.read();
+
+        File suffixArrayFile = new File(argv[2]);
+        SuffixArrayReader suffixArrayReader = new SuffixArrayReader(suffixArrayFile);
+        suffixArray = suffixArrayReader.read();
+
+        for( String read: sampleReads ) {
+            int alignmentStart = align(read);
+            if( alignmentStart < 0 ) {
+                System.out.printf("Unable to align read %s%n",read);
+                continue;
+            }
+            ReferenceSequence subsequence = reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignmentStart,alignmentStart+read.length()-1);
+            for( int i = 0; i < subsequence.length(); i++) {
+                if( subsequence.getBases()[i] != read.charAt(i) )
+                    throw new StingException("Read is not an exact match!  Alignment has failed!");
+            }
+            System.out.printf("Read %s aligned to position %d%n", read, alignmentStart);            
+        }
+    }
+
+    public static int align( String read ) {
+        int lowerBound = 0, upperBound = bwt.length();
+        for( int i = read.length()-1; i >= 0; i-- ) {
+            Base base = Base.fromASCII((byte)read.charAt(i));
+            lowerBound = bwt.counts(base) + bwt.occurrences(base,lowerBound-1)+1;
+            upperBound = bwt.counts(base) + bwt.occurrences(base,upperBound);
+            if( lowerBound > upperBound ) return -1;
+        }
+        int alignmentStart = suffixArray.sequence[lowerBound]+1;
+        //System.out.printf("Read = %s; final bounds: (%d->%d); suffix array = %d%n",read,lowerBound,upperBound,alignmentStart);
+        return alignmentStart;
+    }
+}