From 118071cfd8b7b1d85a4787697ff15f351aee5708 Mon Sep 17 00:00:00 2001 From: hanna Date: Mon, 14 Sep 2009 21:54:56 +0000 Subject: [PATCH] Proof-of-concept perfect read aligner, implemented as described in sec 2.4 of BWA paper. Has successfully aligned a handful of reads. Requires significant cleanup and refactoring. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1617 348d0f76-0448-11de-a6fe-93d51630548a --- .../src/org/broadinstitute/sting/bwa/BWT.java | 51 ++++++++++++- .../sting/bwa/PerfectAlignerTestHarness.java | 76 +++++++++++++++++++ 2 files changed, 123 insertions(+), 4 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/bwa/PerfectAlignerTestHarness.java diff --git a/java/src/org/broadinstitute/sting/bwa/BWT.java b/java/src/org/broadinstitute/sting/bwa/BWT.java index be127cbfc..ffd55dd42 100644 --- a/java/src/org/broadinstitute/sting/bwa/BWT.java +++ b/java/src/org/broadinstitute/sting/bwa/BWT.java @@ -11,11 +11,22 @@ public class BWT { * Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases. * For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0 */ - public static final int SEQUENCE_BLOCK_SIZE = 128; + public static final int SEQUENCE_BLOCK_SIZE = 128; - public final int inverseSA0; - public final Counts counts; - public final SequenceBlock[] sequenceBlocks; + /** + * The inverse SA, used as a placeholder for determining where the special EOL character sits. + */ + protected final int inverseSA0; + + /** + * Cumulative counts for the entire BWT. + */ + protected final Counts counts; + + /** + * The individual sequence blocks, modelling how they appear on disk. + */ + protected final SequenceBlock[] sequenceBlocks; /** * Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII). @@ -50,6 +61,38 @@ public class BWT { return sequence; } + /** + * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search. + * @param base The base. + * @return Total counts for all bases lexicographically smaller than this base. + */ + public int counts(Base base) { + if( base.toPack() - 1 >= 0 ) + return counts.getCumulative(Base.fromPack(base.toPack()-1)); + else + return 0; + } + + /** + * Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search. + * @param base The base. + * @param index The position to search within the BWT. + * @return Total counts for all bases lexicographically smaller than this base. + */ + public int occurrences(Base base,int index) { + // If the index is above the SA-1[0], remap it to the appropriate coordinate space. + if( index > inverseSA0 ) index--; + + SequenceBlock block = sequenceBlocks[index/SEQUENCE_BLOCK_SIZE]; + int position = index % SEQUENCE_BLOCK_SIZE; + int accumulator = block.occurrences.get(base); + for(int i = 0; i <= position; i++) { + if(base == Base.fromASCII(block.sequence[i])) + accumulator++; + } + return accumulator; + } + /** * The number of bases in the BWT as a whole. * @return Number of bases. diff --git a/java/src/org/broadinstitute/sting/bwa/PerfectAlignerTestHarness.java b/java/src/org/broadinstitute/sting/bwa/PerfectAlignerTestHarness.java new file mode 100644 index 000000000..a6a384510 --- /dev/null +++ b/java/src/org/broadinstitute/sting/bwa/PerfectAlignerTestHarness.java @@ -0,0 +1,76 @@ +package org.broadinstitute.sting.bwa; + +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.StingException; + +import java.io.File; +import java.io.FileNotFoundException; + +import net.sf.picard.reference.ReferenceSequence; + +/** + * A test harness to ensure that the perfect aligner works. + * + * @author mhanna + * @version 0.1 + */ +public class PerfectAlignerTestHarness { + public static final String[] sampleReads = { "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA", + "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTAG", + "GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGCGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCAGAT", + "GCTTTTCATTCTGACTGCAACGGGCAATATGTATCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA", + "GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA", + "CTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC", + "TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTACTGAACT", + "TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC", + "TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACT", + "CATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTT"}; + + private static BWT bwt; + private static SuffixArray suffixArray; + + public static void main( String argv[] ) throws FileNotFoundException { + if( argv.length != 3 ) { + System.out.println("PerfectAlignerTestHarness "); + System.exit(1); + } + + File referenceFile = new File(argv[0]); + IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); + + File bwtFile = new File(argv[1]); + BWTReader reader = new BWTReader(bwtFile); + bwt = reader.read(); + + File suffixArrayFile = new File(argv[2]); + SuffixArrayReader suffixArrayReader = new SuffixArrayReader(suffixArrayFile); + suffixArray = suffixArrayReader.read(); + + for( String read: sampleReads ) { + int alignmentStart = align(read); + if( alignmentStart < 0 ) { + System.out.printf("Unable to align read %s%n",read); + continue; + } + ReferenceSequence subsequence = reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignmentStart,alignmentStart+read.length()-1); + for( int i = 0; i < subsequence.length(); i++) { + if( subsequence.getBases()[i] != read.charAt(i) ) + throw new StingException("Read is not an exact match! Alignment has failed!"); + } + System.out.printf("Read %s aligned to position %d%n", read, alignmentStart); + } + } + + public static int align( String read ) { + int lowerBound = 0, upperBound = bwt.length(); + for( int i = read.length()-1; i >= 0; i-- ) { + Base base = Base.fromASCII((byte)read.charAt(i)); + lowerBound = bwt.counts(base) + bwt.occurrences(base,lowerBound-1)+1; + upperBound = bwt.counts(base) + bwt.occurrences(base,upperBound); + if( lowerBound > upperBound ) return -1; + } + int alignmentStart = suffixArray.sequence[lowerBound]+1; + //System.out.printf("Read = %s; final bounds: (%d->%d); suffix array = %d%n",read,lowerBound,upperBound,alignmentStart); + return alignmentStart; + } +}