Proof-of-concept perfect read aligner, implemented as described in sec 2.4 of BWA paper. Has successfully aligned a handful of reads. Requires significant cleanup and refactoring.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1617 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-09-14 21:54:56 +00:00
parent 01e7b39c8d
commit 118071cfd8
2 changed files with 123 additions and 4 deletions

View File

@ -11,11 +11,22 @@ public class BWT {
* Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
* For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
*/
public static final int SEQUENCE_BLOCK_SIZE = 128;
public static final int SEQUENCE_BLOCK_SIZE = 128;
public final int inverseSA0;
public final Counts counts;
public final SequenceBlock[] sequenceBlocks;
/**
* The inverse SA, used as a placeholder for determining where the special EOL character sits.
*/
protected final int inverseSA0;
/**
* Cumulative counts for the entire BWT.
*/
protected final Counts counts;
/**
* The individual sequence blocks, modelling how they appear on disk.
*/
protected final SequenceBlock[] sequenceBlocks;
/**
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
@ -50,6 +61,38 @@ public class BWT {
return sequence;
}
/**
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
* @param base The base.
* @return Total counts for all bases lexicographically smaller than this base.
*/
public int counts(Base base) {
if( base.toPack() - 1 >= 0 )
return counts.getCumulative(Base.fromPack(base.toPack()-1));
else
return 0;
}
/**
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
* @param base The base.
* @param index The position to search within the BWT.
* @return Total counts for all bases lexicographically smaller than this base.
*/
public int occurrences(Base base,int index) {
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
if( index > inverseSA0 ) index--;
SequenceBlock block = sequenceBlocks[index/SEQUENCE_BLOCK_SIZE];
int position = index % SEQUENCE_BLOCK_SIZE;
int accumulator = block.occurrences.get(base);
for(int i = 0; i <= position; i++) {
if(base == Base.fromASCII(block.sequence[i]))
accumulator++;
}
return accumulator;
}
/**
* The number of bases in the BWT as a whole.
* @return Number of bases.

View File

@ -0,0 +1,76 @@
package org.broadinstitute.sting.bwa;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.StingException;
import java.io.File;
import java.io.FileNotFoundException;
import net.sf.picard.reference.ReferenceSequence;
/**
* A test harness to ensure that the perfect aligner works.
*
* @author mhanna
* @version 0.1
*/
public class PerfectAlignerTestHarness {
public static final String[] sampleReads = { "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA",
"AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTAG",
"GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGCGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCAGAT",
"GCTTTTCATTCTGACTGCAACGGGCAATATGTATCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA",
"GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA",
"CTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC",
"TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTACTGAACT",
"TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC",
"TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACT",
"CATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTT"};
private static BWT bwt;
private static SuffixArray suffixArray;
public static void main( String argv[] ) throws FileNotFoundException {
if( argv.length != 3 ) {
System.out.println("PerfectAlignerTestHarness <fasta> <bwt> <sa>");
System.exit(1);
}
File referenceFile = new File(argv[0]);
IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
File bwtFile = new File(argv[1]);
BWTReader reader = new BWTReader(bwtFile);
bwt = reader.read();
File suffixArrayFile = new File(argv[2]);
SuffixArrayReader suffixArrayReader = new SuffixArrayReader(suffixArrayFile);
suffixArray = suffixArrayReader.read();
for( String read: sampleReads ) {
int alignmentStart = align(read);
if( alignmentStart < 0 ) {
System.out.printf("Unable to align read %s%n",read);
continue;
}
ReferenceSequence subsequence = reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignmentStart,alignmentStart+read.length()-1);
for( int i = 0; i < subsequence.length(); i++) {
if( subsequence.getBases()[i] != read.charAt(i) )
throw new StingException("Read is not an exact match! Alignment has failed!");
}
System.out.printf("Read %s aligned to position %d%n", read, alignmentStart);
}
}
public static int align( String read ) {
int lowerBound = 0, upperBound = bwt.length();
for( int i = read.length()-1; i >= 0; i-- ) {
Base base = Base.fromASCII((byte)read.charAt(i));
lowerBound = bwt.counts(base) + bwt.occurrences(base,lowerBound-1)+1;
upperBound = bwt.counts(base) + bwt.occurrences(base,upperBound);
if( lowerBound > upperBound ) return -1;
}
int alignmentStart = suffixArray.sequence[lowerBound]+1;
//System.out.printf("Read = %s; final bounds: (%d->%d); suffix array = %d%n",read,lowerBound,upperBound,alignmentStart);
return alignmentStart;
}
}