Proof-of-concept perfect read aligner, implemented as described in sec 2.4 of BWA paper. Has successfully aligned a handful of reads. Requires significant cleanup and refactoring.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1617 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
01e7b39c8d
commit
118071cfd8
|
|
@ -11,11 +11,22 @@ public class BWT {
|
|||
* Write an occurrence table after every SEQUENCE_BLOCK_SIZE bases.
|
||||
* For this implementation to behave correctly, SEQUENCE_BLOCK_SIZE % 8 == 0
|
||||
*/
|
||||
public static final int SEQUENCE_BLOCK_SIZE = 128;
|
||||
public static final int SEQUENCE_BLOCK_SIZE = 128;
|
||||
|
||||
public final int inverseSA0;
|
||||
public final Counts counts;
|
||||
public final SequenceBlock[] sequenceBlocks;
|
||||
/**
|
||||
* The inverse SA, used as a placeholder for determining where the special EOL character sits.
|
||||
*/
|
||||
protected final int inverseSA0;
|
||||
|
||||
/**
|
||||
* Cumulative counts for the entire BWT.
|
||||
*/
|
||||
protected final Counts counts;
|
||||
|
||||
/**
|
||||
* The individual sequence blocks, modelling how they appear on disk.
|
||||
*/
|
||||
protected final SequenceBlock[] sequenceBlocks;
|
||||
|
||||
/**
|
||||
* Creates a new BWT with the given inverse SA, counts, and sequence (in ASCII).
|
||||
|
|
@ -50,6 +61,38 @@ public class BWT {
|
|||
return sequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
|
||||
* @param base The base.
|
||||
* @return Total counts for all bases lexicographically smaller than this base.
|
||||
*/
|
||||
public int counts(Base base) {
|
||||
if( base.toPack() - 1 >= 0 )
|
||||
return counts.getCumulative(Base.fromPack(base.toPack()-1));
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the total counts of bases lexicographically smaller than the given base, for Ferragina and Manzini's search.
|
||||
* @param base The base.
|
||||
* @param index The position to search within the BWT.
|
||||
* @return Total counts for all bases lexicographically smaller than this base.
|
||||
*/
|
||||
public int occurrences(Base base,int index) {
|
||||
// If the index is above the SA-1[0], remap it to the appropriate coordinate space.
|
||||
if( index > inverseSA0 ) index--;
|
||||
|
||||
SequenceBlock block = sequenceBlocks[index/SEQUENCE_BLOCK_SIZE];
|
||||
int position = index % SEQUENCE_BLOCK_SIZE;
|
||||
int accumulator = block.occurrences.get(base);
|
||||
for(int i = 0; i <= position; i++) {
|
||||
if(base == Base.fromASCII(block.sequence[i]))
|
||||
accumulator++;
|
||||
}
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
/**
|
||||
* The number of bases in the BWT as a whole.
|
||||
* @return Number of bases.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,76 @@
|
|||
package org.broadinstitute.sting.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
|
||||
/**
|
||||
* A test harness to ensure that the perfect aligner works.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class PerfectAlignerTestHarness {
|
||||
public static final String[] sampleReads = { "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA",
|
||||
"AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTAG",
|
||||
"GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGCGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCAGAT",
|
||||
"GCTTTTCATTCTGACTGCAACGGGCAATATGTATCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA",
|
||||
"GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA",
|
||||
"CTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC",
|
||||
"TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTACTGAACT",
|
||||
"TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC",
|
||||
"TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACT",
|
||||
"CATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTT"};
|
||||
|
||||
private static BWT bwt;
|
||||
private static SuffixArray suffixArray;
|
||||
|
||||
public static void main( String argv[] ) throws FileNotFoundException {
|
||||
if( argv.length != 3 ) {
|
||||
System.out.println("PerfectAlignerTestHarness <fasta> <bwt> <sa>");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
File referenceFile = new File(argv[0]);
|
||||
IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
|
||||
|
||||
File bwtFile = new File(argv[1]);
|
||||
BWTReader reader = new BWTReader(bwtFile);
|
||||
bwt = reader.read();
|
||||
|
||||
File suffixArrayFile = new File(argv[2]);
|
||||
SuffixArrayReader suffixArrayReader = new SuffixArrayReader(suffixArrayFile);
|
||||
suffixArray = suffixArrayReader.read();
|
||||
|
||||
for( String read: sampleReads ) {
|
||||
int alignmentStart = align(read);
|
||||
if( alignmentStart < 0 ) {
|
||||
System.out.printf("Unable to align read %s%n",read);
|
||||
continue;
|
||||
}
|
||||
ReferenceSequence subsequence = reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignmentStart,alignmentStart+read.length()-1);
|
||||
for( int i = 0; i < subsequence.length(); i++) {
|
||||
if( subsequence.getBases()[i] != read.charAt(i) )
|
||||
throw new StingException("Read is not an exact match! Alignment has failed!");
|
||||
}
|
||||
System.out.printf("Read %s aligned to position %d%n", read, alignmentStart);
|
||||
}
|
||||
}
|
||||
|
||||
public static int align( String read ) {
|
||||
int lowerBound = 0, upperBound = bwt.length();
|
||||
for( int i = read.length()-1; i >= 0; i-- ) {
|
||||
Base base = Base.fromASCII((byte)read.charAt(i));
|
||||
lowerBound = bwt.counts(base) + bwt.occurrences(base,lowerBound-1)+1;
|
||||
upperBound = bwt.counts(base) + bwt.occurrences(base,upperBound);
|
||||
if( lowerBound > upperBound ) return -1;
|
||||
}
|
||||
int alignmentStart = suffixArray.sequence[lowerBound]+1;
|
||||
//System.out.printf("Read = %s; final bounds: (%d->%d); suffix array = %d%n",read,lowerBound,upperBound,alignmentStart);
|
||||
return alignmentStart;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue