More comprehensive testing of BWT (mismatches only) module, and lots of bug fixes.
Limitations: 1) Can't handle RC alignments. 2) Can't handle indels. 3) Can't handle N's in reference bases. 4) Stops at first hit. Ran BWT over a test suite of 800k Ecoli reads. After removing alignments with indels / reads with Ns, the remaining reads were aligned with quality 'equal to' that of the alignment stored in the BAM file. In this case 'equal' quality is <= mismatches to the reference as the existing alignment stored in the BAM file. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1710 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b446b3f1b6
commit
b0ec7fc144
|
|
@ -7,6 +7,18 @@ package org.broadinstitute.sting.alignment;
|
|||
* @version 0.1
|
||||
*/
|
||||
public interface Alignment extends Comparable<Alignment> {
|
||||
/**
|
||||
* Gets the starting position for the given alignment.
|
||||
* @return Starting position.
|
||||
*/
|
||||
public int getAlignmentStart();
|
||||
|
||||
/**
|
||||
* Is the given alignment on the reverse strand?
|
||||
* @return True if the alignment is on the reverse strand.
|
||||
*/
|
||||
public boolean isNegativeStrand();
|
||||
|
||||
/**
|
||||
* Gets the score of this alignment.
|
||||
* @return The score.
|
||||
|
|
|
|||
|
|
@ -1,21 +1,19 @@
|
|||
package org.broadinstitute.sting.alignment.bwa;
|
||||
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.alignment.bwa.bwt.SuffixArrayReader;
|
||||
import org.broadinstitute.sting.alignment.bwa.bwt.*;
|
||||
import org.broadinstitute.sting.alignment.Aligner;
|
||||
import org.broadinstitute.sting.alignment.Alignment;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
|
||||
/**
|
||||
* A test harness to ensure that the perfect aligner works.
|
||||
|
|
@ -24,39 +22,28 @@ import net.sf.samtools.SAMFileReader;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class AlignerTestHarness {
|
||||
public static final String[] sampleReads = { "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA",
|
||||
"AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTAG",
|
||||
"GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGCGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCAGAT",
|
||||
"GCTTTTCATTCTGACTGCAACGGGCAATATGTATCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA",
|
||||
"GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA",
|
||||
"CTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC",
|
||||
"TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTACTGAACT",
|
||||
"TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC",
|
||||
"TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACT",
|
||||
"CATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTT"};
|
||||
|
||||
private static BWT bwt;
|
||||
private static SuffixArray suffixArray;
|
||||
|
||||
public static void main( String argv[] ) throws FileNotFoundException {
|
||||
if( argv.length != 4 ) {
|
||||
System.out.println("PerfectAlignerTestHarness <fasta> <bwt> <rbwt>");
|
||||
if( argv.length != 5 ) {
|
||||
System.out.println("PerfectAlignerTestHarness <fasta> <bwt> <rbwt> <sa> <bam>");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
File referenceFile = new File(argv[0]);
|
||||
File bwtFile = new File(argv[1]);
|
||||
File rbwtFile = new File(argv[2]);
|
||||
File bamFile = new File(argv[3]);
|
||||
File suffixArrayFile = null;
|
||||
File reverseSuffixArrayFile = new File(argv[3]);
|
||||
File bamFile = new File(argv[4]);
|
||||
|
||||
align(referenceFile,bwtFile,rbwtFile,bamFile);
|
||||
align(referenceFile,bwtFile,rbwtFile,reverseSuffixArrayFile,bamFile);
|
||||
}
|
||||
|
||||
private static void align(File referenceFile, File bwtFile, File rbwtFile, File bamFile) throws FileNotFoundException {
|
||||
private static void align(File referenceFile, File bwtFile, File rbwtFile, File reverseSuffixArrayFile, File bamFile) throws FileNotFoundException {
|
||||
BWT bwt = new BWTReader(bwtFile).read();
|
||||
|
||||
Aligner aligner = new BWAAligner(bwtFile,rbwtFile);
|
||||
Aligner aligner = new BWAAligner(bwtFile,rbwtFile,reverseSuffixArrayFile);
|
||||
int count = 0;
|
||||
|
||||
SAMFileReader reader = new SAMFileReader(bamFile);
|
||||
|
|
@ -64,50 +51,63 @@ public class AlignerTestHarness {
|
|||
|
||||
for(SAMRecord read: reader) {
|
||||
count++;
|
||||
if( count > 15 ) break;
|
||||
//if( count != 1 && count != 15 ) continue;
|
||||
//if( count > 500 ) break;
|
||||
//if( count != 39 /*&& count != 15*/ ) continue;
|
||||
//if( !read.getReadName().endsWith("1507:1636#0") )
|
||||
// continue;
|
||||
|
||||
boolean skipRead = false;
|
||||
|
||||
for( CigarElement cigarElement: read.getCigar().getCigarElements() ) {
|
||||
if( cigarElement.getOperator() != CigarOperator.M ) {
|
||||
System.out.printf("Skipping read %s because it features indels%n", read.getReadName());
|
||||
skipRead = true;
|
||||
}
|
||||
}
|
||||
|
||||
if(read.getReadString().indexOf("N") >= 0) {
|
||||
System.out.printf("Skipping read %s because it contains Ns%n", read.getReadName());
|
||||
skipRead = true;
|
||||
}
|
||||
|
||||
if(skipRead) continue;
|
||||
|
||||
List<Alignment> alignments = aligner.align(read);
|
||||
if(alignments.size() > 0 )
|
||||
System.out.printf("%s: Aligned read to reference with %d mismatches.%n", read.getReadName(), alignments.get(0).getScore());
|
||||
else
|
||||
System.out.printf("%s: Failed to align read to reference.%n", read.getReadName());
|
||||
}
|
||||
}
|
||||
if(alignments.size() == 0 )
|
||||
throw new StingException(String.format("Unable to align read %s to reference.",read.getReadName()));
|
||||
|
||||
private static void alignPerfect(File referenceFile, File bwtFile, File suffixArrayFile) throws FileNotFoundException
|
||||
{
|
||||
IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
|
||||
BWT bwt = new BWTReader(bwtFile).read();
|
||||
SuffixArray suffixArray = new SuffixArrayReader(suffixArrayFile).read();
|
||||
//System.out.printf("%s: Aligned read to reference at position %d with %d mismatches.%n", read.getReadName(), alignments.get(0).getAlignmentStart(), alignments.get(0).getScore());
|
||||
|
||||
for( String read: sampleReads ) {
|
||||
int alignmentStart = align(read);
|
||||
if( alignmentStart < 0 ) {
|
||||
System.out.printf("Unable to align read %s%n",read);
|
||||
continue;
|
||||
Alignment alignment = alignments.get(0);
|
||||
if( read.getAlignmentStart() != alignment.getAlignmentStart() ) {
|
||||
IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile);
|
||||
String expectedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),read.getAlignmentStart(),read.getAlignmentStart()+read.getReadLength()-1).getBases());
|
||||
int expectedMismatches = 0;
|
||||
for( int i = 0; i < read.getReadLength(); i++ ) {
|
||||
if( read.getReadBases()[i] != expectedRef.charAt(i) )
|
||||
expectedMismatches++;
|
||||
}
|
||||
|
||||
String alignedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignments.get(0).getAlignmentStart(),alignments.get(0).getAlignmentStart()+read.getReadLength()-1).getBases());
|
||||
int actualMismatches = 0;
|
||||
for( int i = 0; i < read.getReadLength(); i++ ) {
|
||||
if( read.getReadBases()[i] != expectedRef.charAt(i) )
|
||||
actualMismatches++;
|
||||
}
|
||||
|
||||
if( expectedMismatches != actualMismatches ) {
|
||||
System.out.printf("read = %s%n", read.getReadString());
|
||||
System.out.printf("expected ref = %s%n", expectedRef);
|
||||
System.out.printf("actual ref = %s%n", alignedRef);
|
||||
throw new StingException(String.format("Read %s was placed at incorrect location; target alignment = %d; actual alignment = %d%n",read.getReadName(),read.getAlignmentStart(),alignment.getAlignmentStart()));
|
||||
}
|
||||
}
|
||||
ReferenceSequence subsequence = reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignmentStart,alignmentStart+read.length()-1);
|
||||
for( int i = 0; i < subsequence.length(); i++) {
|
||||
if( subsequence.getBases()[i] != read.charAt(i) )
|
||||
throw new StingException("Read is not an exact match! Alignment has failed!");
|
||||
}
|
||||
System.out.printf("Read %s aligned to position %d%n", read, alignmentStart);
|
||||
}
|
||||
}
|
||||
|
||||
private static int align(String read) {
|
||||
int lowerBound = 0, upperBound = bwt.length();
|
||||
for( int i = read.length()-1; i >= 0; i-- ) {
|
||||
Base base = Base.fromASCII((byte)read.charAt(i));
|
||||
lowerBound = bwt.counts(base) + bwt.occurrences(base,lowerBound-1)+1;
|
||||
upperBound = bwt.counts(base) + bwt.occurrences(base,upperBound);
|
||||
if( lowerBound > upperBound ) return -1;
|
||||
if( count % 1000 == 0 )
|
||||
System.out.printf("%d reads examined.%n",count);
|
||||
}
|
||||
int alignmentStart = suffixArray.sequence[lowerBound]+1;
|
||||
//System.out.printf("Read = %s; final bounds: (%d->%d); suffix array = %d%n",read,lowerBound,upperBound,alignmentStart);
|
||||
return alignmentStart;
|
||||
}
|
||||
|
||||
System.out.printf("%d reads examined.%n",count);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
package org.broadinstitute.sting.alignment.bwa;
|
||||
|
||||
import org.broadinstitute.sting.alignment.bwa.bwt.BWT;
|
||||
import org.broadinstitute.sting.alignment.bwa.bwt.BWTReader;
|
||||
import org.broadinstitute.sting.alignment.bwa.bwt.Base;
|
||||
import org.broadinstitute.sting.alignment.bwa.bwt.*;
|
||||
import org.broadinstitute.sting.alignment.Alignment;
|
||||
import org.broadinstitute.sting.alignment.Aligner;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -14,7 +12,6 @@ import java.util.Collections;
|
|||
import java.util.EnumSet;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.picard.util.SequenceUtil;
|
||||
|
||||
/**
|
||||
* Create imperfect alignments from the read to the genome represented by the given BWT / suffix array.
|
||||
|
|
@ -33,6 +30,11 @@ public class BWAAligner implements Aligner {
|
|||
*/
|
||||
private BWT reverseBWT;
|
||||
|
||||
/**
|
||||
* Suffix array in the forward direction.
|
||||
*/
|
||||
private SuffixArray reverseSuffixArray;
|
||||
|
||||
/**
|
||||
* Maximum edit distance (-n option from original BWA).
|
||||
*/
|
||||
|
|
@ -63,41 +65,62 @@ public class BWAAligner implements Aligner {
|
|||
*/
|
||||
private static final int GAP_EXTENSION_PENALTY = 4;
|
||||
|
||||
public BWAAligner( File forwardBWTFile, File reverseBWTFile ) {
|
||||
public BWAAligner( File forwardBWTFile, File reverseBWTFile, File reverseSuffixArrayFile ) {
|
||||
forwardBWT = new BWTReader(forwardBWTFile).read();
|
||||
reverseBWT = new BWTReader(reverseBWTFile).read();
|
||||
reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile).read();
|
||||
}
|
||||
|
||||
public List<Alignment> align( SAMRecord read ) {
|
||||
byte[] bases = read.getReadBases();
|
||||
byte[] forwardBases = read.getReadBases();
|
||||
byte[] reverseBases = BaseUtils.simpleReverseComplement(forwardBases);
|
||||
|
||||
List<LowerBound> reverseLowerBounds = LowerBound.create(bases,reverseBWT);
|
||||
// for( int i = 0; i < reverseLowerBounds.size(); i++ )
|
||||
// System.out.printf("ReverseBWT: lb[%d] = %s%n",i,reverseLowerBounds.get(i));
|
||||
List<LowerBound> forwardLowerBounds = LowerBound.create(forwardBases,forwardBWT);
|
||||
//for( int i = 0; i < forwardLowerBounds.size(); i++ )
|
||||
// System.out.printf("ForwardBWT: lb[%d] = %s%n",i,forwardLowerBounds.get(i));
|
||||
List<LowerBound> reverseLowerBounds = LowerBound.create(reverseBases,reverseBWT);
|
||||
//for( int i = 0; i < reverseLowerBounds.size(); i++ )
|
||||
// System.out.printf("ReverseBWT: lb[%d] = %s%n",i,reverseLowerBounds.get(i));
|
||||
|
||||
PriorityQueue<BWAAlignment> alignments = new PriorityQueue<BWAAlignment>();
|
||||
|
||||
// Create a fictional initial alignment, with the position just off the end of the read, and the limits
|
||||
// set as the entire BWT.
|
||||
BWAAlignment initial = new BWAAlignment();
|
||||
initial.position = read.getReadLength();
|
||||
initial.negativeStrand = true;
|
||||
initial.position = 0;
|
||||
initial.loBound = 0;
|
||||
initial.hiBound = forwardBWT.length();
|
||||
initial.mismatches = 0;
|
||||
|
||||
BWAAlignment initialReverse = new BWAAlignment();
|
||||
initialReverse.negativeStrand = false;
|
||||
initialReverse.position = 0;
|
||||
initialReverse.loBound = 0;
|
||||
initialReverse.hiBound = reverseBWT.length();
|
||||
initialReverse.mismatches = 0;
|
||||
|
||||
alignments.add(initial);
|
||||
//alignments.add(initialReverse);
|
||||
|
||||
while(!alignments.isEmpty()) {
|
||||
BWAAlignment alignment = alignments.remove();
|
||||
|
||||
// Done with this particular alignment.
|
||||
if(alignment.position == 0)
|
||||
return Collections.<Alignment>singletonList(alignment);
|
||||
byte[] bases = alignment.negativeStrand ? forwardBases : reverseBases;
|
||||
BWT bwt = alignment.negativeStrand ? reverseBWT : forwardBWT;
|
||||
List<LowerBound> lowerBounds = alignment.negativeStrand ? forwardLowerBounds : reverseLowerBounds;
|
||||
|
||||
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d%n", alignments.size(), alignment, lowerBounds.get(alignment.position-1).value);
|
||||
// Done with this particular alignment.
|
||||
if(alignment.position == read.getReadLength()-1) {
|
||||
alignment.alignmentStart = reverseBWT.length() - (reverseSuffixArray.get(alignment.loBound)+read.getReadLength()) + 1;
|
||||
return Collections.<Alignment>singletonList(alignment);
|
||||
}
|
||||
|
||||
//System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d%n", alignments.size(), alignment, lowerBounds.get(alignment.position).value);
|
||||
|
||||
// if z < D(i) then return {}
|
||||
if( alignment.mismatches > reverseLowerBounds.get(alignment.position-1).value )
|
||||
int mismatches = MAXIMUM_EDIT_DISTANCE - alignment.mismatches;
|
||||
if( mismatches < lowerBounds.get(alignment.position+1).value )
|
||||
continue;
|
||||
|
||||
if( alignment.mismatches > MAXIMUM_EDIT_DISTANCE )
|
||||
|
|
@ -107,11 +130,13 @@ public class BWAAligner implements Aligner {
|
|||
for(Base base: EnumSet.allOf(Base.class)) {
|
||||
// Create and initialize a new alignment, given that base as the candidate.
|
||||
BWAAlignment newAlignment = new BWAAlignment();
|
||||
newAlignment.position = alignment.position - 1;
|
||||
newAlignment.loBound = forwardBWT.counts(base) + forwardBWT.occurrences(base,alignment.loBound-1) + 1;
|
||||
newAlignment.hiBound = forwardBWT.counts(base) + forwardBWT.occurrences(base,alignment.hiBound);
|
||||
newAlignment.negativeStrand = alignment.negativeStrand;
|
||||
newAlignment.position = alignment.position + 1;
|
||||
|
||||
newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1;
|
||||
newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound);
|
||||
newAlignment.mismatches = alignment.mismatches;
|
||||
if( base.toASCII() != read.getReadBases()[newAlignment.position] )
|
||||
if( base.toASCII() != bases[newAlignment.position] )
|
||||
newAlignment.mismatches++;
|
||||
|
||||
// If this alignment is valid, add it to the list.
|
||||
|
|
|
|||
|
|
@ -14,6 +14,16 @@ import net.sf.samtools.SAMRecord;
|
|||
public class BWAAlignment implements Alignment {
|
||||
enum State { MATCH, INSERTION, DELETION }
|
||||
|
||||
/**
|
||||
* Start of the final alignment.
|
||||
*/
|
||||
protected int alignmentStart;
|
||||
|
||||
/**
|
||||
* Working variable. Is this match being treated as a negative or positive strand?
|
||||
*/
|
||||
protected boolean negativeStrand;
|
||||
|
||||
/**
|
||||
* Working variable. How many bases have been matched at this point.
|
||||
*/
|
||||
|
|
@ -37,7 +47,23 @@ public class BWAAlignment implements Alignment {
|
|||
/**
|
||||
* Indicates the current state of an alignment. Are we in an insertion? Deletion?
|
||||
*/
|
||||
protected State alignmentState;
|
||||
protected State alignmentState;
|
||||
|
||||
/**
|
||||
* Gets the starting position for the given alignment.
|
||||
* @return Starting position.
|
||||
*/
|
||||
public int getAlignmentStart() {
|
||||
return alignmentStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the given alignment on the reverse strand?
|
||||
* @return True if the alignment is on the reverse strand.
|
||||
*/
|
||||
public boolean isNegativeStrand() {
|
||||
return negativeStrand;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the BWA score of this alignment.
|
||||
|
|
@ -58,7 +84,7 @@ public class BWAAlignment implements Alignment {
|
|||
if( scoreComparison != 0 )
|
||||
return scoreComparison;
|
||||
else
|
||||
return Integer.valueOf(position).compareTo(((BWAAlignment)other).position);
|
||||
return -Integer.valueOf(position).compareTo(((BWAAlignment)other).position);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
|
|
|||
|
|
@ -31,4 +31,12 @@ public class SuffixArray {
|
|||
return sequence.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the suffix array value at a given sequence.
|
||||
* @param index
|
||||
* @return
|
||||
*/
|
||||
public int get( int index ) {
|
||||
return sequence[index];
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue