diff --git a/java/src/org/broadinstitute/sting/alignment/Alignment.java b/java/src/org/broadinstitute/sting/alignment/Alignment.java index 4078a1fd3..7e70abab1 100644 --- a/java/src/org/broadinstitute/sting/alignment/Alignment.java +++ b/java/src/org/broadinstitute/sting/alignment/Alignment.java @@ -7,6 +7,18 @@ package org.broadinstitute.sting.alignment; * @version 0.1 */ public interface Alignment extends Comparable { + /** + * Gets the starting position for the given alignment. + * @return Starting position. + */ + public int getAlignmentStart(); + + /** + * Is the given alignment on the reverse strand? + * @return True if the alignment is on the reverse strand. + */ + public boolean isNegativeStrand(); + /** * Gets the score of this alignment. * @return The score. diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java b/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java index fca4b25e9..afce86bb9 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java @@ -1,21 +1,19 @@ package org.broadinstitute.sting.alignment.bwa; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.alignment.bwa.bwt.SuffixArrayReader; import org.broadinstitute.sting.alignment.bwa.bwt.*; import org.broadinstitute.sting.alignment.Aligner; import org.broadinstitute.sting.alignment.Alignment; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; import java.util.List; -import java.util.ArrayList; -import java.util.Iterator; -import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileReader; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; /** * A test harness to ensure that the perfect aligner works. @@ -24,39 +22,28 @@ import net.sf.samtools.SAMFileReader; * @version 0.1 */ public class AlignerTestHarness { - public static final String[] sampleReads = { "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA", - "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTAG", - "GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGCGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCAGAT", - "GCTTTTCATTCTGACTGCAACGGGCAATATGTATCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA", - "GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAA", - "CTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC", - "TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTACTGAACT", - "TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAAGAGTGTCTGATAGCAGCTTCTGAAC", - "TTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACT", - "CATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTT"}; - private static BWT bwt; private static SuffixArray suffixArray; public static void main( String argv[] ) throws FileNotFoundException { - if( argv.length != 4 ) { - System.out.println("PerfectAlignerTestHarness "); + if( argv.length != 5 ) { + System.out.println("PerfectAlignerTestHarness "); System.exit(1); } File referenceFile = new File(argv[0]); File bwtFile = new File(argv[1]); File rbwtFile = new File(argv[2]); - File bamFile = new File(argv[3]); - File suffixArrayFile = null; + File reverseSuffixArrayFile = new File(argv[3]); + File bamFile = new File(argv[4]); - align(referenceFile,bwtFile,rbwtFile,bamFile); + align(referenceFile,bwtFile,rbwtFile,reverseSuffixArrayFile,bamFile); } - private static void align(File referenceFile, File bwtFile, File rbwtFile, File bamFile) throws FileNotFoundException { + private static void align(File referenceFile, File bwtFile, File rbwtFile, File reverseSuffixArrayFile, File bamFile) throws FileNotFoundException { BWT bwt = new BWTReader(bwtFile).read(); - Aligner aligner = new BWAAligner(bwtFile,rbwtFile); + Aligner aligner = new BWAAligner(bwtFile,rbwtFile,reverseSuffixArrayFile); int count = 0; SAMFileReader reader = new SAMFileReader(bamFile); @@ -64,50 +51,63 @@ public class AlignerTestHarness { for(SAMRecord read: reader) { count++; - if( count > 15 ) break; - //if( count != 1 && count != 15 ) continue; + //if( count > 500 ) break; + //if( count != 39 /*&& count != 15*/ ) continue; + //if( !read.getReadName().endsWith("1507:1636#0") ) + // continue; + + boolean skipRead = false; + + for( CigarElement cigarElement: read.getCigar().getCigarElements() ) { + if( cigarElement.getOperator() != CigarOperator.M ) { + System.out.printf("Skipping read %s because it features indels%n", read.getReadName()); + skipRead = true; + } + } + + if(read.getReadString().indexOf("N") >= 0) { + System.out.printf("Skipping read %s because it contains Ns%n", read.getReadName()); + skipRead = true; + } + + if(skipRead) continue; List alignments = aligner.align(read); - if(alignments.size() > 0 ) - System.out.printf("%s: Aligned read to reference with %d mismatches.%n", read.getReadName(), alignments.get(0).getScore()); - else - System.out.printf("%s: Failed to align read to reference.%n", read.getReadName()); - } - } + if(alignments.size() == 0 ) + throw new StingException(String.format("Unable to align read %s to reference.",read.getReadName())); - private static void alignPerfect(File referenceFile, File bwtFile, File suffixArrayFile) throws FileNotFoundException - { - IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); - BWT bwt = new BWTReader(bwtFile).read(); - SuffixArray suffixArray = new SuffixArrayReader(suffixArrayFile).read(); + //System.out.printf("%s: Aligned read to reference at position %d with %d mismatches.%n", read.getReadName(), alignments.get(0).getAlignmentStart(), alignments.get(0).getScore()); - for( String read: sampleReads ) { - int alignmentStart = align(read); - if( alignmentStart < 0 ) { - System.out.printf("Unable to align read %s%n",read); - continue; + Alignment alignment = alignments.get(0); + if( read.getAlignmentStart() != alignment.getAlignmentStart() ) { + IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); + String expectedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),read.getAlignmentStart(),read.getAlignmentStart()+read.getReadLength()-1).getBases()); + int expectedMismatches = 0; + for( int i = 0; i < read.getReadLength(); i++ ) { + if( read.getReadBases()[i] != expectedRef.charAt(i) ) + expectedMismatches++; + } + + String alignedRef = new String(reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignments.get(0).getAlignmentStart(),alignments.get(0).getAlignmentStart()+read.getReadLength()-1).getBases()); + int actualMismatches = 0; + for( int i = 0; i < read.getReadLength(); i++ ) { + if( read.getReadBases()[i] != expectedRef.charAt(i) ) + actualMismatches++; + } + + if( expectedMismatches != actualMismatches ) { + System.out.printf("read = %s%n", read.getReadString()); + System.out.printf("expected ref = %s%n", expectedRef); + System.out.printf("actual ref = %s%n", alignedRef); + throw new StingException(String.format("Read %s was placed at incorrect location; target alignment = %d; actual alignment = %d%n",read.getReadName(),read.getAlignmentStart(),alignment.getAlignmentStart())); + } } - ReferenceSequence subsequence = reference.getSubsequenceAt(reference.getSequenceDictionary().getSequences().get(0).getSequenceName(),alignmentStart,alignmentStart+read.length()-1); - for( int i = 0; i < subsequence.length(); i++) { - if( subsequence.getBases()[i] != read.charAt(i) ) - throw new StingException("Read is not an exact match! Alignment has failed!"); - } - System.out.printf("Read %s aligned to position %d%n", read, alignmentStart); - } - } - private static int align(String read) { - int lowerBound = 0, upperBound = bwt.length(); - for( int i = read.length()-1; i >= 0; i-- ) { - Base base = Base.fromASCII((byte)read.charAt(i)); - lowerBound = bwt.counts(base) + bwt.occurrences(base,lowerBound-1)+1; - upperBound = bwt.counts(base) + bwt.occurrences(base,upperBound); - if( lowerBound > upperBound ) return -1; + if( count % 1000 == 0 ) + System.out.printf("%d reads examined.%n",count); } - int alignmentStart = suffixArray.sequence[lowerBound]+1; - //System.out.printf("Read = %s; final bounds: (%d->%d); suffix array = %d%n",read,lowerBound,upperBound,alignmentStart); - return alignmentStart; - } + System.out.printf("%d reads examined.%n",count); + } } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java index 5ebb02163..788bb08a3 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java @@ -1,8 +1,6 @@ package org.broadinstitute.sting.alignment.bwa; -import org.broadinstitute.sting.alignment.bwa.bwt.BWT; -import org.broadinstitute.sting.alignment.bwa.bwt.BWTReader; -import org.broadinstitute.sting.alignment.bwa.bwt.Base; +import org.broadinstitute.sting.alignment.bwa.bwt.*; import org.broadinstitute.sting.alignment.Alignment; import org.broadinstitute.sting.alignment.Aligner; import org.broadinstitute.sting.utils.BaseUtils; @@ -14,7 +12,6 @@ import java.util.Collections; import java.util.EnumSet; import net.sf.samtools.SAMRecord; -import net.sf.picard.util.SequenceUtil; /** * Create imperfect alignments from the read to the genome represented by the given BWT / suffix array. @@ -33,6 +30,11 @@ public class BWAAligner implements Aligner { */ private BWT reverseBWT; + /** + * Suffix array in the forward direction. + */ + private SuffixArray reverseSuffixArray; + /** * Maximum edit distance (-n option from original BWA). */ @@ -63,41 +65,62 @@ public class BWAAligner implements Aligner { */ private static final int GAP_EXTENSION_PENALTY = 4; - public BWAAligner( File forwardBWTFile, File reverseBWTFile ) { + public BWAAligner( File forwardBWTFile, File reverseBWTFile, File reverseSuffixArrayFile ) { forwardBWT = new BWTReader(forwardBWTFile).read(); reverseBWT = new BWTReader(reverseBWTFile).read(); + reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile).read(); } public List align( SAMRecord read ) { - byte[] bases = read.getReadBases(); + byte[] forwardBases = read.getReadBases(); + byte[] reverseBases = BaseUtils.simpleReverseComplement(forwardBases); - List reverseLowerBounds = LowerBound.create(bases,reverseBWT); -// for( int i = 0; i < reverseLowerBounds.size(); i++ ) -// System.out.printf("ReverseBWT: lb[%d] = %s%n",i,reverseLowerBounds.get(i)); + List forwardLowerBounds = LowerBound.create(forwardBases,forwardBWT); + //for( int i = 0; i < forwardLowerBounds.size(); i++ ) + // System.out.printf("ForwardBWT: lb[%d] = %s%n",i,forwardLowerBounds.get(i)); + List reverseLowerBounds = LowerBound.create(reverseBases,reverseBWT); + //for( int i = 0; i < reverseLowerBounds.size(); i++ ) + // System.out.printf("ReverseBWT: lb[%d] = %s%n",i,reverseLowerBounds.get(i)); PriorityQueue alignments = new PriorityQueue(); // Create a fictional initial alignment, with the position just off the end of the read, and the limits // set as the entire BWT. BWAAlignment initial = new BWAAlignment(); - initial.position = read.getReadLength(); + initial.negativeStrand = true; + initial.position = 0; initial.loBound = 0; initial.hiBound = forwardBWT.length(); initial.mismatches = 0; + BWAAlignment initialReverse = new BWAAlignment(); + initialReverse.negativeStrand = false; + initialReverse.position = 0; + initialReverse.loBound = 0; + initialReverse.hiBound = reverseBWT.length(); + initialReverse.mismatches = 0; + alignments.add(initial); + //alignments.add(initialReverse); while(!alignments.isEmpty()) { BWAAlignment alignment = alignments.remove(); - // Done with this particular alignment. - if(alignment.position == 0) - return Collections.singletonList(alignment); + byte[] bases = alignment.negativeStrand ? forwardBases : reverseBases; + BWT bwt = alignment.negativeStrand ? reverseBWT : forwardBWT; + List lowerBounds = alignment.negativeStrand ? forwardLowerBounds : reverseLowerBounds; - //System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d%n", alignments.size(), alignment, lowerBounds.get(alignment.position-1).value); + // Done with this particular alignment. + if(alignment.position == read.getReadLength()-1) { + alignment.alignmentStart = reverseBWT.length() - (reverseSuffixArray.get(alignment.loBound)+read.getReadLength()) + 1; + return Collections.singletonList(alignment); + } + + //System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d%n", alignments.size(), alignment, lowerBounds.get(alignment.position).value); // if z < D(i) then return {} - if( alignment.mismatches > reverseLowerBounds.get(alignment.position-1).value ) + int mismatches = MAXIMUM_EDIT_DISTANCE - alignment.mismatches; + if( mismatches < lowerBounds.get(alignment.position+1).value ) continue; if( alignment.mismatches > MAXIMUM_EDIT_DISTANCE ) @@ -107,11 +130,13 @@ public class BWAAligner implements Aligner { for(Base base: EnumSet.allOf(Base.class)) { // Create and initialize a new alignment, given that base as the candidate. BWAAlignment newAlignment = new BWAAlignment(); - newAlignment.position = alignment.position - 1; - newAlignment.loBound = forwardBWT.counts(base) + forwardBWT.occurrences(base,alignment.loBound-1) + 1; - newAlignment.hiBound = forwardBWT.counts(base) + forwardBWT.occurrences(base,alignment.hiBound); + newAlignment.negativeStrand = alignment.negativeStrand; + newAlignment.position = alignment.position + 1; + + newAlignment.loBound = bwt.counts(base) + bwt.occurrences(base,alignment.loBound-1) + 1; + newAlignment.hiBound = bwt.counts(base) + bwt.occurrences(base,alignment.hiBound); newAlignment.mismatches = alignment.mismatches; - if( base.toASCII() != read.getReadBases()[newAlignment.position] ) + if( base.toASCII() != bases[newAlignment.position] ) newAlignment.mismatches++; // If this alignment is valid, add it to the list. diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAlignment.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAlignment.java index 63e08f404..bcb7c0cce 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAlignment.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAlignment.java @@ -14,6 +14,16 @@ import net.sf.samtools.SAMRecord; public class BWAAlignment implements Alignment { enum State { MATCH, INSERTION, DELETION } + /** + * Start of the final alignment. + */ + protected int alignmentStart; + + /** + * Working variable. Is this match being treated as a negative or positive strand? + */ + protected boolean negativeStrand; + /** * Working variable. How many bases have been matched at this point. */ @@ -37,7 +47,23 @@ public class BWAAlignment implements Alignment { /** * Indicates the current state of an alignment. Are we in an insertion? Deletion? */ - protected State alignmentState; + protected State alignmentState; + + /** + * Gets the starting position for the given alignment. + * @return Starting position. + */ + public int getAlignmentStart() { + return alignmentStart; + } + + /** + * Is the given alignment on the reverse strand? + * @return True if the alignment is on the reverse strand. + */ + public boolean isNegativeStrand() { + return negativeStrand; + } /** * Gets the BWA score of this alignment. @@ -58,7 +84,7 @@ public class BWAAlignment implements Alignment { if( scoreComparison != 0 ) return scoreComparison; else - return Integer.valueOf(position).compareTo(((BWAAlignment)other).position); + return -Integer.valueOf(position).compareTo(((BWAAlignment)other).position); } public String toString() { diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArray.java b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArray.java index 7451fc894..7af37a1d9 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArray.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/bwt/SuffixArray.java @@ -31,4 +31,12 @@ public class SuffixArray { return sequence.length; } + /** + * Get the suffix array value at a given sequence. + * @param index + * @return + */ + public int get( int index ) { + return sequence[index]; + } }