From 14477bb48e46c198a181e5391504819229578b24 Mon Sep 17 00:00:00 2001 From: hanna Date: Tue, 22 Sep 2009 19:05:10 +0000 Subject: [PATCH] Unidirectional alignments with mismatches now working. Significant refactoring will be required. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1686 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/alignment/Aligner.java | 44 +----- ...stHarness.java => AlignerTestHarness.java} | 61 +++++++-- .../sting/alignment/bwa/BWAAligner.java | 126 ++++++++++++++++++ .../sting/alignment/bwa/BWAAlignment.java | 33 ++++- .../sting/alignment/bwa/LowerBound.java | 46 +++++-- 5 files changed, 240 insertions(+), 70 deletions(-) rename java/src/org/broadinstitute/sting/alignment/bwa/{PerfectAlignerTestHarness.java => AlignerTestHarness.java} (66%) create mode 100644 java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java diff --git a/java/src/org/broadinstitute/sting/alignment/Aligner.java b/java/src/org/broadinstitute/sting/alignment/Aligner.java index 78161cf24..75649ae11 100644 --- a/java/src/org/broadinstitute/sting/alignment/Aligner.java +++ b/java/src/org/broadinstitute/sting/alignment/Aligner.java @@ -2,58 +2,20 @@ package org.broadinstitute.sting.alignment; import net.sf.samtools.SAMRecord; -import java.io.File; import java.util.List; -import org.broadinstitute.sting.alignment.bwa.*; -import org.broadinstitute.sting.alignment.bwa.bwt.BWT; -import org.broadinstitute.sting.alignment.bwa.bwt.SuffixArray; -import org.broadinstitute.sting.alignment.bwa.bwt.BWTReader; -import org.broadinstitute.sting.alignment.bwa.bwt.SuffixArrayReader; - /** * Create perfect alignments from the read to the genome represented by the given BWT / suffix array. * * @author mhanna * @version 0.1 */ -public class Aligner { +public interface Aligner { /** - * BWT in the forward direction. - */ - private BWT forwardBWT; - - /** - * Suffix array in the forward direction. - */ - private SuffixArray forwardSuffixArray; - - /** - * BWT in the reverse direction. - */ - private BWT reverseBWT; - - /** - * Suffix array in the reverse direction. - */ - private SuffixArray reverseSuffixArray; - - public Aligner( File forwardBWTFile, File forwardSuffixArrayFile, File reverseBWTFile, File reverseSuffixArrayFile ) { - forwardBWT = new BWTReader(forwardBWTFile).read(); - forwardSuffixArray = new SuffixArrayReader(forwardSuffixArrayFile).read(); - - reverseBWT = new BWTReader(reverseBWTFile).read(); - reverseSuffixArray = new SuffixArrayReader(reverseSuffixArrayFile).read(); - } - - /** - * Align the read to the given reference. + * Align the read to the reference. * @param read Read to align. * @return A list of the alignments. */ - public List align( SAMRecord read ) { - List lowerBounds = LowerBound.create(read,reverseBWT); - return null; - } + public List align(SAMRecord read); } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/PerfectAlignerTestHarness.java b/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java similarity index 66% rename from java/src/org/broadinstitute/sting/alignment/bwa/PerfectAlignerTestHarness.java rename to java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java index 0991dba37..fca4b25e9 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/PerfectAlignerTestHarness.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/AlignerTestHarness.java @@ -4,11 +4,18 @@ import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.alignment.bwa.bwt.SuffixArrayReader; import org.broadinstitute.sting.alignment.bwa.bwt.*; +import org.broadinstitute.sting.alignment.Aligner; +import org.broadinstitute.sting.alignment.Alignment; import java.io.File; import java.io.FileNotFoundException; +import java.util.List; +import java.util.ArrayList; +import java.util.Iterator; import net.sf.picard.reference.ReferenceSequence; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileReader; /** * A test harness to ensure that the perfect aligner works. @@ -16,7 +23,7 @@ import net.sf.picard.reference.ReferenceSequence; * @author mhanna * @version 0.1 */ -public class PerfectAlignerTestHarness { +public class AlignerTestHarness { public static final String[] sampleReads = { "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGA", "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTAG", "GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGCGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCAGAT", @@ -32,21 +39,47 @@ public class PerfectAlignerTestHarness { private static SuffixArray suffixArray; public static void main( String argv[] ) throws FileNotFoundException { - if( argv.length != 3 ) { - System.out.println("PerfectAlignerTestHarness "); + if( argv.length != 4 ) { + System.out.println("PerfectAlignerTestHarness "); System.exit(1); } File referenceFile = new File(argv[0]); - IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); - File bwtFile = new File(argv[1]); - BWTReader reader = new BWTReader(bwtFile); - bwt = reader.read(); + File rbwtFile = new File(argv[2]); + File bamFile = new File(argv[3]); + File suffixArrayFile = null; - File suffixArrayFile = new File(argv[2]); - SuffixArrayReader suffixArrayReader = new SuffixArrayReader(suffixArrayFile); - suffixArray = suffixArrayReader.read(); + align(referenceFile,bwtFile,rbwtFile,bamFile); + } + + private static void align(File referenceFile, File bwtFile, File rbwtFile, File bamFile) throws FileNotFoundException { + BWT bwt = new BWTReader(bwtFile).read(); + + Aligner aligner = new BWAAligner(bwtFile,rbwtFile); + int count = 0; + + SAMFileReader reader = new SAMFileReader(bamFile); + reader.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); + + for(SAMRecord read: reader) { + count++; + if( count > 15 ) break; + //if( count != 1 && count != 15 ) continue; + + List alignments = aligner.align(read); + if(alignments.size() > 0 ) + System.out.printf("%s: Aligned read to reference with %d mismatches.%n", read.getReadName(), alignments.get(0).getScore()); + else + System.out.printf("%s: Failed to align read to reference.%n", read.getReadName()); + } + } + + private static void alignPerfect(File referenceFile, File bwtFile, File suffixArrayFile) throws FileNotFoundException + { + IndexedFastaSequenceFile reference = new IndexedFastaSequenceFile(referenceFile); + BWT bwt = new BWTReader(bwtFile).read(); + SuffixArray suffixArray = new SuffixArrayReader(suffixArrayFile).read(); for( String read: sampleReads ) { int alignmentStart = align(read); @@ -59,11 +92,11 @@ public class PerfectAlignerTestHarness { if( subsequence.getBases()[i] != read.charAt(i) ) throw new StingException("Read is not an exact match! Alignment has failed!"); } - System.out.printf("Read %s aligned to position %d%n", read, alignmentStart); - } + System.out.printf("Read %s aligned to position %d%n", read, alignmentStart); + } } - public static int align( String read ) { + private static int align(String read) { int lowerBound = 0, upperBound = bwt.length(); for( int i = read.length()-1; i >= 0; i-- ) { Base base = Base.fromASCII((byte)read.charAt(i)); @@ -75,4 +108,6 @@ public class PerfectAlignerTestHarness { //System.out.printf("Read = %s; final bounds: (%d->%d); suffix array = %d%n",read,lowerBound,upperBound,alignmentStart); return alignmentStart; } + + } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java new file mode 100644 index 000000000..5ebb02163 --- /dev/null +++ b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAligner.java @@ -0,0 +1,126 @@ +package org.broadinstitute.sting.alignment.bwa; + +import org.broadinstitute.sting.alignment.bwa.bwt.BWT; +import org.broadinstitute.sting.alignment.bwa.bwt.BWTReader; +import org.broadinstitute.sting.alignment.bwa.bwt.Base; +import org.broadinstitute.sting.alignment.Alignment; +import org.broadinstitute.sting.alignment.Aligner; +import org.broadinstitute.sting.utils.BaseUtils; + +import java.io.File; +import java.util.List; +import java.util.PriorityQueue; +import java.util.Collections; +import java.util.EnumSet; + +import net.sf.samtools.SAMRecord; +import net.sf.picard.util.SequenceUtil; + +/** + * Create imperfect alignments from the read to the genome represented by the given BWT / suffix array. + * + * @author mhanna + * @version 0.1 + */ +public class BWAAligner implements Aligner { + /** + * BWT in the forward direction. + */ + private BWT forwardBWT; + + /** + * BWT in the reverse direction. + */ + private BWT reverseBWT; + + /** + * Maximum edit distance (-n option from original BWA). + */ + private static final int MAXIMUM_EDIT_DISTANCE = 4; + + /** + * Maximum number of gap opens (-o option from original BWA). + */ + private static final int MAXIMUM_GAP_OPENS = 1; + + /** + * Maximum number of gap extensions (-e option from original BWA). + */ + private static final int MAXIMUM_GAP_EXTENSIONS = -1; + + /** + * Penalty for straight mismatches (-M option from original BWA). + */ + private static final int MISMATCH_PENALTY = 3; + + /** + * Penalty for gap opens (-O option from original BWA). + */ + private static final int GAP_OPEN_PENALTY = 11; + + /** + * Penalty for gap extensions (-E option from original BWA). + */ + private static final int GAP_EXTENSION_PENALTY = 4; + + public BWAAligner( File forwardBWTFile, File reverseBWTFile ) { + forwardBWT = new BWTReader(forwardBWTFile).read(); + reverseBWT = new BWTReader(reverseBWTFile).read(); + } + + public List align( SAMRecord read ) { + byte[] bases = read.getReadBases(); + + List reverseLowerBounds = LowerBound.create(bases,reverseBWT); +// for( int i = 0; i < reverseLowerBounds.size(); i++ ) +// System.out.printf("ReverseBWT: lb[%d] = %s%n",i,reverseLowerBounds.get(i)); + + PriorityQueue alignments = new PriorityQueue(); + + // Create a fictional initial alignment, with the position just off the end of the read, and the limits + // set as the entire BWT. + BWAAlignment initial = new BWAAlignment(); + initial.position = read.getReadLength(); + initial.loBound = 0; + initial.hiBound = forwardBWT.length(); + initial.mismatches = 0; + + alignments.add(initial); + + while(!alignments.isEmpty()) { + BWAAlignment alignment = alignments.remove(); + + // Done with this particular alignment. + if(alignment.position == 0) + return Collections.singletonList(alignment); + + //System.out.printf("Processing alignments; queue size = %d, alignment = %s, bound = %d%n", alignments.size(), alignment, lowerBounds.get(alignment.position-1).value); + + // if z < D(i) then return {} + if( alignment.mismatches > reverseLowerBounds.get(alignment.position-1).value ) + continue; + + if( alignment.mismatches > MAXIMUM_EDIT_DISTANCE ) + continue; + + // For each base in { A, C, G, T } + for(Base base: EnumSet.allOf(Base.class)) { + // Create and initialize a new alignment, given that base as the candidate. + BWAAlignment newAlignment = new BWAAlignment(); + newAlignment.position = alignment.position - 1; + newAlignment.loBound = forwardBWT.counts(base) + forwardBWT.occurrences(base,alignment.loBound-1) + 1; + newAlignment.hiBound = forwardBWT.counts(base) + forwardBWT.occurrences(base,alignment.hiBound); + newAlignment.mismatches = alignment.mismatches; + if( base.toASCII() != read.getReadBases()[newAlignment.position] ) + newAlignment.mismatches++; + + // If this alignment is valid, add it to the list. + if( newAlignment.loBound <= newAlignment.hiBound ) + alignments.add(newAlignment); + } + } + + return Collections.emptyList(); + } + +} diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAlignment.java b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAlignment.java index 7e024f5ce..63e08f404 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/BWAAlignment.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/BWAAlignment.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.alignment.bwa; import org.broadinstitute.sting.alignment.Alignment; +import org.broadinstitute.sting.alignment.bwa.bwt.BWT; +import net.sf.samtools.SAMRecord; /** * An alignment object to be used incrementally as the BWA aligner @@ -10,6 +12,18 @@ import org.broadinstitute.sting.alignment.Alignment; * @version 0.1 */ public class BWAAlignment implements Alignment { + enum State { MATCH, INSERTION, DELETION } + + /** + * Working variable. How many bases have been matched at this point. + */ + protected int position; + + /** + * Working variable. How many mismatches have been encountered at this point. + */ + protected int mismatches; + /** * Working variable. The lower bound of the alignment within the BWT. */ @@ -21,16 +35,16 @@ public class BWAAlignment implements Alignment { protected int hiBound; /** - * Current score for this alignment. + * Indicates the current state of an alignment. Are we in an insertion? Deletion? */ - protected int score; + protected State alignmentState; /** * Gets the BWA score of this alignment. * @return BWA-style scores. 0 is best. */ public int getScore() { - return score; + return mismatches; } /** @@ -38,7 +52,16 @@ public class BWAAlignment implements Alignment { * @param other Other alignment to which to compare. * @return < 0 if this < other, == 0 if this == other, > 0 if this > other */ - public int compareTo( Alignment other ) { - return Integer.valueOf(score).compareTo(other.getScore()); + public int compareTo(Alignment other) { + // If the scores are equal, use the position to disambiguate order. + int scoreComparison = Integer.valueOf(getScore()).compareTo(other.getScore()); + if( scoreComparison != 0 ) + return scoreComparison; + else + return Integer.valueOf(position).compareTo(((BWAAlignment)other).position); + } + + public String toString() { + return String.format("position = %d, mismatches = %d, loBound = %d, hiBound = %d, score = %d", position, mismatches, loBound, hiBound, getScore()); } } diff --git a/java/src/org/broadinstitute/sting/alignment/bwa/LowerBound.java b/java/src/org/broadinstitute/sting/alignment/bwa/LowerBound.java index 776dc2c5b..82e654ce1 100644 --- a/java/src/org/broadinstitute/sting/alignment/bwa/LowerBound.java +++ b/java/src/org/broadinstitute/sting/alignment/bwa/LowerBound.java @@ -1,7 +1,5 @@ package org.broadinstitute.sting.alignment.bwa; -import net.sf.samtools.SAMRecord; - import java.util.List; import java.util.ArrayList; @@ -16,6 +14,21 @@ import org.broadinstitute.sting.alignment.bwa.bwt.BWT; * @version 0.1 */ public class LowerBound { + /** + * Lower bound of the suffix array. + */ + public final int loIndex; + + /** + * Upper bound of the suffix array. + */ + public final int hiIndex; + + /** + * Width of the bwt from loIndex -> hiIndex, inclusive. + */ + public final int width; + /** * The lower bound at the given point. */ @@ -25,29 +38,40 @@ public class LowerBound { * Create a new lower bound with the given value. * @param value Value for the lower bound at this site. */ - private LowerBound(int value) { + private LowerBound(int loIndex, int hiIndex, int value) { + this.loIndex = loIndex; + this.hiIndex = hiIndex; + this.width = hiIndex - loIndex + 1; this.value = value; } /** * Create a non-optimal bound according to the algorithm specified in Figure 3 of the BWA paper. */ - public static List create( SAMRecord read, BWT reverseBWT ) { + public static List create( byte[] bases, BWT bwt ) { List bounds = new ArrayList(); - int loIndex = 0, hiIndex = reverseBWT.length(), mismatches = 0; - for( int i = 0; i < read.getReadBases().length; i++ ) { - Base base = Base.fromASCII(read.getReadBases()[i]); - loIndex = reverseBWT.counts(base) + reverseBWT.occurrences(base,loIndex-1) + 1; - hiIndex = reverseBWT.counts(base) + reverseBWT.occurrences(base,hiIndex); + int loIndex = 0, hiIndex = bwt.length(), mismatches = 0; + for( int i = bases.length-1; i >= 0; i-- ) { + Base base = Base.fromASCII(bases[i]); + loIndex = bwt.counts(base) + bwt.occurrences(base,loIndex-1) + 1; + hiIndex = bwt.counts(base) + bwt.occurrences(base,hiIndex); if( loIndex > hiIndex ) { loIndex = 0; - hiIndex = reverseBWT.length(); + hiIndex = bwt.length(); mismatches++; } - bounds.add(new LowerBound(mismatches)); + bounds.add(0,new LowerBound(loIndex,hiIndex,mismatches)); } return bounds; } + + /** + * Create a string representation of this bound. + * @return String version of this bound. + */ + public String toString() { + return String.format("LowerBound: w = %d, value = %d",width,value); + } }