From a17ed3cbf10d744d120ff6fdf595cfda59d52078 Mon Sep 17 00:00:00 2001 From: asivache Date: Mon, 16 Mar 2009 21:59:26 +0000 Subject: [PATCH] this class really computes (and keeps) a gapless pairwise alignment between the two sequences, ILT-style git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@72 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/indels/PairwiseAlignment.java | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100755 playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java diff --git a/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java b/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java new file mode 100755 index 000000000..87ceb3dee --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/indels/PairwiseAlignment.java @@ -0,0 +1,162 @@ +package org.broadinstitute.sting.indels; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class PairwiseAlignment { + private static final int IMPOSSIBLE = 1000000000; + private String s1; + private String s2; + private int i1; // (external) id of the first sequence + private int i2; // (external) id of the second sequence + private int alignment_offset; // offset of s2 w/respect to s1 + private int best_mm; // mismatch count + private int next_mm; // next-best mismatch count + + /** Initializes the alignment with pair of sequences (that will be immediately aligned) and + * stores their specified external ids id1, id2. + * @param is1 first nucleotide sequence (pre-indexed) + * @param is2 second nucleotide sequence (pre-indexed) + * @param id1 external id of the first sequence + * @param id2 external id of the second sequence + */ + public PairwiseAlignment(IndexedSequence is1, IndexedSequence is2, int id1, int id2 ) { + s1 = new String(is1.getSequence()); + s2 = new String(is2.getSequence()); + i1 = id1; + i2 = id2; + best_mm = IMPOSSIBLE; + next_mm = IMPOSSIBLE; + align(is1,is2); + } + + /** Initializes the alignment with pair of sequences (that will be immediately aligned) and + * sets their external ids to -1. Such un-annotated pairwise alignment can not be added to MultipleAlignment. + * + */ + public PairwiseAlignment(IndexedSequence is1, IndexedSequence is2) { + this(is1,is2,-1,-1); + } + + /** + * Returns offset of sequence 2 with respect to sequence 1 in the best alignment + * @return positive offset if s2 is shifted right (starts later) wrt s1, or negative offset + * if s2 is shifted left (starts earlier) wrt s1 + */ + public int getBestOffset2wrt1() { return alignment_offset; } + public String getSequence1() { return s1; } + public String getSequence2() { return s2; } + public int id1() { return i1;} + public int id2() { return i2;} + + /** Returns mismatch count in the best alignment found. + * + * @return count of mismatches or impossibly large number of no mismatches were found + */ + public int getBestMMCount() { return best_mm; } + + /** Returns the number of mismatches in the next-best alignment found + * + * @return next-best count of mismatches or impossibly large number if at most one alignment + * was ever found (that one would make the best then) + */ + public int getNextBestMMCount() { return next_mm; } + + /** Returns the length of the overlapping region of sequences s1 and s2 in the best alignment found, or -1 if + * sequences do not align. + * + * @return overlap size; can not be smaller than the size of the kmer used in IndexedSequence arguments the + * alignment was built from + */ + public int getOverlap() { + if ( ! alignmentExists() ) return -1; + if ( alignment_offset >= 0 ) { + return Math.min(s1.length()-alignment_offset, s2.length()); + } else { + return Math.min(s2.length()+alignment_offset, s1.length()); + } + } + + /** Returns true if at least one alignment, no matter how bad, was found between the two sequences + * (i.e. the sequences have at least one kmer in common). + */ + public boolean alignmentExists() { return best_mm < IMPOSSIBLE; } + + public void align(IndexedSequence is1, IndexedSequence is2) { + + Set offsets = new HashSet() ; // possible offsets of s2 wrt s1 as suggested by matching kmers + for ( Map.Entry> e : is1 ) { // for each kmer in s1 + List kmer_offsets_2 = is2.getOffsets(e.getKey()); + if ( kmer_offsets_2 == null ) continue; // uh-oh, kmer is not found in the other sequence + for ( Integer i1 : e.getValue() ) { + for ( Integer i2 : kmer_offsets_2 ) { + offsets.add(i1-i2); // offset of seq 2 wrt seq1 as suggested by the currently inspected occurences of the same kmer e.getKey() in both sequences + } + } + } + // we have now a collection of distinct s1-s2 offsets seeded by matching kmers. + // lets extend these kmer matches and count mismatches: + + for ( Integer trial_offset : offsets ) { + int mm_cnt = countMismatches(is1.getSequence(), is2.getSequence(), trial_offset,next_mm); + if ( mm_cnt < best_mm ) { + alignment_offset = trial_offset; + next_mm = best_mm; + best_mm = mm_cnt; + } else { + if ( mm_cnt < next_mm ) next_mm = mm_cnt; + } + } + } + + public static int countMismatches(String seq1, String seq2, int offset2wrt1) { + int pos1 = ( offset2wrt1 >= 0 ? offset2wrt1 : 0 ); + int pos2 = ( offset2wrt1 >= 0 ? 0 : -offset2wrt1 ); + int cnt = 0; + while ( pos1 < seq1.length() && pos2 < seq2.length() ) { + if ( seq1.charAt(pos1++) == seq2.charAt(pos2++) ) continue; + cnt++; // found mismatch + } + return cnt; + } + + public static int countMismatches(String seq1, String seq2, int offset2wrt1, int maxerr) { + int pos1 = ( offset2wrt1 >= 0 ? offset2wrt1 : 0 ); + int pos2 = ( offset2wrt1 >= 0 ? 0 : -offset2wrt1 ); + int cnt = 0; + while ( pos1 < seq1.length() && pos2 < seq2.length() && cnt < maxerr ) { + if ( seq1.charAt(pos1++) == seq2.charAt(pos2++) ) continue; + cnt++; // found mismatch + } + return cnt; + } + + /** Returns a (multiline) string that represents the alignment visually: the sequences are appropriately + * shifted and ready for printout; the pairwise alignment is followed by a stats line + */ + public String toString() { + StringBuffer b = new StringBuffer(); + int skip1 = ( alignment_offset >= 0 ? 0 : -alignment_offset ); + int skip2 = ( alignment_offset >=0 ? alignment_offset : 0 ); + for ( int k = 0 ; k < skip1 ; k++ ) b.append(' '); + b.append(s1); + b.append('\n'); + for ( int k = 0 ; k < skip2 ; k++ ) b.append(' '); + b.append(s2); + b.append('\n'); + b.append(best_mm+" mismatches, "+ next_mm + " next best, " + getOverlap() + " overlapping bases, distance=" + distance() + "\n"); + return b.toString(); + } + + public double distance() { + int L = getOverlap(); + if ( L <=0 ) return 1e100; + double l = ( best_mm==0? 1.0 : (double)best_mm + Math.sqrt((double)best_mm) ); + return ( l / (double)L ); + } + +} + +