gatk-3.8/java/src/org/broadinstitute/sting/utils/SWPairwiseAlignment.java

318 lines
13 KiB
Java
Raw Normal View History

/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.Cigar;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
/**
* Created by IntelliJ IDEA.
* User: asivache
* Date: Mar 23, 2009
* Time: 1:54:54 PM
* To change this template use File | Settings | File Templates.
*/
public class SWPairwiseAlignment {
private int alignment_offset; // offset of s2 w/respect to s1
private Cigar alignmentCigar;
private final double w_match;
private final double w_mismatch;
private final double w_open;
private final double w_extend;
private static final int MSTATE = 0;
private static final int ISTATE = 1;
private static final int DSTATE = 2;
// private static double [][] sw = new double[500][500];
// private static int [][] btrack = new int[500][500];
// ************************************************************************
// **** IMPORTANT NOTE: ****
// **** This class assumes that all bytes come from UPPERCASED chars! ****
// ************************************************************************
public SWPairwiseAlignment(byte[] seq1, byte[] seq2, double match, double mismatch, double open, double extend ) {
w_match = match;
w_mismatch = mismatch;
w_open = open;
w_extend = extend;
align(seq1,seq2);
}
public SWPairwiseAlignment(byte[] seq1, byte[] seq2) {
this(seq1,seq2,1.0,-1.0/3.0,-1.0-1.0/3.0,-1.0/3.0); // match=1, mismatch = -1/3, gap=-(1+k/3)
}
public Cigar getCigar() { return alignmentCigar ; }
public int getAlignmentStart2wrt1() { return alignment_offset; }
public void align(final byte[] a, final byte[] b) {
final int n = a.length;
final int m = b.length;
double [] sw = new double[(n+1)*(m+1)];
int [] btrack = new int[(n+1)*(m+1)];
calculateMatrix(a, b, sw, btrack);
calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions)
}
private void calculateMatrix(final byte[] a, final byte[] b, double [] sw, int [] btrack ) {
final int n = a.length+1;
final int m = b.length+1;
// build smith-waterman matrix and keep backtrack info:
for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop
byte a_base = a[i-1]; // letter in a at the current pos
final int row_offset = row_offset_1 + m;
// On the entrance into the loop, row_offset_1 is the (linear) offset
// of the first element of row (i-1) and row_offset is the linear offset of the
// start of row i
for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) {
// data_offset_1 is linearized offset of element [i-1][j-1]
final byte b_base = b[j-1]; // letter in b at the current pos
// in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base);
double step_diag = sw[data_offset_1] + wd(a_base,b_base);
double step_down = 0.0 ;
int kd = 0;
for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) {
// data_offset_k is linearized offset of element [i-k][j]
// in other words, trial = sw[i-k][j]+gap_penalty:
final double trial = sw[data_offset_k]+wk(k);
if ( step_down < trial ) {
step_down=trial;
kd = k;
}
}
double step_right = 0;
int ki = 0;
for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) {
// data_offset is linearized offset of element [i][j-k]
// in other words, step_right=sw[i][j-k]+gap_penalty;
final double trial = sw[data_offset]+wk(k);
if ( step_right < trial ) {
step_right=trial;
ki = k;
}
}
final int data_offset = row_offset + j; // linearized offset of element [i][j]
if ( step_down > step_right ) {
if ( step_down > step_diag ) {
sw[data_offset] = Math.max(0,step_down);
btrack[data_offset] = kd; // positive=vertical
}
else {
sw[data_offset] = Math.max(0,step_diag);
btrack[data_offset] = 0; // 0 = diagonal
}
} else {
// step_down < step_right
if ( step_right > step_diag ) {
sw[data_offset] = Math.max(0,step_right);
btrack[data_offset] = -ki; // negative = horizontal
} else {
sw[data_offset] = Math.max(0,step_diag);
btrack[data_offset] = 0; // 0 = diagonal
}
}
sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
}
// IMPORTANT, IMPORTANT, IMPORTANT:
// note that we update this (secondary) outer loop variable here,
// so that we DO NOT need to update it
// in the for() statement itself.
row_offset_1 = row_offset;
}
// print(sw,a,b);
}
private void calculateCigar(int n, int m, double [] sw, int [] btrack) {
// p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
//PrimitivePair.Int p = new PrimitivePair.Int();
int p1 = 0, p2 = 0;
double maxscore = 0.0;
int segment_length = 0; // length of the segment (continuous matches, insertions or deletions)
// look for largest score. we use >= combined with the traversal direction
// to ensure that if two scores are equal, the one closer to diagonal gets picked
for ( int i = 1, data_offset = m+1+m ; i < n+1 ; i++, data_offset += (m+1) ) {
// data_offset is the offset of [i][m]
if ( sw[data_offset] >= maxscore ) {
p1 = i; p2 = m ; maxscore = sw[data_offset];
}
}
for ( int j = 1, data_offset = n*(m+1)+1 ; j < m+1 ; j++, data_offset++ ) {
// data_offset is the offset of [n][j]
if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) {
p1 = n;
p2 = j ;
// maxscore = sw[n][j];
maxscore = sw[data_offset];
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
}
}
// we will be placing all insertions and deletions into sequence b, so the state are named w/regard
// to that sequence
int state = MSTATE;
List<CigarElement> lce = new ArrayList<CigarElement>(5);
int data_offset = p1*(m+1)+p2; // offset of element [p1][p2]
do {
// int btr = btrack[p1][p2];
int btr = btrack[data_offset];
int step_left = ( btr < 0 ? -btr : 1);
int step_up = ( btr > 0 ? btr : 1 );
int new_state;
if ( btr > 0 ) new_state = DSTATE;
else if ( btr < 0 ) new_state = ISTATE;
else new_state = MSTATE;
int step_length = 1;
// move to next best location in the sw matrix:
switch( new_state ) {
case MSTATE: data_offset -= (m+2); break; // equivalent to p1--; p2--
case ISTATE: data_offset -= step_left; step_length = step_left; break; // equivalent to p2-=step_left;
case DSTATE: data_offset -= (m+1)*step_up; step_length = step_up; break; // equivalent to p1 -= step_up
}
// now let's see if the state actually changed:
if ( new_state == state ) segment_length+=step_length;
else {
// state changed, lets emit previous segment, whatever it was (Insertion Deletion, or (Mis)Match).
lce.add(makeElement(state, segment_length));
segment_length = step_length;
state = new_state;
}
// next condition is equivalent to while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2:
} while ( sw[data_offset] != 0 );
// reinstate last values of p1, p2 we arrived to after matrix traversal:
p1 = data_offset / (m+1);
p2 = data_offset % (m+1);
// post-process the last segment we are still keeping
lce.add(makeElement(state, segment_length + p2));
alignment_offset = p1 - p2;
Collections.reverse(lce);
alignmentCigar = new Cigar(lce);
}
private CigarElement makeElement(int state, int segment_length) {
CigarOperator o = null;
switch(state) {
case MSTATE: o = CigarOperator.M; break;
case ISTATE: o = CigarOperator.I; break;
case DSTATE: o = CigarOperator.D; break;
}
return new CigarElement(segment_length,o);
}
private double wd(byte x, byte y) {
return (x == y ? w_match : w_mismatch);
}
private double wk(int k) {
return w_open+(k-1)*w_extend; // gap
}
private void print(int[][] s) {
for ( int i = 0 ; i < s.length ; i++) {
for ( int j = 0; j < s[i].length ; j++ ) {
System.out.printf(" %4d",s[i][j]);
}
System.out.println();
}
}
private void print(double[][] s) {
for ( int i = 0 ; i < s.length ; i++) {
for ( int j = 0; j < s[i].length ; j++ ) {
System.out.printf(" %4g",s[i][j]);
}
System.out.println();
}
}
private void print(int[][] s, String a, String b) {
System.out.print(" ");
for ( int j = 1 ; j < s[0].length ; j++) System.out.printf(" %4c",b.charAt(j-1)) ;
System.out.println();
for ( int i = 0 ; i < s.length ; i++) {
if ( i > 0 ) System.out.print(a.charAt(i-1));
else System.out.print(' ');
System.out.print(" ");
for ( int j = 0; j < s[i].length ; j++ ) {
System.out.printf(" %4d",s[i][j]);
}
System.out.println();
}
}
private void print(double[][] s, String a, String b) {
System.out.print("");
for ( int j = 1 ; j < s[0].length ; j++) System.out.printf(" %4c",b.charAt(j-1)) ;
System.out.println();
for ( int i = 0 ; i < s.length ; i++) {
if ( i > 0 ) System.out.print(a.charAt(i-1));
else System.out.print(' ');
System.out.print(" ");
for ( int j = 0; j < s[i].length ; j++ ) {
System.out.printf(" %2.1f",s[i][j]);
}
System.out.println();
}
}
}