about 10% improvement in SW alignment (and hence IndelRealigner!) speed by using c-style linearized array representation for matrices instead of java 2D arrays...
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4751 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b03ac61e9d
commit
8ffea42b75
|
|
@ -52,6 +52,9 @@ public class SWPairwiseAlignment {
|
||||||
private static final int ISTATE = 1;
|
private static final int ISTATE = 1;
|
||||||
private static final int DSTATE = 2;
|
private static final int DSTATE = 2;
|
||||||
|
|
||||||
|
// private static double [][] sw = new double[500][500];
|
||||||
|
// private static int [][] btrack = new int[500][500];
|
||||||
|
|
||||||
// ************************************************************************
|
// ************************************************************************
|
||||||
// **** IMPORTANT NOTE: ****
|
// **** IMPORTANT NOTE: ****
|
||||||
// **** This class assumes that all bytes come from UPPERCASED chars! ****
|
// **** This class assumes that all bytes come from UPPERCASED chars! ****
|
||||||
|
|
@ -75,69 +78,93 @@ public class SWPairwiseAlignment {
|
||||||
public void align(final byte[] a, final byte[] b) {
|
public void align(final byte[] a, final byte[] b) {
|
||||||
final int n = a.length;
|
final int n = a.length;
|
||||||
final int m = b.length;
|
final int m = b.length;
|
||||||
double [][] sw = new double[n+1][m+1];
|
double [] sw = new double[(n+1)*(m+1)];
|
||||||
int [][] btrack = new int[n+1][m+1];
|
int [] btrack = new int[(n+1)*(m+1)];
|
||||||
calculateMatrix(a, b, sw, btrack);
|
calculateMatrix(a, b, sw, btrack);
|
||||||
calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions)
|
calculateCigar(n, m, sw, btrack); // length of the segment (continuous matches, insertions or deletions)
|
||||||
}
|
}
|
||||||
|
|
||||||
private void calculateMatrix(final byte[] a, final byte[] b, double [][] sw, int [][] btrack ) {
|
private void calculateMatrix(final byte[] a, final byte[] b, double [] sw, int [] btrack ) {
|
||||||
final int n = a.length;
|
final int n = a.length+1;
|
||||||
final int m = b.length;
|
final int m = b.length+1;
|
||||||
|
|
||||||
// build smith-waterman matrix and keep backtrack info:
|
// build smith-waterman matrix and keep backtrack info:
|
||||||
for ( int i = 1 ; i < n+1 ; i++ ) {
|
for ( int i = 1, row_offset_1 = 0 ; i < n ; i++ ) { // we do NOT update row_offset_1 here, see comment at the end of this outer loop
|
||||||
byte a_base = a[i-1]; // letter in a at the current pos
|
byte a_base = a[i-1]; // letter in a at the current pos
|
||||||
for ( int j = 1 ; j < m+1 ; j++ ) {
|
|
||||||
|
final int row_offset = row_offset_1 + m;
|
||||||
|
|
||||||
|
// On the entrance into the loop, row_offset_1 is the (linear) offset
|
||||||
|
// of the first element of row (i-1) and row_offset is the linear offset of the
|
||||||
|
// start of row i
|
||||||
|
|
||||||
|
for ( int j = 1, data_offset_1 = row_offset_1 ; j < m ; j++, data_offset_1++ ) {
|
||||||
|
|
||||||
|
// data_offset_1 is linearized offset of element [i-1][j-1]
|
||||||
|
|
||||||
final byte b_base = b[j-1]; // letter in b at the current pos
|
final byte b_base = b[j-1]; // letter in b at the current pos
|
||||||
double step_diag = sw[i-1][j-1] + wd(a_base,b_base);
|
|
||||||
|
// in other words, step_diag = sw[i-1][j-1] + wd(a_base,b_base);
|
||||||
|
double step_diag = sw[data_offset_1] + wd(a_base,b_base);
|
||||||
double step_down = 0.0 ;
|
double step_down = 0.0 ;
|
||||||
int kd = 0;
|
int kd = 0;
|
||||||
for ( int k = 1 ; k < i ; k++ ) {
|
for ( int k = 1, data_offset_k = data_offset_1+1 ; k < i ; k++, data_offset_k -= m ) {
|
||||||
final double gap_penalty = wk(k);
|
// data_offset_k is linearized offset of element [i-k][j]
|
||||||
if ( step_down < sw[i-k][j]+gap_penalty ) {
|
// in other words, trial = sw[i-k][j]+gap_penalty:
|
||||||
step_down=sw[i-k][j]+gap_penalty;
|
final double trial = sw[data_offset_k]+wk(k);
|
||||||
|
if ( step_down < trial ) {
|
||||||
|
step_down=trial;
|
||||||
kd = k;
|
kd = k;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double step_right = 0;
|
double step_right = 0;
|
||||||
int ki = 0;
|
int ki = 0;
|
||||||
for ( int k = 1 ; k < j ; k++ ) {
|
for ( int k = 1, data_offset = row_offset+j-1 ; k < j ; k++, data_offset-- ) {
|
||||||
final double gap_penalty = wk(k);
|
// data_offset is linearized offset of element [i][j-k]
|
||||||
if ( step_right < sw[i][j-k]+gap_penalty ) {
|
// in other words, step_right=sw[i][j-k]+gap_penalty;
|
||||||
step_right=sw[i][j-k]+gap_penalty;
|
final double trial = sw[data_offset]+wk(k);
|
||||||
|
if ( step_right < trial ) {
|
||||||
|
step_right=trial;
|
||||||
ki = k;
|
ki = k;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final int data_offset = row_offset + j; // linearized offset of element [i][j]
|
||||||
|
|
||||||
if ( step_down > step_right ) {
|
if ( step_down > step_right ) {
|
||||||
if ( step_down > step_diag ) {
|
if ( step_down > step_diag ) {
|
||||||
sw[i][j] = Math.max(0,step_down);
|
sw[data_offset] = Math.max(0,step_down);
|
||||||
btrack[i][j] = kd; // positive=vertical
|
btrack[data_offset] = kd; // positive=vertical
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
sw[i][j] = Math.max(0,step_diag);
|
sw[data_offset] = Math.max(0,step_diag);
|
||||||
btrack[i][j] = 0; // 0 = diagonal
|
btrack[data_offset] = 0; // 0 = diagonal
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// step_down < step_right
|
// step_down < step_right
|
||||||
if ( step_right > step_diag ) {
|
if ( step_right > step_diag ) {
|
||||||
sw[i][j] = Math.max(0,step_right);
|
sw[data_offset] = Math.max(0,step_right);
|
||||||
btrack[i][j] = -ki; // negative = horizontal
|
btrack[data_offset] = -ki; // negative = horizontal
|
||||||
} else {
|
} else {
|
||||||
sw[i][j] = Math.max(0,step_diag);
|
sw[data_offset] = Math.max(0,step_diag);
|
||||||
btrack[i][j] = 0; // 0 = diagonal
|
btrack[data_offset] = 0; // 0 = diagonal
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sw[i][j] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
|
sw[data_offset] = Math.max(0, Math.max(step_diag,Math.max(step_down,step_right)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IMPORTANT, IMPORTANT, IMPORTANT:
|
||||||
|
// note that we update this (secondary) outer loop variable here,
|
||||||
|
// so that we DO NOT need to update it
|
||||||
|
// in the for() statement itself.
|
||||||
|
row_offset_1 = row_offset;
|
||||||
}
|
}
|
||||||
// print(sw,a,b);
|
// print(sw,a,b);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void calculateCigar(int n, int m, double [][] sw, int [][] btrack) {
|
private void calculateCigar(int n, int m, double [] sw, int [] btrack) {
|
||||||
// p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
|
// p holds the position we start backtracking from; we will be assembling a cigar in the backwards order
|
||||||
//PrimitivePair.Int p = new PrimitivePair.Int();
|
//PrimitivePair.Int p = new PrimitivePair.Int();
|
||||||
int p1 = 0, p2 = 0;
|
int p1 = 0, p2 = 0;
|
||||||
|
|
@ -147,17 +174,20 @@ public class SWPairwiseAlignment {
|
||||||
|
|
||||||
// look for largest score. we use >= combined with the traversal direction
|
// look for largest score. we use >= combined with the traversal direction
|
||||||
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
// to ensure that if two scores are equal, the one closer to diagonal gets picked
|
||||||
for ( int i = 1 ; i < n+1 ; i++ ) {
|
for ( int i = 1, data_offset = m+1+m ; i < n+1 ; i++, data_offset += (m+1) ) {
|
||||||
if ( sw[i][m] >= maxscore ) {
|
// data_offset is the offset of [i][m]
|
||||||
p1 = i; p2 = m ; maxscore = sw[i][m];
|
if ( sw[data_offset] >= maxscore ) {
|
||||||
|
p1 = i; p2 = m ; maxscore = sw[data_offset];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( int j = 1 ; j < m+1 ; j++ ) {
|
for ( int j = 1, data_offset = n*(m+1)+1 ; j < m+1 ; j++, data_offset++ ) {
|
||||||
if ( sw[n][j] > maxscore || sw[n][j] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) {
|
// data_offset is the offset of [n][j]
|
||||||
|
if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(n-j) < Math.abs(p1 - p2)) {
|
||||||
p1 = n;
|
p1 = n;
|
||||||
p2 = j ;
|
p2 = j ;
|
||||||
maxscore = sw[n][j];
|
// maxscore = sw[n][j];
|
||||||
|
maxscore = sw[data_offset];
|
||||||
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
|
segment_length = m - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -169,8 +199,11 @@ public class SWPairwiseAlignment {
|
||||||
int state = MSTATE;
|
int state = MSTATE;
|
||||||
List<CigarElement> lce = new ArrayList<CigarElement>(5);
|
List<CigarElement> lce = new ArrayList<CigarElement>(5);
|
||||||
|
|
||||||
|
int data_offset = p1*(m+1)+p2; // offset of element [p1][p2]
|
||||||
|
|
||||||
do {
|
do {
|
||||||
int btr = btrack[p1][p2];
|
// int btr = btrack[p1][p2];
|
||||||
|
int btr = btrack[data_offset];
|
||||||
int step_left = ( btr < 0 ? -btr : 1);
|
int step_left = ( btr < 0 ? -btr : 1);
|
||||||
int step_up = ( btr > 0 ? btr : 1 );
|
int step_up = ( btr > 0 ? btr : 1 );
|
||||||
|
|
||||||
|
|
@ -183,9 +216,9 @@ public class SWPairwiseAlignment {
|
||||||
|
|
||||||
// move to next best location in the sw matrix:
|
// move to next best location in the sw matrix:
|
||||||
switch( new_state ) {
|
switch( new_state ) {
|
||||||
case MSTATE: p1--; p2--; break;
|
case MSTATE: data_offset -= (m+2); break; // equivalent to p1--; p2--
|
||||||
case ISTATE: p2-=step_left; step_length = step_left; break;
|
case ISTATE: data_offset -= step_left; step_length = step_left; break; // equivalent to p2-=step_left;
|
||||||
case DSTATE: p1-=step_up; step_length = step_up; break;
|
case DSTATE: data_offset -= (m+1)*step_up; step_length = step_up; break; // equivalent to p1 -= step_up
|
||||||
}
|
}
|
||||||
|
|
||||||
// now let's see if the state actually changed:
|
// now let's see if the state actually changed:
|
||||||
|
|
@ -196,8 +229,12 @@ public class SWPairwiseAlignment {
|
||||||
segment_length = step_length;
|
segment_length = step_length;
|
||||||
state = new_state;
|
state = new_state;
|
||||||
}
|
}
|
||||||
} while ( sw[p1][p2] != 0 );
|
// next condition is equivalent to while ( sw[p1][p2] != 0 ) (with modified p1 and/or p2:
|
||||||
|
} while ( sw[data_offset] != 0 );
|
||||||
|
|
||||||
|
// reinstate last values of p1, p2 we arrived to after matrix traversal:
|
||||||
|
p1 = data_offset / (m+1);
|
||||||
|
p2 = data_offset % (m+1);
|
||||||
// post-process the last segment we are still keeping
|
// post-process the last segment we are still keeping
|
||||||
lce.add(makeElement(state, segment_length + p2));
|
lce.add(makeElement(state, segment_length + p2));
|
||||||
alignment_offset = p1 - p2;
|
alignment_offset = p1 - p2;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue