r494: simpler and better SR gap filling

Still one thing to do: left alignment
This commit is contained in:
Heng Li 2017-10-09 22:02:30 -04:00
parent 1e53610fb4
commit 46fa520db9
2 changed files with 43 additions and 68 deletions

109
align.c
View File

@ -286,6 +286,8 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
int32_t rs1, qs1, re1, qe1; int32_t rs1, qs1, re1, qe1;
int8_t mat[25]; int8_t mat[25];
if (is_sr) assert(!mi->is_hpc); // HPC won't work with SR because with HPC we can't easily tell if there is a gap
r2->cnt = 0; r2->cnt = 0;
if (r->cnt == 0) return; if (r->cnt == 0) return;
ksw_gen_simple_mat(5, mat, opt->a, opt->b); ksw_gen_simple_mat(5, mat, opt->a, opt->b);
@ -358,74 +360,47 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
re1 = rs, qe1 = qs; re1 = rs, qe1 = qs;
assert(qs1 >= 0 && rs1 >= 0); assert(qs1 >= 0 && rs1 >= 0);
if (is_sr) { // in the SR mode, just write M; no gap filling for (i = is_sr? cnt1 - 1 : 1; i < cnt1; ++i) { // gap filling
uint32_t l; if ((a[as1+i].y & (MM_SEED_IGNORE|MM_SEED_TANDEM)) && i != cnt1 - 1) continue;
int32_t score, max_score, max_i; if (is_sr && !mi->is_hpc) {
re = (int32_t)a[as1+cnt1-1].x + 1; re = (int32_t)a[as1 + i].x + 1;
qe = (int32_t)a[as1+cnt1-1].y + 1; qe = (int32_t)a[as1 + i].y + 1;
assert(re > re1 && re - re1 == qe - qe1); } else mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
qseq = &qseq0[rev][qs]; re1 = re, qe1 = qe;
mm_idx_getseq(mi, rid, rs, re, tseq); if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) {
l = re - re1; int j, bw1 = bw;
score = max_score = 0, max_i = -1; if (a[as1+i].y & MM_SEED_LONG_JOIN)
for (i = 0; i < l; ++i) { bw1 = qe - qs > re - rs? qe - qs : re - rs;
if (qseq[i] >= 4 || tseq[i] >= 4) score += -opt->e2; qseq = &qseq0[rev][qs];
else score += qseq[i] == tseq[i]? opt->a : -opt->b; mm_idx_getseq(mi, rid, rs, re, tseq);
if (max_score < score) max_score = score, max_i = i; if (is_sr) { // perform ungapped alignment
else if (score < max_score - opt->zdrop) assert(qe - qs == re - rs);
break; ksw_reset_extz(ez);
} for (j = 0, ez->score = 0; j < qe - qs; ++j) {
assert(max_i >= 0); if (qseq[j] >= 4 || tseq[j] >= 4) ez->score += opt->e2;
if (i == l) { // not droppped else ez->score += qseq[j] == tseq[j]? opt->a : -opt->b;
uint32_t cigar1 = l << 4 | 0; }
mm_append_cigar(r, 1, &cigar1); ez->cigar = ksw_push_cigar(km, &ez->n_cigar, &ez->m_cigar, ez->cigar, 0, qe - qs);
r->p->dp_score += score; } else { // perform normal gapped alignment
re1 = re, qe1 = qe; mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez); // first pass: with approximate Z-drop
} else { // Z-dropped
uint32_t cigar1 = (max_i + 1) << 4 | 0;
int j;
mm_append_cigar(r, 1, &cigar1);
r->p->dp_score += max_score;
qe1 += max_i + 1, re1 += max_i + 1;
for (j = cnt1 - 1; j >= 0; --j)
if ((int32_t)a[as1 + j].x < rs + max_i)
break;
if (cnt1 - (j + 1) >= opt->min_cnt)
mm_split_reg(r, r2, j + 1, qlen, a);
qe = qe1, re = re1;
dropped = 1;
}
} else {
for (i = 1; i < cnt1; ++i) { // gap filling
if ((a[as1+i].y & (MM_SEED_IGNORE|MM_SEED_TANDEM)) && i != cnt1 - 1) continue;
mm_adjust_minier(mi, qseq0, &a[as1 + i], &re, &qe);
re1 = re, qe1 = qe;
if (i == cnt1 - 1 || (a[as1+i].y&MM_SEED_LONG_JOIN) || (qe - qs >= opt->min_ksw_len && re - rs >= opt->min_ksw_len)) {
int bw1 = bw;
if (a[as1+i].y & MM_SEED_LONG_JOIN)
bw1 = qe - qs > re - rs? qe - qs : re - rs;
qseq = &qseq0[rev][qs];
mm_idx_getseq(mi, rid, rs, re, tseq);
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez);
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez);
if (ez->n_cigar > 0)
mm_append_cigar(r, ez->n_cigar, ez->cigar);
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
int j;
for (j = i - 1; j >= 0; --j)
if ((int32_t)a[as1 + j].x < rs + ez->max_t)
break;
dropped = 1;
r->p->dp_score += ez->max;
re1 = rs + (ez->max_t + 1);
qe1 = qs + (ez->max_q + 1);
if (cnt1 - (j + 1) >= opt->min_cnt)
mm_split_reg(r, r2, j + 1, qlen, a);
break;
} else r->p->dp_score += ez->score;
rs = re, qs = qe;
} }
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag, ez); // second pass: lift approximate
if (ez->n_cigar > 0)
mm_append_cigar(r, ez->n_cigar, ez->cigar);
if (ez->zdropped) { // truncated by Z-drop; TODO: sometimes Z-drop kicks in because the next seed placement is wrong. This can be fixed in principle.
for (j = i - 1; j >= 0; --j)
if ((int32_t)a[as1 + j].x < rs + ez->max_t)
break;
dropped = 1;
r->p->dp_score += ez->max;
re1 = rs + (ez->max_t + 1);
qe1 = qs + (ez->max_q + 1);
if (cnt1 - (j + 1) >= opt->min_cnt)
mm_split_reg(r, r2, j + 1, qlen, a);
break;
} else r->p->dp_score += ez->score;
rs = re, qs = qe;
} }
} }

2
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h" #include "mmpriv.h"
#include "getopt.h" #include "getopt.h"
#define MM_VERSION "2.2-r493-dirty" #define MM_VERSION "2.2-r494-dirty"
#ifdef __linux__ #ifdef __linux__
#include <sys/resource.h> #include <sys/resource.h>