r229: a new way to prevent out-of-band backtrack
This commit is contained in:
parent
fa99d28d34
commit
5934d68772
1
Makefile
1
Makefile
|
|
@ -51,6 +51,7 @@ index.o: kthread.h bseq.h minimap.h mmpriv.h kvec.h kalloc.h khash.h
|
|||
kalloc.o: kalloc.h
|
||||
ksw2_extd2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_extz2_sse.o: ksw2.h kalloc.h
|
||||
ksw2_ll_sse.o: ksw2.h kalloc.h
|
||||
main.o: bseq.h minimap.h mmpriv.h
|
||||
map.o: kthread.h kvec.h kalloc.h sdust.h mmpriv.h minimap.h bseq.h
|
||||
misc.o: minimap.h ksort.h
|
||||
|
|
|
|||
6
align.c
6
align.c
|
|
@ -126,6 +126,12 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) //
|
|||
|
||||
static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint8_t *qseq, int tlen, const uint8_t *tseq, const int8_t *mat, int w, int flag, ksw_extz_t *ez)
|
||||
{
|
||||
if (mm_dbg_flag & MM_DBG_PRINT_ALN_SEQ) {
|
||||
int i;
|
||||
fprintf(stderr, "===> q=(%d,%d), e=(%d,%d), bw=%d, flag=%d, zdrop=%d <===\n", opt->q, opt->q2, opt->e, opt->e2, w, flag, opt->zdrop);
|
||||
for (i = 0; i < tlen; ++i) fputc("ACGTN"[tseq[i]], stderr); fputc('\n', stderr);
|
||||
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr);
|
||||
}
|
||||
if (opt->q == opt->q2 && opt->e == opt->e2)
|
||||
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez);
|
||||
else
|
||||
|
|
|
|||
22
ksw2.h
22
ksw2.h
|
|
@ -39,8 +39,8 @@ typedef struct {
|
|||
* @param mat m*m scoring mattrix in one-dimension array
|
||||
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
|
||||
* @param gape gap extension penalty
|
||||
* @param w band width
|
||||
* @param zdrop off-diagonal drop-off to stop extension (positive)
|
||||
* @param w band width (<0 to disable)
|
||||
* @param zdrop off-diagonal drop-off to stop extension (positive; <0 to disable)
|
||||
* @param flag flag (see KSW_EZ_* macros)
|
||||
* @param ez (out) scores and cigar
|
||||
*/
|
||||
|
|
@ -53,6 +53,8 @@ void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t
|
|||
void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
|
||||
int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
|
||||
|
||||
void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
|
||||
|
||||
/**
|
||||
* Global alignment
|
||||
*
|
||||
|
|
@ -104,16 +106,26 @@ static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uin
|
|||
// bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
|
||||
// bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
|
||||
// bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
|
||||
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t *p, const int *off, int n_col, int i0, int j0, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
|
||||
static inline void ksw_backtrack(void *km, int is_rot, int is_rev, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
|
||||
{ // p[] - lower 3 bits: which type gets the max; bit
|
||||
int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
|
||||
uint32_t *cigar = *cigar_, tmp;
|
||||
while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check
|
||||
if (is_rot) r = i + j, tmp = p[r * n_col + i - off[r]];
|
||||
else tmp = p[i * n_col + j - off[i]];
|
||||
int force_state = -1;
|
||||
if (is_rot) {
|
||||
r = i + j;
|
||||
if (i < off[r]) force_state = 2;
|
||||
if (off_end && i > off_end[r]) force_state = 1;
|
||||
tmp = force_state < 0? p[r * n_col + i - off[r]] : 0;
|
||||
} else {
|
||||
if (j < off[i]) force_state = 2;
|
||||
if (off_end && j > off_end[i]) force_state = 1;
|
||||
tmp = force_state < 0? p[i * n_col + j - off[i]] : 0;
|
||||
}
|
||||
if (state == 0) state = tmp & 7; // if requesting the H state, find state one maximizes it.
|
||||
else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
|
||||
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
|
||||
if (force_state >= 0) state = force_state;
|
||||
if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
|
||||
else if (state == 1 || state == 3) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
|
||||
else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
a2= _mm_sub_epi8(a2, tmp); \
|
||||
b2= _mm_sub_epi8(b2, tmp);
|
||||
|
||||
int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc, long_thres, long_diff;
|
||||
int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc, long_thres, long_diff;
|
||||
int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
|
||||
int32_t *H = 0, H0 = 0, last_H0_t = 0;
|
||||
uint8_t *qr, *sf, *mem, *mem2 = 0;
|
||||
|
|
@ -96,7 +96,8 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (with_cigar) {
|
||||
mem2 = (uint8_t*)kmalloc(km, ((qlen + tlen - 1) * n_col_ + 1) * 16);
|
||||
p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
|
||||
off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int));
|
||||
off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
|
||||
off_end = off + qlen + tlen - 1;
|
||||
}
|
||||
|
||||
for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
|
||||
|
|
@ -105,7 +106,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
|
||||
int st = 0, en = tlen - 1, st0, en0, st_, en_;
|
||||
int8_t x1, x21, v1;
|
||||
uint8_t *qrr = qr + (qlen - 1 - r), p_en0 = 0;
|
||||
uint8_t *qrr = qr + (qlen - 1 - r);
|
||||
int8_t *u8 = (int8_t*)u, *v8 = (int8_t*)v, *x8 = (int8_t*)x, *x28 = (int8_t*)x2;
|
||||
__m128i x1_, x21_, v1_;
|
||||
// find the boundaries
|
||||
|
|
@ -199,18 +200,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
}
|
||||
} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
|
||||
__m128i *pr = p + r * n_col_ - st_;
|
||||
off[r] = st;
|
||||
if (en0 < r && en0 < tlen - 1) { // to avoid backtracking out of the band; this assumes a fixed band
|
||||
int8_t a, a2, z = ((uint8_t*)s)[en0];
|
||||
a = x8[en0-1] + v8[en0-1];
|
||||
p_en0 = a > z? 1 : 0;
|
||||
z = a > z? a : z;
|
||||
p_en0 |= a - (z - q) > 0? 1<<4 : 0;
|
||||
a2 = x28[en0-1] + v8[en0-1];
|
||||
p_en0 = a2 > z? 3 : p_en0;
|
||||
z = a2 > z? a2 : z;
|
||||
p_en0 |= a2 - (z - q2) > 0? 1<<6 : 0;
|
||||
}
|
||||
off[r] = st, off_end[r] = en;
|
||||
for (t = st_; t <= en_; ++t) {
|
||||
__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
|
||||
__dp_code_block1;
|
||||
|
|
@ -257,18 +247,7 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
}
|
||||
} else { // gap right-alignment
|
||||
__m128i *pr = p + r * n_col_ - st_;
|
||||
off[r] = st;
|
||||
if (en0 < r && en0 < tlen - 1) { // to avoid backtracking out of the band; this assumes a fixed band
|
||||
int8_t a, a2, z = ((uint8_t*)s)[en0];
|
||||
a = x8[en0-1] + v8[en0-1];
|
||||
p_en0 = a >= z? 1 : 0;
|
||||
z = a >= z? a : z;
|
||||
p_en0 |= a - (z - q) >= 0? 1<<4 : 0;
|
||||
a2 = x28[en0-1] + v8[en0-1];
|
||||
p_en0 = a2 >= z? 3 : p_en0;
|
||||
z = a2 >= z? a2 : z;
|
||||
p_en0 |= a2 - (z - q2) >= 0? 1<<6 : 0;
|
||||
}
|
||||
off[r] = st, off_end[r] = en;
|
||||
for (t = st_; t <= en_; ++t) {
|
||||
__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
|
||||
__dp_code_block1;
|
||||
|
|
@ -314,7 +293,6 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
_mm_store_si128(&pr[t], d);
|
||||
}
|
||||
}
|
||||
if (with_cigar && en0 < r && en0 < tlen - 1) ((uint8_t*)(p + r * n_col_))[en0 - st] = p_en0;
|
||||
if (!approx_max) { // find the exact max with a 32-bit score array
|
||||
int32_t max_H, max_t;
|
||||
// compute H[], max_H and max_t
|
||||
|
|
@ -384,9 +362,9 @@ void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (with_cigar) { // backtrack
|
||||
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
|
||||
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
else if (ez->max_t >= 0 && ez->max_q >= 0)
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
kfree(km, mem2); kfree(km, off);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
a = _mm_sub_epi8(a, z); \
|
||||
b = _mm_sub_epi8(b, z);
|
||||
|
||||
int r, t, qe = q + e, n_col_, *off = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc;
|
||||
int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc;
|
||||
int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
|
||||
int32_t *H = 0, H0 = 0, last_H0_t = 0;
|
||||
uint8_t *qr, *sf, *mem, *mem2 = 0;
|
||||
|
|
@ -76,7 +76,8 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (with_cigar) {
|
||||
mem2 = (uint8_t*)kmalloc(km, ((qlen + tlen - 1) * n_col_ + 1) * 16);
|
||||
p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
|
||||
off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int));
|
||||
off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
|
||||
off_end = off + qlen + tlen - 1;
|
||||
}
|
||||
|
||||
for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
|
||||
|
|
@ -85,7 +86,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
|
||||
int st = 0, en = tlen - 1, st0, en0, st_, en_;
|
||||
int8_t x1, v1;
|
||||
uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t*)u, *v8 = (uint8_t*)v, *x8 = (uint8_t*)x, p_en0 = 0;
|
||||
uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t*)u, *v8 = (uint8_t*)v;
|
||||
__m128i x1_, v1_;
|
||||
// find the boundaries
|
||||
if (st < r - qlen + 1) st = r - qlen + 1;
|
||||
|
|
@ -152,14 +153,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
}
|
||||
} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
|
||||
__m128i *pr = p + r * n_col_ - st_;
|
||||
off[r] = st;
|
||||
if (en0 < r && en0 < tlen - 1) { // to avoid backtracking out of the band; this assumes a fixed band
|
||||
int8_t a, z = ((uint8_t*)s)[en0] + 2 * qe;
|
||||
a = x8[en0-1] + v8[en0-1];
|
||||
p_en0 = a > z? 1 : 0;
|
||||
z = a > z? a : z;
|
||||
p_en0 |= a - (z - q) > 0? 0x08 : 0;
|
||||
}
|
||||
off[r] = st, off_end[r] = en;
|
||||
for (t = st_; t <= en_; ++t) {
|
||||
__m128i d, z, a, b, xt1, vt1, ut, tmp;
|
||||
__dp_code_block1;
|
||||
|
|
@ -185,14 +179,7 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
}
|
||||
} else { // gap right-alignment
|
||||
__m128i *pr = p + r * n_col_ - st_;
|
||||
off[r] = st;
|
||||
if (en0 < r && en0 < tlen - 1) {
|
||||
int8_t a, z = ((uint8_t*)s)[en0] + 2 * qe;
|
||||
a = x8[en0-1] + v8[en0-1];
|
||||
p_en0 = a >= z? 1 : 0;
|
||||
z = a >= z? a : z;
|
||||
p_en0 |= a - (z - q) >= 0? 0x08 : 0;
|
||||
}
|
||||
off[r] = st, off_end[r] = en;
|
||||
for (t = st_; t <= en_; ++t) {
|
||||
__m128i d, z, a, b, xt1, vt1, ut, tmp;
|
||||
__dp_code_block1;
|
||||
|
|
@ -217,7 +204,6 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
_mm_store_si128(&pr[t], d);
|
||||
}
|
||||
}
|
||||
if (with_cigar && en0 < r && en0 < tlen - 1) ((uint8_t*)(p + r * n_col_))[en0 - st] = p_en0;
|
||||
if (!approx_max) { // find the exact max with a 32-bit score array
|
||||
int32_t max_H, max_t;
|
||||
// compute H[], max_H and max_t
|
||||
|
|
@ -289,9 +275,9 @@ void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
if (with_cigar) { // backtrack
|
||||
int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
|
||||
if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY))
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
else if (ez->max_t >= 0 && ez->max_q >= 0)
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
ksw_backtrack(km, 1, rev_cigar, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
|
||||
kfree(km, mem2); kfree(km, off);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
4
main.c
4
main.c
|
|
@ -8,7 +8,7 @@
|
|||
#include "minimap.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
#define MM_VERSION "2.0-r228-dirty"
|
||||
#define MM_VERSION "2.0-r229-dirty"
|
||||
|
||||
void liftrlimit()
|
||||
{
|
||||
|
|
@ -30,6 +30,7 @@ static struct option long_options[] = {
|
|||
{ "print-seed", no_argument, 0, 0 },
|
||||
{ "max-chain-skip", required_argument, 0, 0 },
|
||||
{ "min-dp-len", required_argument, 0, 0 },
|
||||
{ "print-aln-seq", no_argument, 0, 0 },
|
||||
{ "version", no_argument, 0, 'V' },
|
||||
{ "min-count", required_argument, 0, 'n' },
|
||||
{ "min-chain-score",required_argument, 0, 'm' },
|
||||
|
|
@ -85,6 +86,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 0 && long_idx == 6) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_SEED; // --print-seed
|
||||
else if (c == 0 && long_idx == 7) opt.max_chain_skip = atoi(optarg); // --max-chain-skip
|
||||
else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len
|
||||
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
|
||||
else if (c == 'V') {
|
||||
puts(MM_VERSION);
|
||||
return 0;
|
||||
|
|
|
|||
7
mmpriv.h
7
mmpriv.h
|
|
@ -8,9 +8,10 @@
|
|||
#define MM_PARENT_UNSET (-1)
|
||||
#define MM_PARENT_TMP_PRI (-2)
|
||||
|
||||
#define MM_DBG_NO_KALLOC 0x1
|
||||
#define MM_DBG_PRINT_QNAME 0x2
|
||||
#define MM_DBG_PRINT_SEED 0x4
|
||||
#define MM_DBG_NO_KALLOC 0x1
|
||||
#define MM_DBG_PRINT_QNAME 0x2
|
||||
#define MM_DBG_PRINT_SEED 0x4
|
||||
#define MM_DBG_PRINT_ALN_SEQ 0x8
|
||||
|
||||
#define MM_SEED_LONG_JOIN (1ULL<<40)
|
||||
#define MM_SEED_IGNORE (1ULL<<41)
|
||||
|
|
|
|||
Loading…
Reference in New Issue