r104: long gap patching

This commit is contained in:
Heng Li 2017-06-29 14:54:54 -04:00
parent 17944a75c2
commit b9075d39a8
3 changed files with 26 additions and 13 deletions

14
align.c
View File

@ -167,6 +167,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
int32_t rs1, qs1, re1, qe1; int32_t rs1, qs1, re1, qe1;
int8_t mat[25]; int8_t mat[25];
if (r->cnt == 0) return;
ksw_gen_simple_mat(5, mat, opt->a, opt->b); ksw_gen_simple_mat(5, mat, opt->a, opt->b);
bw = (int)(opt->bw * 1.5 + 1.); bw = (int)(opt->bw * 1.5 + 1.);
@ -221,7 +222,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
for (i = 1; i < r->cnt; ++i) { // gap filling for (i = 1; i < r->cnt; ++i) { // gap filling
mm_adjust_minier(mi, qseq0, &a[r->as + i], &re, &qe); mm_adjust_minier(mi, qseq0, &a[r->as + i], &re, &qe);
re1 = re, qe1 = qe; re1 = re, qe1 = qe;
if (i == r->cnt - 1 || qe - qs >= opt->min_ksw_len || re - rs >= opt->min_ksw_len) { if (i == r->cnt - 1 || (a[r->as+i].y>>40&1) || qe - qs >= opt->min_ksw_len || re - rs >= opt->min_ksw_len) {
int bw1 = bw;
if (a[r->as+i].y>>40&1)
bw1 = qe - qs > re - rs? qe - qs : re - rs;
qseq = &qseq0[rev][qs]; qseq = &qseq0[rev][qs];
mm_idx_getseq(mi, rid, rs, re, tseq); mm_idx_getseq(mi, rid, rs, re, tseq);
#if 0 #if 0
@ -229,9 +233,9 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
for (k = 0; k < tl; ++k) fputc("ACGTN"[tseq[k]], stderr); fputc('\n', stderr); for (k = 0; k < tl; ++k) fputc("ACGTN"[tseq[k]], stderr); fputc('\n', stderr);
for (k = 0; k < ql; ++k) fputc("ACGTN"[qseq[k]], stderr); fputc('\n', stderr); for (k = 0; k < ql; ++k) fputc("ACGTN"[qseq[k]], stderr); fputc('\n', stderr);
#endif #endif
ksw_extz2_sse(km, qe - qs, qseq, re - rs, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, KSW_EZ_APPROX_MAX, ez); ksw_extz2_sse(km, qe - qs, qseq, re - rs, tseq, 5, mat, opt->q, opt->e, bw1, opt->zdrop, KSW_EZ_APPROX_MAX, ez);
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop)) if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
ksw_extz2_sse(km, qe - qs, qseq, re - rs, tseq, 5, mat, opt->q, opt->e, bw, opt->zdrop, 0, ez); ksw_extz2_sse(km, qe - qs, qseq, re - rs, tseq, 5, mat, opt->q, opt->e, bw1, opt->zdrop, 0, ez);
if (ez->n_cigar > 0) { if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar); mm_append_cigar(r, ez->n_cigar, ez->cigar);
mm_update_extra(r->p, qseq, tseq, ez->n_cigar, ez->cigar, 0); mm_update_extra(r->p, qseq, tseq, ez->n_cigar, ez->cigar, 0);
@ -313,8 +317,8 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
for (r = i = 0; r < n_regs; ++r) { for (r = i = 0; r < n_regs; ++r) {
mm_reg1_t *reg = &regs[r]; mm_reg1_t *reg = &regs[r];
int flt = 0; int flt = 0;
if (reg->p->blen - reg->p->n_ambi - reg->p->n_diff < opt->min_chain_score) flt = 1; if (reg->cnt < opt->min_cnt) flt = 1;
else if (reg->cnt < opt->min_cnt) flt = 1; else if (reg->p->blen - reg->p->n_ambi - reg->p->n_diff < opt->min_chain_score) flt = 1;
else if (reg->p->score < opt->min_dp_score && (reg->qe - reg->qs) * 2 < qlen) flt = 1; else if (reg->p->score < opt->min_dp_score && (reg->qe - reg->qs) * 2 < qlen) flt = 1;
if (flt) free(reg->p); if (flt) free(reg->p);
else if (i < r) regs[i++] = regs[r]; // NB: this also move the regs[r].p pointer else if (i < r) regs[i++] = regs[r]; // NB: this also move the regs[r].p pointer

2
main.c
View File

@ -10,7 +10,7 @@
#include "minimap.h" #include "minimap.h"
#include "mmpriv.h" #include "mmpriv.h"
#define MM_VERSION "2.0-r99-pre" #define MM_VERSION "2.0-r104-pre"
void liftrlimit() void liftrlimit()
{ {

23
map.c
View File

@ -225,9 +225,13 @@ void mm_select_sub(float mask_level, float pri_ratio, int *n_, mm_reg1_t *r, voi
} }
} }
static int mm_squeeze_a_core(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a, const uint64_t *aux) static int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
{ // squeeze out regions in a[] that are not referenced by regs[] { // squeeze out regions in a[] that are not referenced by regs[]
int i, as = 0; int i, n_aux, as = 0;
uint64_t *aux;
aux = (uint64_t*)kmalloc(km, n_regs * 8);
for (i = n_aux = 0; i < n_regs; ++i)
aux[n_aux++] = (uint64_t)regs[i].as << 32 | i;
for (i = 0; i < n_regs; ++i) { for (i = 0; i < n_regs; ++i) {
mm_reg1_t *r = &regs[i]; mm_reg1_t *r = &regs[i];
if (r->as != as) { if (r->as != as) {
@ -236,21 +240,23 @@ static int mm_squeeze_a_core(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a,
} }
as += r->cnt; as += r->cnt;
} }
kfree(km, aux);
return as; return as;
} }
void mm_join_long(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, mm128_t *a) void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int n_regs, mm_reg1_t *regs, mm128_t *a)
{ {
int i, n_aux; int i, n_aux;
uint64_t *aux; uint64_t *aux;
if (n_regs < 2) return; // nothing to join if (n_regs < 2) return; // nothing to join
mm_squeeze_a(km, n_regs, regs, a);
aux = (uint64_t*)kmalloc(km, n_regs * 8); aux = (uint64_t*)kmalloc(km, n_regs * 8);
for (i = n_aux = 0; i < n_regs; ++i) for (i = n_aux = 0; i < n_regs; ++i)
aux[n_aux++] = (uint64_t)regs[i].as << 32 | i; if (regs[i].parent == i || regs[i].parent < 0)
assert(n_aux == n_regs); // TODO: may be relaxed in future for other use cases aux[n_aux++] = (uint64_t)regs[i].as << 32 | i;
radix_sort_64(aux, aux + n_aux); radix_sort_64(aux, aux + n_aux);
mm_squeeze_a_core(km, n_regs, regs, a, aux);
for (i = n_aux - 1; i >= 1; --i) { for (i = n_aux - 1; i >= 1; --i) {
mm_reg1_t *r0 = &regs[(int32_t)aux[i-1]], *r1 = &regs[(int32_t)aux[i]]; mm_reg1_t *r0 = &regs[(int32_t)aux[i-1]], *r1 = &regs[(int32_t)aux[i]];
@ -351,8 +357,10 @@ mm_reg1_t *mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b,
n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km); n_u = mm_chain_dp(opt->max_gap, opt->bw, opt->max_chain_skip, opt->min_cnt, opt->min_chain_score, n_a, a, &u, b->km);
regs = mm_gen_reg(qlen, n_u, u, a); regs = mm_gen_reg(qlen, n_u, u, a);
*n_regs = n_u; *n_regs = n_u;
if (!(opt->flag & MM_F_AVA)) // don't choose primary mapping(s) for read overlap if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
mm_select_sub(opt->mask_level, opt->pri_ratio, n_regs, regs, b->km); mm_select_sub(opt->mask_level, opt->pri_ratio, n_regs, regs, b->km);
mm_join_long(b->km, opt, qlen, *n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
}
if (opt->flag & MM_F_CIGAR) if (opt->flag & MM_F_CIGAR)
regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a); regs = mm_align_skeleton(b->km, opt, mi, qlen, seq, n_regs, regs, a);
@ -432,6 +440,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
bseq1_t *t = &s->seq[i]; bseq1_t *t = &s->seq[i];
for (j = 0; j < s->n_reg[i]; ++j) { for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j]; mm_reg1_t *r = &s->reg[i][j];
if (r->cnt == 0) continue;
if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, j, r); if (p->opt->flag & MM_F_OUT_SAM) mm_write_sam(&p->str, mi, t, j, r);
else mm_write_paf(&p->str, mi, t, j, r); else mm_write_paf(&p->str, mi, t, j, r);
puts(p->str.s); puts(p->str.s);