r224: inversion alignment around Z-drop break

This commit is contained in:
Heng Li 2017-07-29 13:09:10 -04:00
parent 120bebc290
commit 19d6ec885e
7 changed files with 60 additions and 30 deletions

65
align.c
View File

@ -355,23 +355,53 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
kfree(km, tseq);
}
static void mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez)
static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez)
{
int tlen, qlen;
int tl, ql, ret = 0;
uint8_t *tseq, *qseq;
int8_t mat[25];
memset(r_inv, 0, sizeof(mm_reg1_t));
if (r1->rid != r2->rid || r1->rev != r2->rev) return;
qlen = r2->qs - r1->qe;
tlen = r2->rs - r1->re;
if (qlen < 0 || qlen > opt->max_gap) return;
if (tlen < 0 || tlen > opt->max_gap) return;
if (!(r1->split&1) || !(r2->split&2)) return 0;
if (r1->rid != r2->rid || r1->rev != r2->rev) return 0;
ql = r2->qs - r1->qe;
tl = r2->rs - r1->re;
if (ql < 0 || ql > opt->max_gap) return 0;
if (tl < 0 || tl > opt->max_gap) return 0;
ksw_gen_simple_mat(5, mat, opt->a, opt->b);
tseq = (uint8_t*)kmalloc(km, tlen);
tseq = (uint8_t*)kmalloc(km, tl);
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
qseq = &qseq0[!r1->rev][r1->qe];
mm_align_pair(km, opt, qlen, qseq, tlen, tseq, mat, -1, KSW_EZ_APPROX_MAX, ez);
fprintf(stderr, "%d; (%d,%d); (%d,%d)\n", ez->score, r1->re, r2->rs, r1->qe, r2->qs);
qseq = &qseq0[!r1->rev][qlen - r2->qs];
mm_align_pair(km, opt, ql, qseq, tl, tseq, mat, (int)(opt->bw * 1.5), KSW_EZ_APPROX_MAX, ez);
if (ez->n_cigar > 0 && ez->score > 0) {
mm_append_cigar(r_inv, ez->n_cigar, ez->cigar);
r_inv->p->dp_score = ez->score;
mm_update_extra(r_inv->p, qseq, tseq, mat, opt->q, opt->e);
if (r_inv->p->dp_max < opt->min_dp_max) { // discard this hit
free(r_inv->p);
r_inv->p = 0;
} else {
r_inv->id = -1;
r_inv->parent = MM_PARENT_UNSET;
r_inv->inv = 1;
r_inv->rev = !r1->rev;
r_inv->qs = r1->qe, r_inv->qe = r2->qs;
r_inv->rs = r1->re, r_inv->re = r2->rs;
ret = 1;
fprintf(stderr, "here!\n");
}
}
kfree(km, tseq);
return ret;
}
static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, mm_reg1_t *regs)
{
regs = (mm_reg1_t*)realloc(regs, (*n_regs + 1) * sizeof(mm_reg1_t));
if (i + 1 != *n_regs)
memmove(&regs[i + 2], &regs[i + 1], sizeof(mm_reg1_t) * (*n_regs - i - 1));
regs[i + 1] = *r;
++*n_regs;
return regs;
}
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a)
@ -394,15 +424,10 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
for (i = 0; i < n_regs; ++i) {
mm_reg1_t r2;
mm_align1(km, opt, mi, qlen, qseq0, &regs[i], &r2, a, &ez);
if (r2.cnt > 0) {
regs = (mm_reg1_t*)realloc(regs, (n_regs + 1) * sizeof(mm_reg1_t)); // this should be very rare
if (i + 1 != n_regs)
memmove(&regs[i + 2], &regs[i + 1], sizeof(mm_reg1_t) * (n_regs - i - 1));
regs[i + 1] = r2;
++n_regs;
}
if (i > 0 && (regs[i].split&2) && (regs[i-1].split&1)) {
mm_align1_inv(km, opt, mi, qseq0, &regs[i-1], &regs[i], &r2, &ez);
if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs);
if (i > 0 && mm_align1_inv(km, opt, mi, qlen, qseq0, &regs[i-1], &regs[i], &r2, &ez)) {
regs = mm_insert_reg(&r2, i, &n_regs, regs);
++i; // skip the inserted INV alignment
}
}
*n_regs_ = n_regs;

View File

@ -56,7 +56,8 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
static inline void write_tags(kstring_t *s, const mm_reg1_t *r)
{
mm_sprintf_lite(s, "\tcm:i:%d\ts1:i:%d", r->cnt, r->score);
int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S';
mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score);
if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc);
if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split);
if (r->p) mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi);

10
hit.c
View File

@ -125,11 +125,11 @@ void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r)
aux = (uint64_t*)kmalloc(km, n * 8);
t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
for (i = n_aux = 0; i < n; ++i) {
if (r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
assert(r[i].p);
aux[n_aux++] = (uint64_t)r[i].p->dp_max << 32 | i;
} else if (r[i].p) {
kfree(km, r[i].p);
free(r[i].p);
r[i].p = 0;
}
}
@ -197,7 +197,7 @@ void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *re
for (i = k = 0; i < *n_regs; ++i) {
mm_reg1_t *r = &regs[i];
int flt = 0;
if (r->cnt < opt->min_cnt) flt = 1;
if (!r->inv && r->cnt < opt->min_cnt) flt = 1;
if (r->p) {
if (r->p->blen - r->p->n_ambi - r->p->n_diff < opt->min_chain_score) flt = 1;
else if (r->p->dp_max < opt->min_dp_max) flt = 1;
@ -294,7 +294,9 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc)
int i;
for (i = 0; i < n_regs; ++i) {
mm_reg1_t *r = &regs[i];
if (r->parent == r->id) {
if (r->inv) {
r->mapq = 0;
} else if (r->parent == r->id) {
int mapq, subsc;
float pen_cm = r->cnt >= 10? 1.0f : 0.1f * r->cnt;
subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;

2
main.c
View File

@ -8,7 +8,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r222-dirty"
#define MM_VERSION "2.0-r224-dirty"
void liftrlimit()
{

2
map.c
View File

@ -31,7 +31,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
opt->zdrop = 400;
opt->min_dp_max = opt->min_chain_score;
opt->min_dp_max = opt->min_chain_score * opt->a;
opt->min_ksw_len = 200;
}

View File

@ -61,7 +61,7 @@ typedef struct {
typedef struct {
int32_t id;
uint32_t cnt:31, rev:1;
uint32_t rid:31, rep:1;
uint32_t rid:31, inv:1;
int32_t score;
int32_t qs, qe, rs, re;
int32_t parent, subsc;

View File

@ -1,4 +1,4 @@
.TH minimap2 1 "28 July 2017" "minimap2-2.0-r219-dirty" "Bioinformatics tools"
.TH minimap2 1 "29 July 2017" "minimap2-2.0-r224-dirty" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@ -329,6 +329,7 @@ cb | cb | cb
r | c | l .
Tag Type Description
_
tp A Type of aln: P/primary, S/secondary and I/inversion
cm i Number of minimizers on the chain
s1 i Chaining score
s2 i Chaining score of the best secondary chain
@ -343,7 +344,8 @@ cg Z CIGAR string (only in PAF)
.TP 2
*
Minimap2 may produce suboptimal alignments through long low-complexity regions
where seed positions may be inaccurate.
where seed positions may be suboptimal. This should not be a big concern
because even the optimal alignment may be wrong in such regions.
.TP
*
Minimap2 may produce poor alignments that may need post-filtering. We are still