From e6f525edafad46ad9e056bacbb9b5ed1b46c0ea2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 16 Oct 2017 10:38:22 -0400 Subject: [PATCH] r512: option to filter poorly aligned reads --- align.c | 56 ++++++++++++++++++++++++++++++++++++------------------- format.c | 10 ++++++---- hit.c | 39 ++++++++++++++++++++++++++++++++++++++ main.c | 6 ++++-- map.c | 28 +++++++++++++++++----------- minimap.h | 5 ++++- mmpriv.h | 3 ++- pe.c | 22 ++++++++++++++++++++++ 8 files changed, 131 insertions(+), 38 deletions(-) diff --git a/align.c b/align.c index dd93058..f81f921 100644 --- a/align.c +++ b/align.c @@ -117,7 +117,7 @@ static void mm_fix_cigar(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, } } -static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e) +static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *qual, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e) { uint32_t k, l, toff = 0, qoff = 0; int32_t s = 0, max = 0, qshift, tshift; @@ -128,21 +128,30 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts for (k = 0; k < p->n_cigar; ++k) { uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; if (op == 0) { // match/mismatch + int n_ambi = 0, n_diff = 0; + float n_diff2 = 0.0f; for (l = 0; l < len; ++l) { int cq = qseq[qoff + l], ct = tseq[toff + l]; - if (ct > 3 || cq > 3) ++p->n_ambi; - else if (ct != cq) ++p->n_diff; + if (ct > 3 || cq > 3) ++n_ambi; + else if (ct != cq) { + ++n_diff; + n_diff2 += qual == 0 || qual[qoff + l] >= 20? 1.0f : .05f * qual[qoff + l]; + } s += mat[ct * 5 + cq]; if (s < 0) s = 0; else max = max > s? max : s; } + p->n_ambi += n_ambi; + p->n_diff += n_diff; toff += len, qoff += len, p->blen += len; + p->n_diff2 += n_diff2, p->blen2 += len - n_ambi; } else if (op == 1) { // insertion int n_ambi = 0; for (l = 0; l < len; ++l) if (qseq[qoff + l] > 3) ++n_ambi; qoff += len, p->blen += len; p->n_ambi += n_ambi, p->n_diff += len - n_ambi; + p->n_diff2 += 1.0f, ++p->blen2; s -= q + e * len; if (s < 0) s = 0; } else if (op == 2) { // deletion @@ -151,6 +160,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *ts if (tseq[toff + l] > 3) ++n_ambi; toff += len, p->blen += len; p->n_ambi += n_ambi, p->n_diff += len - n_ambi; + p->n_diff2 += 1.0f, ++p->blen2; s -= q + e * len; if (s < 0) s = 0; } else if (op == 3) { // intron @@ -335,7 +345,7 @@ static void mm_max_stretch(const mm_mapopt_t *opt, const mm_reg1_t *r, const mm1 *as = max_i, *cnt = max_len; } -static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag) +static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], uint8_t *qual0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag) { int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE); int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1; @@ -381,10 +391,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (is_sr) { qs0 = 0, qe0 = qlen; l = qs; - l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0; + l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0; rs0 = rs - l > 0? rs - l : 0; l = qlen - qe; - l += l * opt->a > opt->q? (l * opt->a - opt->q) / opt->e : 0; + l += l * opt->a + opt->end_bonus > opt->q? (l * opt->a + opt->end_bonus - opt->q) / opt->e : 0; re0 = re + l < mi->seq[rid].len? re + l : mi->seq[rid].len; } else { // compute rs0 and qs0 @@ -523,7 +533,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int assert(re1 - rs1 <= re0 - rs0); if (r->p) { mm_idx_getseq(mi, rid, rs1, re1, tseq); - mm_update_extra(r, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e); + mm_update_extra(r, &qseq0[r->rev][qs1], qual0[r->rev]? &qual0[r->rev][qs1] : 0, tseq, mat, opt->q, opt->e); if (rev && r->p->trans_strand) r->p->trans_strand ^= 3; // flip to the read strand } @@ -531,10 +541,10 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int kfree(km, tseq); } -static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez) +static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], uint8_t *qual0[2], const mm_reg1_t *r1, const mm_reg1_t *r2, mm_reg1_t *r_inv, ksw_extz_t *ez) { int tl, ql, score, ret = 0, q_off, t_off; - uint8_t *tseq, *qseq; + uint8_t *tseq, *qseq, *qual; int8_t mat[25]; void *qp; @@ -552,6 +562,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i tseq = (uint8_t*)kmalloc(km, tl); mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq); qseq = &qseq0[!r1->rev][qlen - r2->qs]; + qual = qual0[!r1->rev]? &qseq0[!r1->rev][qlen - r2->qs] : 0; mm_seq_rev(ql, qseq); mm_seq_rev(tl, tseq); @@ -573,7 +584,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i r_inv->rid = r1->rid; r_inv->qs = r1->qe + q_off, r_inv->qe = r_inv->qs + ez->max_q + 1; r_inv->rs = r1->re + t_off, r_inv->re = r_inv->rs + ez->max_t + 1; - mm_update_extra(r_inv, qseq + q_off, tseq + t_off, mat, opt->q, opt->e); + mm_update_extra(r_inv, &qseq[q_off], qual? &qual[q_off] : 0, &tseq[t_off], mat, opt->q, opt->e); ret = 1; end_align1_inv: kfree(km, tseq); @@ -590,20 +601,26 @@ static inline mm_reg1_t *mm_insert_reg(const mm_reg1_t *r, int i, int *n_regs, m return regs; } -mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a) +mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, const char *qual, int *n_regs_, mm_reg1_t *regs, mm128_t *a) { extern unsigned char seq_nt4_table[256]; int32_t i, n_regs = *n_regs_, n_a; - uint8_t *qseq0[2]; + uint8_t *qseq0[2], *qual0[2]; ksw_extz_t ez; // encode the query sequence - qseq0[0] = (uint8_t*)kmalloc(km, qlen); - qseq0[1] = (uint8_t*)kmalloc(km, qlen); + qseq0[0] = (uint8_t*)kmalloc(km, qlen * 2); + qseq0[1] = qseq0[0] + qlen; for (i = 0; i < qlen; ++i) { qseq0[0][i] = seq_nt4_table[(uint8_t)qstr[i]]; qseq0[1][qlen - 1 - i] = qseq0[0][i] < 4? 3 - qseq0[0][i] : 4; } + if (qual) { + qual0[0] = (uint8_t*)kmalloc(km, qlen * 2); + qual0[1] = qual0[0] + qlen; + for (i = 0; i < qlen; ++i) + qual0[0][i] = qual0[1][qlen - 1 - i] = qual[i] - 33; + } else qual0[0] = qual0[1] = 0; // align through seed hits n_a = mm_squeeze_a(km, n_regs, regs, a); @@ -614,8 +631,8 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m mm_reg1_t s[2], s2[2]; int which, trans_strand; s[0] = s[1] = regs[i]; - mm_align1(km, opt, mi, qlen, qseq0, &s[0], &s2[0], n_a, a, &ez, MM_F_SPLICE_FOR); - mm_align1(km, opt, mi, qlen, qseq0, &s[1], &s2[1], n_a, a, &ez, MM_F_SPLICE_REV); + mm_align1(km, opt, mi, qlen, qseq0, qual0, &s[0], &s2[0], n_a, a, &ez, MM_F_SPLICE_FOR); + mm_align1(km, opt, mi, qlen, qseq0, qual0, &s[1], &s2[1], n_a, a, &ez, MM_F_SPLICE_REV); if (s[0].p->dp_score > s[1].p->dp_score) which = 0, trans_strand = 1; else if (s[0].p->dp_score < s[1].p->dp_score) which = 1, trans_strand = 2; else trans_strand = 3, which = (qlen + s[0].p->dp_score) & 1; // randomly choose a strand, effectively @@ -628,20 +645,21 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m } regs[i].p->trans_strand = trans_strand; } else { // one round of alignment - mm_align1(km, opt, mi, qlen, qseq0, ®s[i], &r2, n_a, a, &ez, opt->flag); + mm_align1(km, opt, mi, qlen, qseq0, qual0, ®s[i], &r2, n_a, a, &ez, opt->flag); if (opt->flag&MM_F_SPLICE) regs[i].p->trans_strand = opt->flag&MM_F_SPLICE_FOR? 1 : 2; } if (r2.cnt > 0) regs = mm_insert_reg(&r2, i, &n_regs, regs); if (!(opt->flag&MM_F_SPLICE) && !(opt->flag&MM_F_SR) && i > 0) { // don't try inversion alignment for -xsplice or -xsr - if (mm_align1_inv(km, opt, mi, qlen, qseq0, ®s[i-1], ®s[i], &r2, &ez)) { + if (mm_align1_inv(km, opt, mi, qlen, qseq0, qual0, ®s[i-1], ®s[i], &r2, &ez)) { regs = mm_insert_reg(&r2, i, &n_regs, regs); ++i; // skip the inserted INV alignment } } } *n_regs_ = n_regs; - kfree(km, qseq0[0]); kfree(km, qseq0[1]); + kfree(km, qseq0[0]); + if (qual0[0]) kfree(km, qual0[0]); kfree(km, ez.cigar); mm_filter_regs(km, opt, n_regs_, regs); mm_hit_sort_by_dp(km, n_regs_, regs); diff --git a/format.c b/format.c index 061103b..c4964d0 100644 --- a/format.c +++ b/format.c @@ -196,14 +196,15 @@ static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) { int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S'; - mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score); - if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc); - if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split); + if (r->iden_flt) mm_sprintf_lite(s, "\tom:i:%d", r->mapq); if (r->p) { mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); if (r->p->trans_strand == 1 || r->p->trans_strand == 2) mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]); } + mm_sprintf_lite(s, "\ttp:A:%c\tcm:i:%d\ts1:i:%d", type, r->cnt, r->score); + if (r->parent == r->id) mm_sprintf_lite(s, "\ts2:i:%d", r->subsc); + if (r->split) mm_sprintf_lite(s, "\tzd:i:%d", r->split); } void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag) @@ -303,8 +304,9 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se mm_sprintf_lite(s, "\t%s\t%d\t0\t*", mi->seq[this_rid].name, this_pos+1); } else mm_sprintf_lite(s, "\t*\t0\t0\t*"); } else { + int mapq = !r->iden_flt? r->mapq : r->mapq < 3? r->mapq : 3; this_rid = r->rid, this_pos = r->rs, this_rev = r->rev; - mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, r->mapq); + mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, mapq); if (r->p) { // actually this should always be true for SAM output uint32_t k, clip_len = r->rev? t->l_seq - r->qe : r->qs; int clip_char = (flag&0x800)? 'H' : 'S'; diff --git a/hit.c b/hit.c index 40147d4..11c77e4 100644 --- a/hit.c +++ b/hit.c @@ -239,6 +239,45 @@ void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *re *n_regs = k; } +void mm_filter_by_identity(void *km, int n_regs, mm_reg1_t *regs, float min_iden, int qlen, const char *qual) // TODO: make sure it is not beyond the ends of contigs +{ + int i, j, n_aux = 0, en, blen = 0; + uint64_t *aux; + float n_diff = 0.0f; + if (n_regs <= 0) return; + for (i = 0; i < n_regs; ++i) + if (regs[i].id == regs[i].parent && regs[i].pe_thru) // sequenced through the fragment; don't filter + return; + for (i = 0; i < n_regs; ++i) + if (regs[i].id == regs[i].parent) + ++n_aux; + assert(n_aux >= 1); + aux = (uint64_t*)kmalloc(km, n_aux * 8); + for (i = 0, n_aux = 0; i < n_regs; ++i) + if (regs[i].id == regs[i].parent) + aux[n_aux++] = (uint64_t)regs[i].qs<<32 | i; + radix_sort_64(aux, aux + n_aux); + for (i = 0, en = 0; i < n_aux; ++i) { + mm_reg1_t *r = ®s[(int32_t)aux[i]]; + if (r->qs > en) { + for (j = en; j < r->qs; ++j) + n_diff += qual == 0 || qual[j] >= 53? 0.5f : 0.025f * (qual[j] - 33); + blen += r->qs - en; + } + assert(r->p); + blen += r->p->blen2; + n_diff += r->p->n_diff2; + en = en > r->qe? en : r->qe; + } + for (j = en; j < qlen; ++j) + n_diff += qual == 0 || qual[j] >= 53? 0.5f : 0.025f * (qual[j] - 33); + blen += qlen - en; + kfree(km, aux); + if (1.0f - n_diff / blen < min_iden) + for (i = 0; i < n_regs; ++i) + regs[i].iden_flt = 1; +} + int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a) { // squeeze out regions in a[] that are not referenced by regs[] int i, as = 0; diff --git a/main.c b/main.c index 316c9ce..ada76b2 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.2-r511-dirty" +#define MM_VERSION "2.2-r512-dirty" #ifdef __linux__ #include @@ -65,7 +65,7 @@ static inline int64_t mm_parse_num(const char *str) int main(int argc, char *argv[]) { - const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:"; + const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:"; mm_mapopt_t opt; mm_idxopt_t ipt; int i, c, n_threads = 3, long_idx, max_gap_ref = 0; @@ -100,6 +100,7 @@ int main(int argc, char *argv[]) else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg); else if (c == 'G') max_gap_ref = (int)mm_parse_num(optarg); else if (c == 'F') opt.max_frag_len = (int)mm_parse_num(optarg); + else if (c == 'i') opt.min_iden = atof(optarg); else if (c == 'N') opt.best_n = atoi(optarg); else if (c == 'p') opt.pri_ratio = atof(optarg); else if (c == 'M') opt.mask_level = atof(optarg); @@ -222,6 +223,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -z INT Z-drop score [%d]\n", opt.zdrop); fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); + fprintf(fp_help, " -i FLOAT min identity (mapQ reduced to 0 if below) [0]\n"); fprintf(fp_help, " Input/Output:\n"); fprintf(fp_help, " -a output in the SAM format (PAF by default)\n"); fprintf(fp_help, " -Q don't output base quality in SAM\n"); diff --git a/map.c b/map.c index fb291b6..c6c80af 100644 --- a/map.c +++ b/map.c @@ -85,9 +85,9 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) io->is_hpc = 0, io->k = 21, io->w = 11; mo->flag |= MM_F_SR | MM_F_FRAG_MODE | MM_F_NO_PRINT_2ND | MM_F_2_IO_THREADS; mo->pe_ori = 0<<1|1; // FR - mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 32, mo->e2 = 1; + mo->a = 2, mo->b = 8, mo->q = 12, mo->e = 2, mo->q2 = 24, mo->e2 = 1; mo->zdrop = 100; - mo->end_bonus = 11; + mo->end_bonus = 15; mo->max_frag_len = 800; mo->max_gap = 100; mo->bw = 100; @@ -251,10 +251,10 @@ static void chain_post(const mm_mapopt_t *opt, int max_chain_gap_ref, const mm_i } } -static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, int *n_regs, mm_reg1_t *regs, mm128_t *a) +static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int qlen, const char *seq, const char *qual, int *n_regs, mm_reg1_t *regs, mm128_t *a) { if (!(opt->flag & MM_F_CIGAR)) return regs; - regs = mm_align_skeleton(km, opt, mi, qlen, seq, n_regs, regs, a); // this calls mm_filter_regs() + regs = mm_align_skeleton(km, opt, mi, qlen, seq, qual, n_regs, regs, a); // this calls mm_filter_regs() if (!(opt->flag & MM_F_AVA)) { mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b); mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs); @@ -263,7 +263,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k return regs; } -void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) +void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, const char **quals, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) { int i, j, rep_len, qlen_sum, n_regs0; int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE); @@ -340,7 +340,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** chain_post(opt, max_chain_gap_ref, mi, b->km, qlen_sum, n_segs, qlens, &n_regs0, regs0, a); if (n_segs == 1) { // uni-segment - regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], &n_regs0, regs0, a); + regs0 = align_regs(opt, mi, b->km, qlens[0], seqs[0], quals? quals[0] : 0, &n_regs0, regs0, a); mm_set_mapq(n_regs0, regs0, opt->min_chain_score, opt->a, rep_len); n_regs[0] = n_regs0, regs[0] = regs0; } else { // multi-segment @@ -349,13 +349,16 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** free(regs0); for (i = 0; i < n_segs; ++i) { mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b); // update mm_reg1_t::parent - regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], &n_regs[i], regs[i], seg[i].a); + regs[i] = align_regs(opt, mi, b->km, qlens[i], seqs[i], quals? quals[i] : 0, &n_regs[i], regs[i], seg[i].a); mm_set_mapq(n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len); } mm_seg_free(b->km, n_segs, seg); if (n_segs == 2 && opt->pe_ori >= 0 && (opt->flag&MM_F_CIGAR)) mm_pair(b->km, max_chain_gap_ref, opt->pe_bonus, opt->a * 2 + opt->b, opt->a, qlens, n_regs, regs); // pairing } + if (opt->min_iden > 0.0f) + for (i = 0; i < n_segs; ++i) + mm_filter_by_identity(b->km, n_regs[i], regs[i], opt->min_iden, qlens[i], quals[i]); kfree(b->km, a); kfree(b->km, u); @@ -364,7 +367,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname) { mm_reg1_t *regs; - mm_map_frag(mi, 1, &qlen, &seq, n_regs, ®s, b, opt, qname); + mm_map_frag(mi, 1, &qlen, &seq, 0, n_regs, ®s, b, opt, qname); return regs; } @@ -392,20 +395,22 @@ typedef struct { static void worker_for(void *_data, long i, int tid) // kt_for() callback { step_t *s = (step_t*)_data; - int *qlens, j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori; - const char **qseqs; + int *qlens, j, off = s->seg_off[i], pe_ori = s->p->opt->pe_ori, is_sr = !!(s->p->opt->flag & MM_F_SR); + const char **qseqs, **quals = 0; mm_tbuf_t *b = s->buf[tid]; if (mm_dbg_flag & MM_DBG_PRINT_QNAME) fprintf(stderr, "QR\t%s\t%d\n", s->seq[off].name, tid); qlens = (int*)kmalloc(b->km, s->n_seg[i] * sizeof(int)); qseqs = (const char**)kmalloc(b->km, s->n_seg[i] * sizeof(const char**)); + quals = (const char**)kmalloc(b->km, s->n_seg[i] * sizeof(const char**)); for (j = 0; j < s->n_seg[i]; ++j) { if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) mm_revcomp_bseq(&s->seq[off + j]); qlens[j] = s->seq[off + j].l_seq; qseqs[j] = s->seq[off + j].seq; + quals[j] = is_sr? s->seq[off + j].qual : 0; } - mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name); + mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, quals, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name); for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) { int k, t; @@ -420,6 +425,7 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback } kfree(b->km, qlens); kfree(b->km, qseqs); + kfree(b->km, quals); } static void *worker_pipeline(void *shared, int step, void *in) diff --git a/minimap.h b/minimap.h index 9bef3e5..beb4b0d 100644 --- a/minimap.h +++ b/minimap.h @@ -58,6 +58,8 @@ typedef struct { uint32_t n_diff; // number of differences, including ambiguous bases uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for - uint32_t n_cigar; // number of cigar operations in cigar[] + float n_diff2; + uint32_t blen2; uint32_t cigar[]; } mm_extra_t; @@ -71,7 +73,7 @@ typedef struct { int32_t as; // offset in the a[] array (for internal uses only) int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate) uint32_t mapq:8, split:2, n_sub:22; // mapQ; split pattern; number of suboptimal mappings - uint32_t sam_pri:1, proper_frag:1, dummy:30; + uint32_t sam_pri:1, proper_frag:1, iden_flt:1, pe_thru:1, dummy:29; uint32_t hash; mm_extra_t *p; } mm_reg1_t; @@ -98,6 +100,7 @@ typedef struct { float mask_level; float pri_ratio; int best_n; // top best_n chains are subjected to DP alignment + float min_iden; int max_join_long, max_join_short; int min_join_flank_sc; diff --git a/mmpriv.h b/mmpriv.h index 285a183..7fe23e3 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -64,7 +64,7 @@ const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f); mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km); -mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a); +mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, const char *qual, int *n_regs_, mm_reg1_t *regs, mm128_t *a); mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a); void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a); @@ -75,6 +75,7 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, mm_reg1_t *r); void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r); void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *regs); +void mm_filter_by_identity(void *km, int n_regs, mm_reg1_t *regs, float min_iden, int qlen, const char *qual); void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a); void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r); void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len); diff --git a/pe.c b/pe.c index a1f9fda..a73556b 100644 --- a/pe.c +++ b/pe.c @@ -42,6 +42,26 @@ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int } } +void mm_set_pe_thru(const int *qlens, int *n_regs, mm_reg1_t **regs) +{ + int s, i, n_pri[2], pri[2]; + n_pri[0] = n_pri[1] = 0; + pri[0] = pri[1] = -1; + for (s = 0; s < 2; ++s) + for (i = 0; i < n_regs[s]; ++i) + if (regs[s][i].id == regs[s][i].parent) + ++n_pri[s], pri[s] = i; + if (n_pri[0] == 1 && n_pri[1] == 1) { + mm_reg1_t *p = ®s[0][pri[0]]; + mm_reg1_t *q = ®s[1][pri[1]]; + if (p->rid == q->rid && p->rev == q->rev && abs(p->rs - q->rs) < 3 && abs(p->re - p->re) < 3 + && ((p->qs == 0 && qlens[1] - q->qe == 0) || (q->qs == 0 && qlens[0] - p->qe == 0))) + { + p->pe_thru = q->pe_thru = 1; + } + } +} + #include "ksort.h" typedef struct { @@ -152,4 +172,6 @@ void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc kfree(km, a); kfree(km, sc.a); + + mm_set_pe_thru(qlens, n_regs, regs); }