From adf6cd7f52ff92295bb2085bd502ab9397d8a9ab Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 16 Oct 2017 10:55:18 -0400 Subject: [PATCH] r513: merged pre- and post-cigar blen and mlen This saves a bit memory and is cleaner. --- align.c | 14 +++++++------- format.c | 7 +++---- hit.c | 12 ++++++------ main.c | 2 +- minimap.h | 4 +--- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/align.c b/align.c index f81f921..a105212 100644 --- a/align.c +++ b/align.c @@ -125,6 +125,7 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *qu if (p == 0) return; mm_fix_cigar(r, qseq, tseq, &qshift, &tshift); qseq += qshift, tseq += tshift; // qseq and tseq may be shifted due to the removal of leading I/D + r->blen = r->mlen = 0; for (k = 0; k < p->n_cigar; ++k) { uint32_t op = p->cigar[k]&0xf, len = p->cigar[k]>>4; if (op == 0) { // match/mismatch @@ -141,28 +142,27 @@ static void mm_update_extra(mm_reg1_t *r, const uint8_t *qseq, const uint8_t *qu if (s < 0) s = 0; else max = max > s? max : s; } - p->n_ambi += n_ambi; - p->n_diff += n_diff; - toff += len, qoff += len, p->blen += len; + r->blen += len - n_ambi, r->mlen += len - (n_ambi + n_diff), p->n_ambi += n_ambi; p->n_diff2 += n_diff2, p->blen2 += len - n_ambi; + toff += len, qoff += len; } else if (op == 1) { // insertion int n_ambi = 0; for (l = 0; l < len; ++l) if (qseq[qoff + l] > 3) ++n_ambi; - qoff += len, p->blen += len; - p->n_ambi += n_ambi, p->n_diff += len - n_ambi; + r->blen += len - n_ambi, p->n_ambi += n_ambi; p->n_diff2 += 1.0f, ++p->blen2; s -= q + e * len; if (s < 0) s = 0; + qoff += len; } else if (op == 2) { // deletion int n_ambi = 0; for (l = 0; l < len; ++l) if (tseq[toff + l] > 3) ++n_ambi; - toff += len, p->blen += len; - p->n_ambi += n_ambi, p->n_diff += len - n_ambi; + r->blen += len - n_ambi, p->n_ambi += n_ambi; p->n_diff2 += 1.0f, ++p->blen2; s -= q + e * len; if (s < 0) s = 0; + toff += len; } else if (op == 3) { // intron toff += len; } diff --git a/format.c b/format.c index c4964d0..04f8ac9 100644 --- a/format.c +++ b/format.c @@ -198,7 +198,7 @@ static inline void write_tags(kstring_t *s, const mm_reg1_t *r) int type = r->inv? 'I' : r->id == r->parent? 'P' : 'S'; if (r->iden_flt) mm_sprintf_lite(s, "\tom:i:%d", r->mapq); if (r->p) { - mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->p->n_diff, r->p->dp_max, r->p->dp_score, r->p->n_ambi); + mm_sprintf_lite(s, "\tNM:i:%d\tms:i:%d\tAS:i:%d\tnn:i:%d", r->blen - r->mlen + r->p->n_ambi, r->p->dp_max, r->p->dp_score, r->p->n_ambi); if (r->p->trans_strand == 1 || r->p->trans_strand == 2) mm_sprintf_lite(s, "\tts:A:%c", "?+-?"[r->p->trans_strand]); } @@ -214,8 +214,7 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m if (mi->seq[r->rid].name) mm_sprintf_lite(s, "%s", mi->seq[r->rid].name); else mm_sprintf_lite(s, "%d", r->rid); mm_sprintf_lite(s, "\t%d\t%d\t%d", mi->seq[r->rid].len, r->rs, r->re); - if (r->p) mm_sprintf_lite(s, "\t%d\t%d", r->p->blen - r->p->n_ambi - r->p->n_diff, r->p->blen); - else mm_sprintf_lite(s, "\t%d\t%d", r->fuzzy_mlen, r->fuzzy_blen); + mm_sprintf_lite(s, "\t%d\t%d", r->mlen, r->blen); mm_sprintf_lite(s, "\t%d", r->mapq); write_tags(s, r); if (r->p && (opt_flag & MM_F_OUT_CG)) { @@ -389,7 +388,7 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se if (l_I) mm_sprintf_lite(s, "%dI", l_I); if (l_D) mm_sprintf_lite(s, "%dD", l_D); if (clip3) mm_sprintf_lite(s, "%dS", clip3); - mm_sprintf_lite(s, ",%d,%d;", q->mapq, q->p->n_diff); + mm_sprintf_lite(s, ",%d,%d;", q->mapq, q->blen - q->mlen + q->p->n_ambi); } } } diff --git a/hit.c b/hit.c index 11c77e4..bfe5c57 100644 --- a/hit.c +++ b/hit.c @@ -8,15 +8,15 @@ static inline void mm_cal_fuzzy_len(mm_reg1_t *r, const mm128_t *a) { int i; - r->fuzzy_mlen = r->fuzzy_blen = 0; + r->mlen = r->blen = 0; if (r->cnt <= 0) return; - r->fuzzy_mlen = r->fuzzy_blen = a[r->as].y>>32&0xff; + r->mlen = r->blen = a[r->as].y>>32&0xff; for (i = r->as + 1; i < r->as + r->cnt; ++i) { int span = a[i].y>>32&0xff; int tl = (int32_t)a[i].x - (int32_t)a[i-1].x; int ql = (int32_t)a[i].y - (int32_t)a[i-1].y; - r->fuzzy_blen += tl > ql? tl : ql; - r->fuzzy_mlen += tl > span && ql > span? span : tl < ql? tl : ql; + r->blen += tl > ql? tl : ql; + r->mlen += tl > span && ql > span? span : tl < ql? tl : ql; } } @@ -227,7 +227,7 @@ void mm_filter_regs(void *km, const mm_mapopt_t *opt, int *n_regs, mm_reg1_t *re int flt = 0; if (!r->inv && !r->seg_split && r->cnt < opt->min_cnt) flt = 1; if (r->p) { - if (r->p->blen - r->p->n_ambi - r->p->n_diff < opt->min_chain_score) flt = 1; + if (r->mlen < opt->min_chain_score) flt = 1; else if (r->p->dp_max < opt->min_dp_max) flt = 1; if (flt) free(r->p); } @@ -432,7 +432,7 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, in pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm; subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc; if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) { - float identity = (float)(r->p->blen - r->p->n_diff - r->p->n_ambi) / (r->p->blen - r->p->n_ambi); + float identity = (float)r->mlen / r->blen; int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads mapq = (int)(identity * pen_cm * q_coef * (1. - (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score) * logf(r->score)); // more for long reads mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails diff --git a/main.c b/main.c index ada76b2..2f0a668 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.2-r512-dirty" +#define MM_VERSION "2.2-r513-dirty" #ifdef __linux__ #include diff --git a/minimap.h b/minimap.h index beb4b0d..5040e8a 100644 --- a/minimap.h +++ b/minimap.h @@ -54,8 +54,6 @@ typedef struct { typedef struct { uint32_t capacity; // the capacity of cigar[] int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings - uint32_t blen; // block length - uint32_t n_diff; // number of differences, including ambiguous bases uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for - uint32_t n_cigar; // number of cigar operations in cigar[] float n_diff2; @@ -71,7 +69,7 @@ typedef struct { int32_t qs, qe, rs, re; // query start and end; reference start and end int32_t parent, subsc; // parent==id if primary; best alternate mapping score int32_t as; // offset in the a[] array (for internal uses only) - int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate) + int32_t mlen, blen; // seeded exact match length; seeded alignment block length uint32_t mapq:8, split:2, n_sub:22; // mapQ; split pattern; number of suboptimal mappings uint32_t sam_pri:1, proper_frag:1, iden_flt:1, pe_thru:1, dummy:29; uint32_t hash;