diff --git a/bwamem.c b/bwamem.c index 660c51b..e5ea8e7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -67,6 +67,7 @@ mem_opt_t *mem_opt_init() o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; + o->max_hits = 10; o->max_matesw = 100; o->mask_level_redun = 0.95; o->min_chain_weight = 0; @@ -898,6 +899,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m } } } + if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); } @@ -934,10 +936,14 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) // TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { + extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query); kstring_t str; kvec_t(mem_aln_t) aa; int k; + char **XA = 0; + if (!(opt->flag & MEM_F_ALL)) + XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq); kv_init(aa); str.l = str.m = 0; str.s = 0; for (k = 0; k < a->n; ++k) { @@ -948,10 +954,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln2(opt, bns, pac, s->l_seq, s->seq, p, s->name); - if (q->rid < 0) { - --aa.n; - continue; - } + assert(q->rid >= 0); // this should not happen with the new code + q->XA = XA? XA[k] : 0; q->flag |= extra_flag; // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if (k && p->secondary < 0) // if supplementary @@ -966,10 +970,14 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa } else { for (k = 0; k < aa.n; ++k) mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP); - for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); + for (k = 0; k < aa.n; ++k) { + free(aa.a[k].cigar); + free(aa.a[k].XA); + } free(aa.a); } s->sam = str.s; + if (XA) free(XA); } mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) diff --git a/bwamem.h b/bwamem.h index ba630aa..a6a6c8d 100644 --- a/bwamem.h +++ b/bwamem.h @@ -47,6 +47,7 @@ typedef struct { int mapQ_coef_fac; int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end + int max_hits; // if there are max_hits or fewer, output them all int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; @@ -82,6 +83,7 @@ typedef struct { // This struct is only used for the convenience of API. uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance int n_cigar; // number of CIGAR operations uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 + char *XA; // alternative mappings int score, sub; } mem_aln_t; diff --git a/bwamem_extra.c b/bwamem_extra.c index fa56ef1..6b600ae 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -100,9 +100,48 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, kputw(bns->anns[rid].len, &str); kputc('\t', &str); kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str); ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb)); - kputc('\t', &str); kputw(p->n_comp, &str); kputc('\n', &str); } s->sam = str.s; } +// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. +char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() +{ + int i, k, *cnt, tot; + kstring_t *aln = 0; + char **XA = 0; + + cnt = calloc(a->n, sizeof(int)); + for (i = 0, tot = 0; i < a->n; ++i) { + int j = a->a[i].secondary; + if (j >= 0 && a->a[i].score >= a->a[j].score * opt->drop_ratio) + ++cnt[j], ++tot; + } + if (tot == 0) goto end_gen_alt; + aln = calloc(a->n, sizeof(kstring_t)); + for (i = 0; i < a->n; ++i) { + mem_aln_t t; + int j = a->a[i].secondary; + if (j < 0 || a->a[i].score < a->a[j].score * opt->drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later + if (cnt[j] > opt->max_hits) continue; + t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); + kputs(bns->anns[t.rid].name, &aln[j]); + kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]); + kputc(',', &aln[j]); + for (k = 0; k < t.n_cigar; ++k) { + kputw(t.cigar[k]>>4, &aln[j]); + kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]); + } + kputc(',', &aln[j]); kputw(t.NM, &aln[j]); + kputc(';', &aln[j]); + free(t.cigar); + } + XA = calloc(a->n, sizeof(char*)); + for (k = 0; k < a->n; ++k) + XA[k] = aln[k].s; + +end_gen_alt: + free(cnt); free(aln); + return XA; +} diff --git a/bwamem_pair.c b/bwamem_pair.c index b7cdc35..b0c1164 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -313,7 +313,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0; mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s; if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); - free(h[0].cigar); free(h[1].cigar); + free(h[0].cigar); free(h[0].XA); + free(h[1].cigar); free(h[1].XA); } else goto no_pairing; return n; diff --git a/main.c b/main.c index bcbcb22..31153c5 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8-r737-dirty" +#define PACKAGE_VERSION "0.7.8-r738-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);