r738: output multi-map in the XA tag (SE only)
... PE support coming soon
This commit is contained in:
parent
d59d78838c
commit
c6c943f9d7
18
bwamem.c
18
bwamem.c
|
|
@ -67,6 +67,7 @@ mem_opt_t *mem_opt_init()
|
||||||
o->split_factor = 1.5;
|
o->split_factor = 1.5;
|
||||||
o->chunk_size = 10000000;
|
o->chunk_size = 10000000;
|
||||||
o->n_threads = 1;
|
o->n_threads = 1;
|
||||||
|
o->max_hits = 10;
|
||||||
o->max_matesw = 100;
|
o->max_matesw = 100;
|
||||||
o->mask_level_redun = 0.95;
|
o->mask_level_redun = 0.95;
|
||||||
o->min_chain_weight = 0;
|
o->min_chain_weight = 0;
|
||||||
|
|
@ -898,6 +899,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
|
||||||
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
|
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
|
||||||
kputc('\n', str);
|
kputc('\n', str);
|
||||||
}
|
}
|
||||||
|
|
@ -934,10 +936,14 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
|
||||||
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
|
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
|
||||||
void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
|
void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
|
||||||
{
|
{
|
||||||
|
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
|
||||||
kstring_t str;
|
kstring_t str;
|
||||||
kvec_t(mem_aln_t) aa;
|
kvec_t(mem_aln_t) aa;
|
||||||
int k;
|
int k;
|
||||||
|
char **XA = 0;
|
||||||
|
|
||||||
|
if (!(opt->flag & MEM_F_ALL))
|
||||||
|
XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
|
||||||
kv_init(aa);
|
kv_init(aa);
|
||||||
str.l = str.m = 0; str.s = 0;
|
str.l = str.m = 0; str.s = 0;
|
||||||
for (k = 0; k < a->n; ++k) {
|
for (k = 0; k < a->n; ++k) {
|
||||||
|
|
@ -948,10 +954,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
|
||||||
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
|
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
|
||||||
q = kv_pushp(mem_aln_t, aa);
|
q = kv_pushp(mem_aln_t, aa);
|
||||||
*q = mem_reg2aln2(opt, bns, pac, s->l_seq, s->seq, p, s->name);
|
*q = mem_reg2aln2(opt, bns, pac, s->l_seq, s->seq, p, s->name);
|
||||||
if (q->rid < 0) {
|
assert(q->rid >= 0); // this should not happen with the new code
|
||||||
--aa.n;
|
q->XA = XA? XA[k] : 0;
|
||||||
continue;
|
|
||||||
}
|
|
||||||
q->flag |= extra_flag; // flag secondary
|
q->flag |= extra_flag; // flag secondary
|
||||||
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
|
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
|
||||||
if (k && p->secondary < 0) // if supplementary
|
if (k && p->secondary < 0) // if supplementary
|
||||||
|
|
@ -966,10 +970,14 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
|
||||||
} else {
|
} else {
|
||||||
for (k = 0; k < aa.n; ++k)
|
for (k = 0; k < aa.n; ++k)
|
||||||
mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP);
|
mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP);
|
||||||
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
|
for (k = 0; k < aa.n; ++k) {
|
||||||
|
free(aa.a[k].cigar);
|
||||||
|
free(aa.a[k].XA);
|
||||||
|
}
|
||||||
free(aa.a);
|
free(aa.a);
|
||||||
}
|
}
|
||||||
s->sam = str.s;
|
s->sam = str.s;
|
||||||
|
if (XA) free(XA);
|
||||||
}
|
}
|
||||||
|
|
||||||
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||||
|
|
|
||||||
2
bwamem.h
2
bwamem.h
|
|
@ -47,6 +47,7 @@ typedef struct {
|
||||||
int mapQ_coef_fac;
|
int mapQ_coef_fac;
|
||||||
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
||||||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||||
|
int max_hits; // if there are max_hits or fewer, output them all
|
||||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||||
} mem_opt_t;
|
} mem_opt_t;
|
||||||
|
|
||||||
|
|
@ -82,6 +83,7 @@ typedef struct { // This struct is only used for the convenience of API.
|
||||||
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
|
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
|
||||||
int n_cigar; // number of CIGAR operations
|
int n_cigar; // number of CIGAR operations
|
||||||
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
|
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
|
||||||
|
char *XA; // alternative mappings
|
||||||
|
|
||||||
int score, sub;
|
int score, sub;
|
||||||
} mem_aln_t;
|
} mem_aln_t;
|
||||||
|
|
|
||||||
|
|
@ -100,9 +100,48 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
|
||||||
kputw(bns->anns[rid].len, &str); kputc('\t', &str);
|
kputw(bns->anns[rid].len, &str); kputc('\t', &str);
|
||||||
kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str);
|
kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str);
|
||||||
ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb));
|
ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb));
|
||||||
kputc('\t', &str); kputw(p->n_comp, &str);
|
|
||||||
kputc('\n', &str);
|
kputc('\n', &str);
|
||||||
}
|
}
|
||||||
s->sam = str.s;
|
s->sam = str.s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Build the strings that are later emitted in the XA tag (alternative mappings).
// For each hit i with a parent j = a->a[i].secondary >= 0 and a score of at least a->a[j].score * opt->drop_ratio,
// one record "name,<strand><pos>,<CIGAR>,<NM>;" is appended to the string that belongs to parent j.
// A parent with more than opt->max_hits such hits gets no record at all.
// Returns 0 when no hit qualifies; otherwise an array of a->n pointers indexed by hit, where untouched entries stay 0.
// Neither the returned array nor the strings it points to are freed here, so the caller has to release them.
// NOTE(review): the extern declaration shown in the bwamem.c hunk takes a non-const mem_alnreg_v pointer while this definition takes a const one -- confirm and reconcile.
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
|
||||||

char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY works after mem_mark_primary_se()
|
||||||

{
|
||||||

int i, k, *cnt, tot;
|
||||||

kstring_t *aln = 0;
|
||||||

char **XA = 0;
|
||||||

|
||||||

cnt = calloc(a->n, sizeof(int)); // cnt[j]: number of qualifying hits whose parent is hit j
|
||||||

for (i = 0, tot = 0; i < a->n; ++i) {
|
||||||

int j = a->a[i].secondary;
|
||||||

if (j >= 0 && a->a[i].score >= a->a[j].score * opt->drop_ratio)
|
||||||

++cnt[j], ++tot;
|
||||||

}
|
||||||

if (tot == 0) goto end_gen_alt; // nothing to report; XA is still 0
|
||||||

aln = calloc(a->n, sizeof(kstring_t)); // one zero-initialized string buffer per hit, filled by parent index
|
||||||

for (i = 0; i < a->n; ++i) {
|
||||||

mem_aln_t t;
|
||||||

int j = a->a[i].secondary;
|
||||||

if (j < 0 || a->a[i].score < a->a[j].score * opt->drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
|
||||||

if (cnt[j] > opt->max_hits) continue; // too many alternatives for this parent: emit none of them
|
||||||

t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); // t.cigar is released at the end of this iteration
|
||||||

kputs(bns->anns[t.rid].name, &aln[j]);
|
||||||

kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]); // strand sign directly followed by t.pos + 1
|
||||||

kputc(',', &aln[j]);
|
||||||

for (k = 0; k < t.n_cigar; ++k) {
|
||||||

kputw(t.cigar[k]>>4, &aln[j]); // upper bits: operation length
|
||||||

kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]); // low 4 bits: index of the operation letter
|
||||||

}
|
||||||

kputc(',', &aln[j]); kputw(t.NM, &aln[j]);
|
||||||

kputc(';', &aln[j]);
|
||||||

free(t.cigar);
|
||||||

}
|
||||||

XA = calloc(a->n, sizeof(char*));
|
||||||

for (k = 0; k < a->n; ++k)
|
||||||

XA[k] = aln[k].s; // hand the string buffers over to the returned array
|
||||||

|
||||||

end_gen_alt:
|
||||||

free(cnt); free(aln); // only the scratch arrays are freed; the strings are now reachable through XA
|
||||||

return XA;
|
||||||

}
|
||||||
|
|
|
||||||
|
|
@ -313,7 +313,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
||||||
mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0;
|
mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0;
|
||||||
mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s;
|
mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s;
|
||||||
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
|
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
|
||||||
free(h[0].cigar); free(h[1].cigar);
|
free(h[0].cigar); free(h[0].XA);
|
||||||
|
free(h[1].cigar); free(h[1].XA);
|
||||||
} else goto no_pairing;
|
} else goto no_pairing;
|
||||||
return n;
|
return n;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue