r738: output multi-map in the XA tag (SE only)
... PE support coming soon
This commit is contained in:
parent
d59d78838c
commit
c6c943f9d7
18
bwamem.c
18
bwamem.c
|
|
@ -67,6 +67,7 @@ mem_opt_t *mem_opt_init()
|
|||
o->split_factor = 1.5;
|
||||
o->chunk_size = 10000000;
|
||||
o->n_threads = 1;
|
||||
o->max_hits = 10;
|
||||
o->max_matesw = 100;
|
||||
o->mask_level_redun = 0.95;
|
||||
o->min_chain_weight = 0;
|
||||
|
|
@ -898,6 +899,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
|
|||
}
|
||||
}
|
||||
}
|
||||
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
|
||||
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
|
||||
kputc('\n', str);
|
||||
}
|
||||
|
|
@ -934,10 +936,14 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
|
|||
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
|
||||
void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
|
||||
{
|
||||
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
|
||||
kstring_t str;
|
||||
kvec_t(mem_aln_t) aa;
|
||||
int k;
|
||||
char **XA = 0;
|
||||
|
||||
if (!(opt->flag & MEM_F_ALL))
|
||||
XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
|
||||
kv_init(aa);
|
||||
str.l = str.m = 0; str.s = 0;
|
||||
for (k = 0; k < a->n; ++k) {
|
||||
|
|
@ -948,10 +954,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
|
|||
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
|
||||
q = kv_pushp(mem_aln_t, aa);
|
||||
*q = mem_reg2aln2(opt, bns, pac, s->l_seq, s->seq, p, s->name);
|
||||
if (q->rid < 0) {
|
||||
--aa.n;
|
||||
continue;
|
||||
}
|
||||
assert(q->rid >= 0); // this should not happen with the new code
|
||||
q->XA = XA? XA[k] : 0;
|
||||
q->flag |= extra_flag; // flag secondary
|
||||
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
|
||||
if (k && p->secondary < 0) // if supplementary
|
||||
|
|
@ -966,10 +970,14 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
|
|||
} else {
|
||||
for (k = 0; k < aa.n; ++k)
|
||||
mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP);
|
||||
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
|
||||
for (k = 0; k < aa.n; ++k) {
|
||||
free(aa.a[k].cigar);
|
||||
free(aa.a[k].XA);
|
||||
}
|
||||
free(aa.a);
|
||||
}
|
||||
s->sam = str.s;
|
||||
if (XA) free(XA);
|
||||
}
|
||||
|
||||
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||
|
|
|
|||
2
bwamem.h
2
bwamem.h
|
|
@ -47,6 +47,7 @@ typedef struct {
|
|||
int mapQ_coef_fac;
|
||||
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
|
||||
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
|
||||
int max_hits; // if there are max_hits or fewer, output them all
|
||||
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
|
||||
} mem_opt_t;
|
||||
|
||||
|
|
@ -82,6 +83,7 @@ typedef struct { // This struct is only used for the convenience of API.
|
|||
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
|
||||
int n_cigar; // number of CIGAR operations
|
||||
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
|
||||
char *XA; // alternative mappings
|
||||
|
||||
int score, sub;
|
||||
} mem_aln_t;
|
||||
|
|
|
|||
|
|
@ -100,9 +100,48 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
|
|||
kputw(bns->anns[rid].len, &str); kputc('\t', &str);
|
||||
kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str);
|
||||
ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb));
|
||||
kputc('\t', &str); kputw(p->n_comp, &str);
|
||||
kputc('\n', &str);
|
||||
}
|
||||
s->sam = str.s;
|
||||
}
|
||||
|
||||
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
|
||||
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
|
||||
{
|
||||
int i, k, *cnt, tot;
|
||||
kstring_t *aln = 0;
|
||||
char **XA = 0;
|
||||
|
||||
cnt = calloc(a->n, sizeof(int));
|
||||
for (i = 0, tot = 0; i < a->n; ++i) {
|
||||
int j = a->a[i].secondary;
|
||||
if (j >= 0 && a->a[i].score >= a->a[j].score * opt->drop_ratio)
|
||||
++cnt[j], ++tot;
|
||||
}
|
||||
if (tot == 0) goto end_gen_alt;
|
||||
aln = calloc(a->n, sizeof(kstring_t));
|
||||
for (i = 0; i < a->n; ++i) {
|
||||
mem_aln_t t;
|
||||
int j = a->a[i].secondary;
|
||||
if (j < 0 || a->a[i].score < a->a[j].score * opt->drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
|
||||
if (cnt[j] > opt->max_hits) continue;
|
||||
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
|
||||
kputs(bns->anns[t.rid].name, &aln[j]);
|
||||
kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]);
|
||||
kputc(',', &aln[j]);
|
||||
for (k = 0; k < t.n_cigar; ++k) {
|
||||
kputw(t.cigar[k]>>4, &aln[j]);
|
||||
kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]);
|
||||
}
|
||||
kputc(',', &aln[j]); kputw(t.NM, &aln[j]);
|
||||
kputc(';', &aln[j]);
|
||||
free(t.cigar);
|
||||
}
|
||||
XA = calloc(a->n, sizeof(char*));
|
||||
for (k = 0; k < a->n; ++k)
|
||||
XA[k] = aln[k].s;
|
||||
|
||||
end_gen_alt:
|
||||
free(cnt); free(aln);
|
||||
return XA;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -313,7 +313,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
|||
mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0;
|
||||
mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s;
|
||||
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
|
||||
free(h[0].cigar); free(h[1].cigar);
|
||||
free(h[0].cigar); free(h[0].XA);
|
||||
free(h[1].cigar); free(h[1].XA);
|
||||
} else goto no_pairing;
|
||||
return n;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue