r738: output multi-map in the XA tag (SE only)

... PE support coming soon
This commit is contained in:
Heng Li 2014-04-30 16:46:05 -04:00
parent d59d78838c
commit c6c943f9d7
5 changed files with 58 additions and 8 deletions

View File

@ -67,6 +67,7 @@ mem_opt_t *mem_opt_init()
o->split_factor = 1.5;
o->chunk_size = 10000000;
o->n_threads = 1;
o->max_hits = 10;
o->max_matesw = 100;
o->mask_level_redun = 0.95;
o->min_chain_weight = 0;
@ -898,6 +899,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
}
}
}
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
kputc('\n', str);
}
@ -934,10 +936,14 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
{
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
kstring_t str;
kvec_t(mem_aln_t) aa;
int k;
char **XA = 0;
if (!(opt->flag & MEM_F_ALL))
XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
kv_init(aa);
str.l = str.m = 0; str.s = 0;
for (k = 0; k < a->n; ++k) {
@ -948,10 +954,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
q = kv_pushp(mem_aln_t, aa);
*q = mem_reg2aln2(opt, bns, pac, s->l_seq, s->seq, p, s->name);
if (q->rid < 0) {
--aa.n;
continue;
}
assert(q->rid >= 0); // this should not happen with the new code
q->XA = XA? XA[k] : 0;
q->flag |= extra_flag; // flag secondary
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
if (k && p->secondary < 0) // if supplementary
@ -966,10 +970,14 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa
} else {
for (k = 0; k < aa.n; ++k)
mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP);
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
for (k = 0; k < aa.n; ++k) {
free(aa.a[k].cigar);
free(aa.a[k].XA);
}
free(aa.a);
}
s->sam = str.s;
if (XA) free(XA);
}
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)

View File

@ -47,6 +47,7 @@ typedef struct {
int mapQ_coef_fac;
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int max_hits; // if there are max_hits or fewer, output them all
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
} mem_opt_t;
@ -82,6 +83,7 @@ typedef struct { // This struct is only used for the convenience of API.
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
int n_cigar; // number of CIGAR operations
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
char *XA; // alternative mappings
int score, sub;
} mem_aln_t;

View File

@ -100,9 +100,48 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
kputw(bns->anns[rid].len, &str); kputc('\t', &str);
kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str);
ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb));
kputc('\t', &str); kputw(p->n_comp, &str);
kputc('\n', &str);
}
s->sam = str.s;
}
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
{
int i, k, *cnt, tot;
kstring_t *aln = 0;
char **XA = 0;
cnt = calloc(a->n, sizeof(int));
for (i = 0, tot = 0; i < a->n; ++i) {
int j = a->a[i].secondary;
if (j >= 0 && a->a[i].score >= a->a[j].score * opt->drop_ratio)
++cnt[j], ++tot;
}
if (tot == 0) goto end_gen_alt;
aln = calloc(a->n, sizeof(kstring_t));
for (i = 0; i < a->n; ++i) {
mem_aln_t t;
int j = a->a[i].secondary;
if (j < 0 || a->a[i].score < a->a[j].score * opt->drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
if (cnt[j] > opt->max_hits) continue;
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
kputs(bns->anns[t.rid].name, &aln[j]);
kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]);
kputc(',', &aln[j]);
for (k = 0; k < t.n_cigar; ++k) {
kputw(t.cigar[k]>>4, &aln[j]);
kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]);
}
kputc(',', &aln[j]); kputw(t.NM, &aln[j]);
kputc(';', &aln[j]);
free(t.cigar);
}
XA = calloc(a->n, sizeof(char*));
for (k = 0; k < a->n; ++k)
XA[k] = aln[k].s;
end_gen_alt:
free(cnt); free(aln);
return XA;
}

View File

@ -313,7 +313,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0;
mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s;
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar); free(h[1].cigar);
free(h[0].cigar); free(h[0].XA);
free(h[1].cigar); free(h[1].XA);
} else goto no_pairing;
return n;

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.8-r737-dirty"
#define PACKAGE_VERSION "0.7.8-r738-dirty"
#endif
int bwa_fa2pac(int argc, char *argv[]);