fast-bwa/bwamem.c

1020 lines
37 KiB
C
Raw Normal View History

2013-02-01 02:59:48 +08:00
#include <stdlib.h>
2013-02-01 04:55:22 +08:00
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
2013-02-08 02:29:01 +08:00
#ifdef HAVE_PTHREAD
#include <pthread.h>
#endif
2013-02-25 02:09:29 +08:00
2013-02-08 03:36:18 +08:00
#include "kstring.h"
2013-02-01 02:59:48 +08:00
#include "bwamem.h"
#include "bntseq.h"
#include "ksw.h"
#include "kvec.h"
2013-02-05 06:23:06 +08:00
#include "ksort.h"
#include "utils.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
2013-02-20 14:11:38 +08:00
/* Theory on probability and scoring *ungapped* alignment
*
* s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution
* s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate
*
* Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x
*
* If the matching score is x and mismatch penalty is -y, we can compute error rate e:
* e = .75 * exp[-log(4) * y/x]
*
* log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)}
* = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l)
*
* where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale:
* Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x)
*
*
* Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1)
* Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4)
2013-02-21 08:11:44 +08:00
*
* When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR)
2013-02-20 14:11:38 +08:00
*/
mem_opt_t *mem_opt_init()
2013-02-01 02:59:48 +08:00
{
mem_opt_t *o;
o = calloc(1, sizeof(mem_opt_t));
2013-02-19 05:33:06 +08:00
o->flag = 0;
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
o->T = 30;
o->zdrop = 100;
o->pen_unpaired = 17;
o->pen_clip5 = o->pen_clip3 = 5;
o->min_seed_len = 19;
o->split_width = 10;
o->max_occ = 10000;
2013-02-01 04:55:22 +08:00
o->max_chain_gap = 10000;
2013-02-11 23:59:38 +08:00
o->max_ins = 10000;
2013-02-05 13:17:20 +08:00
o->mask_level = 0.50;
2013-02-05 13:41:07 +08:00
o->chain_drop_ratio = 0.50;
2013-02-22 01:52:00 +08:00
o->split_factor = 1.5;
2013-02-08 02:13:43 +08:00
o->chunk_size = 10000000;
o->n_threads = 1;
2013-02-26 00:56:02 +08:00
o->max_matesw = 100;
o->mask_level_redun = 0.95;
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
// o->mapQ_coef_len = o->mapQ_coef_fac = 0;
2013-03-05 22:38:12 +08:00
bwa_fill_scmat(o->a, o->b, o->mat);
2013-02-01 02:59:48 +08:00
return o;
}
/***************************
* SMEM iterator interface *
***************************/
2013-02-02 03:38:44 +08:00
// State for the SMEM (supermaximal exact match) iterator: walks one query
// sequence left to right, returning batches of FM-index match intervals.
struct __smem_i {
	const bwt_t *bwt;      // FM-index to search (not owned)
	const uint8_t *query;  // 2-bit encoded query; values >3 are ambiguous bases
	int start, len;        // next query position to process; total query length
	bwtintv_v *matches;    // matches; to be returned by smem_next()
	bwtintv_v *sub;        // sub-matches inside the longest match; temporary
	bwtintv_v *tmpvec[2];  // temporary arrays used by bwt_smem1() and merging
};
2013-02-01 02:59:48 +08:00
smem_i *smem_itr_init(const bwt_t *bwt)
{
smem_i *itr;
itr = calloc(1, sizeof(smem_i));
2013-02-01 02:59:48 +08:00
itr->bwt = bwt;
itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
itr->matches = calloc(1, sizeof(bwtintv_v));
itr->sub = calloc(1, sizeof(bwtintv_v));
2013-02-01 02:59:48 +08:00
return itr;
}
// Free an iterator created by smem_itr_init(), including every interval
// vector and its backing array.
void smem_itr_destroy(smem_i *itr)
{
	bwtintv_v *vecs[4];
	int i;
	vecs[0] = itr->tmpvec[0];
	vecs[1] = itr->tmpvec[1];
	vecs[2] = itr->matches;
	vecs[3] = itr->sub;
	for (i = 0; i < 4; ++i) {
		free(vecs[i]->a);
		free(vecs[i]);
	}
	free(itr);
}
2013-02-02 03:20:38 +08:00
// Point the iterator at a new query sequence and rewind it to position 0.
// The query buffer is borrowed, not copied.
void smem_set_query(smem_i *itr, int len, const uint8_t *query)
{
	itr->len = len;
	itr->query = query;
	itr->start = 0;
}
// Return the next batch of SMEMs at or after itr->start and advance the
// iterator; returns NULL once the query is exhausted. Each bwtintv_t packs the
// query interval in .info (begin in the high 32 bits, end in the low 32) and
// the occurrence count in .x[2]. If the longest SMEM in the batch is at least
// split_len long and occurs at most split_width times, a second seeding pass
// is started from its middle and the resulting sub-matches are merged in, so
// that long unique matches can contribute shorter internal seeds as well.
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
2013-02-01 02:59:48 +08:00
{
	int i, max, max_i, ori_start;
2013-02-02 03:38:44 +08:00
	itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; // reuse the vectors; keep their capacity
	if (itr->start >= itr->len || itr->start < 0) return 0;
2013-02-01 02:59:48 +08:00
	while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
2013-02-02 03:38:44 +08:00
	if (itr->start == itr->len) return 0;
	ori_start = itr->start;
	itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
2013-02-01 02:59:48 +08:00
	if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
	for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
2013-02-02 03:20:38 +08:00
		bwtintv_t *p = &itr->matches->a[i];
		int len = (uint32_t)p->info - (p->info>>32); // query-interval length: end minus begin
		if (max < len) max = len, max_i = i;
	}
	if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long
2013-02-02 03:20:38 +08:00
		int j;
2013-02-02 03:38:44 +08:00
		bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging
2013-02-02 03:20:38 +08:00
		bwtintv_t *p = &itr->matches->a[max_i];
		bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM
2013-02-02 03:20:38 +08:00
		i = j = 0; a->n = 0;
		// Merge itr->matches and itr->sub ordered by (query begin, query end);
		// the sort key replaces the low 32 bits with len-end so longer matches
		// sharing a begin position come first.
		while (i < itr->matches->n && j < itr->sub->n) { // ordered merge
2013-02-05 05:48:11 +08:00
			int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info);
2013-02-05 13:41:07 +08:00
			int64_t xj = itr->sub->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->sub->a[j].info);
2013-02-05 05:48:11 +08:00
			if (xi < xj) {
2013-02-02 03:20:38 +08:00
				kv_push(bwtintv_t, *a, itr->matches->a[i]);
				++i;
			} else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) {
2013-02-02 03:20:38 +08:00
				// keep a sub-match only if it is at least half as long as the
				// longest SMEM and ends beyond the original start position
				kv_push(bwtintv_t, *a, itr->sub->a[j]);
				++j;
			} else ++j;
2013-02-02 03:20:38 +08:00
		}
		for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]);
		for (; j < itr->sub->n; ++j) // drain remaining sub-matches under the same length/position filter
			if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start)
				kv_push(bwtintv_t, *a, itr->sub->a[j]);
2013-02-02 03:20:38 +08:00
		kv_copy(bwtintv_t, *itr->matches, *a);
	}
2013-02-02 03:38:44 +08:00
	return itr->matches;
2013-02-01 02:59:48 +08:00
}
2013-02-05 06:23:06 +08:00
/********************************
* Chaining while finding SMEMs *
********************************/
// One exact-match seed placed on the concatenated forward-reverse reference.
typedef struct {
	int64_t rbeg;    // seed start on the reference (forward-reverse coordinate)
	int32_t qbeg, len; // seed start on the query; seed length
} mem_seed_t;
// A co-linear group of seeds; kept in a B-tree keyed on pos during seeding.
typedef struct {
	int n, m;        // number of seeds used; seeds allocated
	int64_t pos;     // B-tree key: reference coordinate of the founding seed
	mem_seed_t *seeds; // heap-allocated seed array (owned by the chain)
} mem_chain_t;
// kvec-style growable array of chains.
typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
2013-02-01 04:55:22 +08:00
#include "kbtree.h"
#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
2013-02-08 02:13:43 +08:00
KBTREE_INIT(chn, mem_chain_t, chain_cmp)
2013-02-01 04:55:22 +08:00
// Try to absorb seed *p into chain *c. Returns 1 when the seed is contained in
// or appended to the chain, 0 when the caller should open a new chain instead.
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p)
{
	const mem_seed_t *first = &c->seeds[0], *tail = &c->seeds[c->n-1];
	int64_t q_dist, r_dist;
	// A seed that lies entirely inside the chain's span adds nothing.
	if (p->qbeg >= first->qbeg && p->qbeg + p->len <= tail->qbeg + tail->len
		&& p->rbeg >= first->rbeg && p->rbeg + p->len <= tail->rbeg + tail->len)
		return 1;
	// Never merge a seed on the reverse half of the reference (rbeg >= l_pac)
	// into a chain whose seeds sit on the forward half.
	if (p->rbeg >= l_pac && (tail->rbeg < l_pac || first->rbeg < l_pac))
		return 0;
	q_dist = p->qbeg - tail->qbeg; // always non-negative: seeds arrive in query order
	r_dist = p->rbeg - tail->rbeg;
	// Accept when the query/reference offsets agree within the band width and
	// the gap past the last seed stays below the chaining limit.
	if (r_dist >= 0
		&& q_dist - r_dist <= opt->w && r_dist - q_dist <= opt->w
		&& q_dist - tail->len < opt->max_chain_gap
		&& r_dist - tail->len < opt->max_chain_gap)
	{
		if (c->n == c->m) { // grow the seed array by doubling
			c->m <<= 1;
			c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t));
		}
		c->seeds[c->n++] = *p;
		return 1;
	}
	return 0; // request to add a new chain
}
// Consume every SMEM batch from the iterator, locate each occurrence on the
// reference with bwt_sa(), and either merge the resulting seed into the
// nearest existing chain in the B-tree or insert it as a new chain.
static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *tree, smem_i *itr)
2013-02-01 04:55:22 +08:00
{
2013-02-02 03:38:44 +08:00
	const bwtintv_v *a;
2013-02-22 01:52:00 +08:00
	// SMEMs longer than split_len (and rare enough) get re-seeded from their
	// middle inside smem_next(); cap split_len at the query length.
	int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
2013-02-26 06:29:35 +08:00
	split_len = split_len < itr->len? split_len : itr->len;
2013-02-22 01:52:00 +08:00
	while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM
2013-02-01 04:55:22 +08:00
		int i;
2013-02-02 03:38:44 +08:00
		for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
			bwtintv_t *p = &a->a[i];
2013-02-01 04:55:22 +08:00
			int slen = (uint32_t)p->info - (p->info>>32); // seed length
			int64_t k;
2013-02-02 03:38:44 +08:00
			if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
2013-02-01 04:55:22 +08:00
			for (k = 0; k < p->x[2]; ++k) { // one seed per occurrence in the suffix-array interval
2013-02-08 02:13:43 +08:00
				mem_chain_t tmp, *lower, *upper;
				mem_seed_t s;
2013-02-01 04:55:22 +08:00
				int to_add = 0;
2013-02-02 03:38:44 +08:00
				s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
				s.qbeg = p->info>>32;
				s.len = slen;
				if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip
2013-02-01 04:55:22 +08:00
				if (kb_size(tree)) {
2013-02-02 03:38:44 +08:00
					kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
					if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1;
				} else to_add = 1;
2013-02-02 03:38:44 +08:00
				if (to_add) { // add the seed as a new chain
2013-02-01 04:55:22 +08:00
					tmp.n = 1; tmp.m = 4;
					tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
					tmp.seeds[0] = s;
2013-02-01 04:55:22 +08:00
					kb_putp(chn, tree, &tmp); // tree copies tmp; seeds array ownership moves to the tree node
				}
			}
		}
	}
}
2013-02-08 05:27:11 +08:00
// Debug helper: print every chain and its seeds to stderr (via err_printf),
// converting forward-reverse coordinates back to per-contig positions.
void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
{
	int i, j;
	for (i = 0; i < chn->n; ++i) {
		mem_chain_t *p = &chn->a[i];
		err_printf("CHAIN(%d) n=%d", i, p->n);
2013-02-08 05:27:11 +08:00
		for (j = 0; j < p->n; ++j) {
			bwtint_t pos;
			int is_rev, ref_id;
			pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); // map to forward-strand coordinate
			if (is_rev) pos -= p->seeds[j].len - 1; // shift to the leftmost base of the seed
			bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); // side effect used: fills ref_id
			err_printf("\t%d,%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
2013-02-08 05:27:11 +08:00
		}
		err_putchar('\n');
2013-02-08 05:27:11 +08:00
	}
}
// Seed-and-chain one query: run the SMEM iterator over seq, group seeds into
// chains via the B-tree, then flatten the tree into a mem_chain_v returned by
// value. The caller owns the returned array and each chain's seeds.
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int len, const uint8_t *seq)
2013-02-01 04:55:22 +08:00
{
2013-02-08 02:13:43 +08:00
	mem_chain_v chain;
2013-02-01 04:55:22 +08:00
	smem_i *itr;
	kbtree_t(chn) *tree;
2013-02-08 02:13:43 +08:00
	kv_init(chain);
2013-02-01 04:55:22 +08:00
	if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
	tree = kb_init(chn, KB_DEFAULT_SIZE);
	itr = smem_itr_init(bwt);
2013-02-02 03:20:38 +08:00
	smem_set_query(itr, len, seq);
	mem_insert_seed(opt, l_pac, tree, itr);
2013-02-08 02:13:43 +08:00
	kv_resize(mem_chain_t, chain, kb_size(tree)); // one slot per tree node
2013-02-08 02:13:43 +08:00
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
	__kb_traverse(mem_chain_t, tree, traverse_func); // in-order copy of all chains
#undef traverse_func
2013-02-01 04:55:22 +08:00
	smem_itr_destroy(itr);
	kb_destroy(chn, tree); // frees the nodes only; seed arrays now belong to `chain`
	return chain;
}
2013-02-05 05:48:11 +08:00
/********************
* Filtering chains *
********************/
2013-02-05 06:23:06 +08:00
// Per-chain record used only inside mem_chain_flt(): query span [beg,end),
// chain weight w, the chain itself (p) and its first shadowed chain (p2).
typedef struct {
	int beg, end, w;
	void *p, *p2;
} flt_aux_t;
// Sort by descending weight so the best chain comes first.
#define flt_lt(a, b) ((a).w > (b).w)
KSORT_INIT(mem_flt, flt_aux_t, flt_lt)
2013-02-08 02:13:43 +08:00
// Drop chains that substantially overlap a higher-weight chain on the query,
// reorder the survivors best-first, free the seed arrays of discarded chains,
// and return the number of chains kept (the array is compacted in place).
//
// A chain's weight is min(non-redundant seed coverage on the query,
// non-redundant seed coverage on the reference).
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
{
	flt_aux_t *a;
	int i, j, n;
	if (n_chn <= 1) return n_chn; // no need to filter
	a = malloc(sizeof(flt_aux_t) * n_chn);
	for (i = 0; i < n_chn; ++i) { // compute the weight of each chain
		mem_chain_t *c = &chains[i];
		int64_t end;
		int w = 0, tmp;
		// pass 1: seed coverage on the query (seeds are sorted by qbeg)
		for (j = 0, end = 0; j < c->n; ++j) {
			const mem_seed_t *s = &c->seeds[j];
			if (s->qbeg >= end) w += s->len;
			else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end;
			end = end > s->qbeg + s->len? end : s->qbeg + s->len;
		}
		// BUGFIX: reset the accumulator; previously `w` kept the query
		// coverage, making min(w, tmp) below always equal tmp and turning
		// the reference pass into dead code.
		tmp = w; w = 0;
		// pass 2: seed coverage on the reference
		for (j = 0, end = 0; j < c->n; ++j) {
			const mem_seed_t *s = &c->seeds[j];
			if (s->rbeg >= end) w += s->len;
			else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
			// BUGFIX: advance `end` in reference coordinates (was s->qbeg)
			end = end > s->rbeg + s->len? end : s->rbeg + s->len;
		}
		w = w < tmp? w : tmp; // weight = min(query coverage, reference coverage)
		a[i].beg = c->seeds[0].qbeg;
		a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len;
		a[i].w = w; a[i].p = c; a[i].p2 = 0;
	}
	ks_introsort(mem_flt, n_chn, a); // best (largest w) first
	{ // reorder chains such that the best chain appears first
		mem_chain_t *swap;
		swap = malloc(sizeof(mem_chain_t) * n_chn);
		for (i = 0; i < n_chn; ++i) {
			swap[i] = *((mem_chain_t*)a[i].p);
			a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
		}
		memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
		free(swap);
	}
	for (i = 1, n = 1; i < n_chn; ++i) { // keep chain i unless it is shadowed by a better kept chain
		for (j = 0; j < n; ++j) {
			int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg;
			int e_min = a[j].end < a[i].end? a[j].end : a[i].end;
			if (e_min > b_max) { // have overlap
				int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg;
				if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
					if (a[j].p2 == 0) a[j].p2 = a[i].p; // remember the first shadowed chain
					if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
						break; // clearly worse: drop chain i
				}
			}
		}
		if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
	}
	for (i = 0; i < n; ++i) { // mark chains to be kept by negating n (sign is the keep flag)
		mem_chain_t *c = (mem_chain_t*)a[i].p;
		if (c->n > 0) c->n = -c->n;
		c = (mem_chain_t*)a[i].p2; // also keep the first chain each kept chain shadows
		if (c && c->n > 0) c->n = -c->n;
	}
	free(a);
	for (i = 0; i < n_chn; ++i) { // free discarded chains
		mem_chain_t *c = &chains[i];
		if (c->n >= 0) { // not marked: discard
			free(c->seeds);
			c->n = c->m = 0;
		} else c->n = -c->n; // marked: restore the positive count
	}
	for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains
		if (chains[i].n > 0) {
			if (n != i) chains[n++] = chains[i];
			else ++n;
		}
	}
	return n;
}
2013-02-11 01:24:33 +08:00
/******************************
* De-overlap single-end hits *
******************************/
2013-02-07 03:38:40 +08:00
// Sort alignment regions by reference end (used by the de-overlap sweep).
#define alnreg_slt2(a, b) ((a).re < (b).re)
KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2)
// Sort by descending score, tie-broken by (rb, qb), so the best hit is first
// and exact duplicates become adjacent.
#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
// Remove redundant and duplicate hits in place. First sweep (sorted by
// reference end) marks the lower-scoring hit of any pair whose overlap exceeds
// mask_level_redun on both query and reference; a hit is "marked" by setting
// qe = qb. Survivors are then sorted best-first and exact duplicates (same
// score, rb and qb) are dropped. Returns the new number of hits.
int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun)
2013-02-11 01:24:33 +08:00
{
	int m, i, j;
2013-02-07 03:38:40 +08:00
	if (n <= 1) return n;
	ks_introsort(mem_ars2, n, a); // sort by reference end
	for (i = 1; i < n; ++i) {
		mem_alnreg_t *p = &a[i];
		if (p->rb >= a[i-1].re) continue; // no overlap with anything before it
		for (j = i - 1; j >= 0 && p->rb < a[j].re; --j) { // scan all earlier hits that overlap on the reference
			mem_alnreg_t *q = &a[j];
			int64_t or, oq, mr, mq;
			if (q->qe == q->qb) continue; // a[j] has been excluded
			or = q->re - p->rb; // overlap length on the reference
			oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
			mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment
			mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment
			if (or > mask_level_redun * mr && oq > mask_level_redun * mq) { // one of the hits is redundant
				if (p->score < q->score) {
					p->qe = p->qb; // exclude the current hit
					break;
				} else q->qe = q->qb; // exclude the earlier hit
			}
		}
	}
	for (i = 0, m = 0; i < n; ++i) // compact: drop excluded hits
		if (a[i].qe > a[i].qb) {
			if (m != i) a[m++] = a[i];
			else ++m;
		}
	n = m;
2013-02-11 01:24:33 +08:00
	ks_introsort(mem_ars, n, a); // best score first; duplicates adjacent
2013-02-09 03:18:39 +08:00
	for (i = 1; i < n; ++i) { // mark identical hits
		if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb)
2013-02-09 06:55:35 +08:00
			a[i].qe = a[i].qb;
2013-02-09 03:18:39 +08:00
	}
	for (i = 1, m = 1; i < n; ++i) // exclude identical hits
2013-02-21 09:26:57 +08:00
		if (a[i].qe > a[i].qb) {
			if (m != i) a[m++] = a[i];
			else ++m;
		}
2013-02-11 01:24:33 +08:00
	return m;
}
// Decide which hits are primary and which are secondary. `a` must already be
// sorted best-first (by mem_sort_and_dedup). `z` collects the indices of
// primary hits; each later hit that significantly overlaps a primary on the
// query is marked secondary to it (a[i].secondary = index of its primary),
// and the primary records the best sub-optimal score in .sub.
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
2013-02-11 01:24:33 +08:00
{ // similar to the loop in mem_chain_flt()
	int i, k, tmp;
	kvec_t(int) z; // indices of primary hits found so far
	if (n == 0) return;
	kv_init(z);
	for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1;
2013-02-09 06:55:35 +08:00
	// score margin below which an overlapping hit counts toward sub_n
	tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
	kv_push(int, z, 0); // the best hit is always primary
	for (i = 1; i < n; ++i) {
		for (k = 0; k < z.n; ++k) { // compare against every primary found so far
			int j = z.a[k];
2013-02-07 03:38:40 +08:00
			int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
			int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
			if (e_min > b_max) { // have overlap
				int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
				if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
					if (a[j].sub == 0) a[j].sub = a[i].score; // best sub-optimal score of the primary
2013-02-09 06:55:35 +08:00
					if (a[j].score - a[i].score <= tmp) ++a[j].sub_n;
2013-02-07 03:38:40 +08:00
					break;
				}
			}
		}
		if (k == z.n) kv_push(int, z, i); // overlaps no primary: itself primary
		else a[i].secondary = z.a[k]; // shadowed by primary z.a[k]
2013-02-07 03:38:40 +08:00
	}
	free(z.a);
2013-02-07 03:38:40 +08:00
}
2013-02-05 05:48:11 +08:00
/****************************************
* Construct the alignment from a chain *
****************************************/
/* mem_chain2aln() vs mem_chain2aln_short()
*
* mem_chain2aln() covers all the functionality of mem_chain2aln_short().
* However, it may waste time on extracting the reference sequences given a
* very long query. mem_chain2aln_short() is faster for very short chains in a
* long query. It may fail when the matches are long or reach the end of the
* query. In this case, mem_chain2aln() will be called again.
* mem_chain2aln_short() is almost never used for short-read alignment.
*/
// Fast path limits: flanking extension, maximum chain span, band retries.
#define MEM_SHORT_EXT 50
#define MEM_SHORT_LEN 200
#define MAX_BAND_TRY 2
// Fast path for short chains well inside a long query: align the chain's span
// (plus MEM_SHORT_EXT flanks) with one ksw_align() call instead of two
// directional extensions. On success the hit is appended to *av and 0 is
// returned. Returns 1 when this shortcut does not apply (chain too long, too
// close to a query end, or the local alignment touches the extracted window's
// edges) -- the caller should fall back to mem_chain2aln(). Returns -1 for an
// empty chain.
int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
{
	int i, qb, qe, xtra;
	int64_t rb, re, rlen;
	uint8_t *rseq = 0;
	mem_alnreg_t a;
	kswr_t x;
	if (c->n == 0) return -1;
	// compute the query/reference bounding box of all seeds in the chain
	qb = l_query; qe = 0;
	rb = l_pac<<1; re = 0;
	memset(&a, 0, sizeof(mem_alnreg_t));
	for (i = 0; i < c->n; ++i) {
		const mem_seed_t *s = &c->seeds[i];
		qb = qb < s->qbeg? qb : s->qbeg;
		qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len;
		rb = rb < s->rbeg? rb : s->rbeg;
		re = re > s->rbeg + s->len? re : s->rbeg + s->len;
		a.seedcov += s->len;
	}
	qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT;
	if (qb <= 10 || qe >= l_query - 10) return 1; // because ksw_align() does not support end-to-end alignment
	rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT;
	rb = rb > 0? rb : 0;
	re = re < l_pac<<1? re : l_pac<<1;
	if (rb < l_pac && l_pac < re) { // clamp the window to one strand of the reference
		if (c->seeds[0].rbeg < l_pac) re = l_pac;
		else rb = l_pac;
	}
	// bail out when the window is too unbalanced or too large for the fast path
	if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1;
	if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1;
	if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1;
	rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
	assert(rlen == re - rb);
	xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
	x = ksw_align(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->q, opt->r, xtra, 0);
	free(rseq);
	if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; // alignment reaches the window edge: unreliable
	a.rb = rb + x.tb; a.re = rb + x.te + 1;
	a.qb = qb + x.qb; a.qe = qb + x.qe + 1;
	a.score = x.score;
	a.csub = x.score2; // second-best score within the window
	kv_push(mem_alnreg_t, *av, a);
	if (bwa_verbose >= 4) printf("chain2aln(short): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re);
	return 0;
}
// Longest gap whose penalty a qlen-base match could still pay for under the
// current scoring, capped at twice the band width and floored at 1.
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
{
	int gap = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
	int cap = opt->w << 1;
	if (gap < 1) gap = 1;
	return gap < cap? gap : cap;
}
// Turn one chain into one or more gapped alignments, appended to *av. For each
// seed (longest first) that is not already covered by a previous extension,
// run banded left and right extensions with ksw_extend(), retrying with a
// doubled band when the optimum hits the band edge, and record the resulting
// region, score and seed coverage.
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
{
2013-03-05 06:29:07 +08:00
	int i, k, max_off[2], aw[2]; // aw: actual bandwidth used in extension
	int64_t rlen, rmax[2], tmp, max = 0;
2013-02-05 05:08:00 +08:00
	const mem_seed_t *s;
	uint8_t *rseq = 0;
	uint64_t *srt;
2013-02-05 04:02:56 +08:00
	if (c->n == 0) return;
	// get the max possible span
2013-02-05 05:08:00 +08:00
	rmax[0] = l_pac<<1; rmax[1] = 0;
	for (i = 0; i < c->n; ++i) { // widest reference window any extension could reach
		int64_t b, e;
		const mem_seed_t *t = &c->seeds[i];
		b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg));
		e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len));
		rmax[0] = rmax[0] < b? rmax[0] : b;
		rmax[1] = rmax[1] > e? rmax[1] : e;
		if (t->len > max) max = t->len;
2013-02-05 05:08:00 +08:00
	}
	rmax[0] = rmax[0] > 0? rmax[0] : 0;
	rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
	if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
		if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand
		else rmax[0] = l_pac;
	}
	// retrieve the reference sequence
	rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
	assert(rlen == rmax[1] - rmax[0]);
	// sort seed indices by length so the longest seed is extended first
	srt = malloc(c->n * 8);
	for (i = 0; i < c->n; ++i)
		srt[i] = (uint64_t)c->seeds[i].len<<32 | i;
	ks_introsort_64(c->n, srt);
	for (k = c->n - 1; k >= 0; --k) {
		mem_alnreg_t *a;
		s = &c->seeds[(uint32_t)srt[k]];
		for (i = 0; i < av->n; ++i) { // test whether extension has been made before
			mem_alnreg_t *p = &av->a[i];
			int64_t rd;
2013-02-27 13:29:11 +08:00
			int qd, w, max_gap;
			if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained
			// qd: distance ahead of the seed on query; rd: on reference
			qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
			max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
			w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width
			if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
2013-02-27 13:29:11 +08:00
			// similar to the previous four lines, but this time we look at the region behind
			qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
			max_gap = cal_max_gap(opt, qd < rd? qd : rd);
			w = max_gap < opt->w? max_gap : opt->w;
			if (qd - rd < w && rd - qd < w) break;
		}
		if (i < av->n) { // the seed is (almost) contained in an existing alignment
			for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain
				const mem_seed_t *t;
				if (srt[i] == 0) continue;
				t = &c->seeds[(uint32_t)srt[i]];
				if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping
				if (s->qbeg <= t->qbeg && s->qbeg + s->len - t->qbeg >= s->len>>2 && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break;
				if (t->qbeg <= s->qbeg && t->qbeg + t->len - s->qbeg >= s->len>>2 && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break;
			}
			if (i == c->n) { // no overlapping seeds; then skip extension
				srt[k] = 0; // mark that seed extension has not been performed
				continue;
			}
		}
		a = kv_pushp(mem_alnreg_t, *av);
2013-02-08 10:20:36 +08:00
		memset(a, 0, sizeof(mem_alnreg_t));
2013-03-05 06:29:07 +08:00
		a->w = aw[0] = aw[1] = opt->w;
		a->score = a->truesc = -1;
		if (bwa_verbose >= 4) err_printf("Extending from seed [%ld,%ld,%ld]\n", (long)s->len, (long)s->qbeg, (long)s->rbeg);
2013-02-08 10:20:36 +08:00
		if (s->qbeg) { // left extension
			uint8_t *rs, *qs;
			int qle, tle, gtle, gscore;
			// reverse the query prefix and the reference window so the
			// leftward extension can run as a forward ksw_extend()
			qs = malloc(s->qbeg);
2013-02-08 10:20:36 +08:00
			for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
			tmp = s->rbeg - rmax[0];
			rs = malloc(tmp);
2013-02-08 10:20:36 +08:00
			for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
			for (i = 0; i < MAX_BAND_TRY; ++i) { // retry with a doubled band if the optimum sits on the band edge
				int prev = a->score;
				aw[0] = opt->w << i;
				a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
				if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
				if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
2013-03-05 06:29:07 +08:00
			}
			// check whether we prefer to reach the end of the query
			if (gscore <= 0 || gscore <= a->score - opt->pen_clip5) { // local extension
				a->qb = s->qbeg - qle, a->rb = s->rbeg - tle;
				a->truesc = a->score;
			} else { // to-end extension
				a->qb = 0, a->rb = s->rbeg - gtle;
				a->truesc = gscore;
			}
2013-02-08 10:20:36 +08:00
			free(qs); free(rs);
		} else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
2013-02-08 10:20:36 +08:00
		if (s->qbeg + s->len != l_query) { // right extension
			int qle, tle, qe, re, gtle, gscore, sc0 = a->score;
2013-02-08 10:20:36 +08:00
			qe = s->qbeg + s->len;
			re = s->rbeg + s->len - rmax[0];
			assert(re >= 0);
			for (i = 0; i < MAX_BAND_TRY; ++i) { // same band-doubling retry as on the left
				int prev = a->score;
				aw[1] = opt->w << i;
				a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
				if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
				if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
2013-03-05 06:29:07 +08:00
			}
			// similar to the above
			if (gscore <= 0 || gscore <= a->score - opt->pen_clip3) { // local extension
				a->qe = qe + qle, a->re = rmax[0] + re + tle;
				a->truesc += a->score - sc0;
			} else { // to-end extension
				a->qe = l_query, a->re = rmax[0] + re + gtle;
				a->truesc += gscore - sc0;
			}
2013-02-08 10:20:36 +08:00
		} else a->qe = l_query, a->re = s->rbeg + s->len;
2013-03-05 06:29:07 +08:00
		if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); }
		// compute seedcov
		for (i = 0, a->seedcov = 0; i < c->n; ++i) {
			const mem_seed_t *t = &c->seeds[i];
			if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
				a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
		}
2013-03-05 06:29:07 +08:00
		a->w = aw[0] > aw[1]? aw[0] : aw[1];
	}
	free(srt); free(rseq);
}
2013-02-06 10:49:19 +08:00
/*****************************
* Basic hit->SAM conversion *
*****************************/
2013-02-28 12:59:50 +08:00
// Infer the band width needed to reproduce an alignment of score `score`
// between sequences of lengths l1 and l2 under match score a, gap open q and
// gap extension r. Returns 0 when equal lengths and a near-perfect score rule
// out even two one-base gaps.
static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
{
	int shorter = l1 < l2? l1 : l2;
	int len_diff = l1 < l2? l2 - l1 : l1 - l2;
	int w;
	if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps
	// gaps the score deficit could pay for, rounded up
	w = (int)((double)(shorter * a - score - q) / r + 1.);
	// the band must at least bridge the length difference
	return w < len_diff? len_diff : w;
}
// Number of reference bases consumed by a CIGAR: sum the lengths of the
// M (op 0) and D (op 2) operations. Each cigar word packs length<<4 | op.
static inline int get_rlen(int n_cigar, const uint32_t *cigar)
{
	int len = 0, k;
	for (k = 0; k < n_cigar; ++k) {
		int op = cigar[k] & 0xf;
		int oplen = cigar[k] >> 4;
		if (op == 0 || op == 2) len += oplen;
	}
	return len;
}
2013-03-12 11:01:51 +08:00
// Append one SAM line for alignment list[which] of read s to *str. `list`
// holds all n hits of the read (used for the SA tag); m_ is the mate's
// alignment or NULL for single-end. The alignment is copied so flag edits and
// mate back-filling do not touch the caller's data.
void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_)
{
2013-03-12 11:01:51 +08:00
	int i;
	mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert
2013-03-12 11:01:51 +08:00
	if (m_) mtmp = *m_, m = &mtmp;
	// set flag
	p->flag |= m? 0x1 : 0; // is paired in sequencing
2013-03-12 11:01:51 +08:00
	p->flag |= p->rid < 0? 0x4 : 0; // 0x4: segment unmapped (rid < 0 means no hit)
	p->flag |= m && m->rid < 0? 0x8 : 0; // 0x8: mate unmapped
	if (p->rid < 0 && m && m->rid >= 0) // copy mate to alignment
		p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0;
	if (m && m->rid < 0 && p->rid >= 0) // copy alignment to mate
		m->rid = p->rid, m->pos = p->pos, m->is_rev = p->is_rev, m->n_cigar = 0;
	p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand
	p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand
	// print up to CIGAR
	kputs(s->name, str); kputc('\t', str); // QNAME
2013-03-12 09:55:52 +08:00
	kputw((p->flag&0xffff) | (p->flag&0x10000? 0x100 : 0), str); kputc('\t', str); // FLAG; internal bit 0x10000 maps to SAM 0x100 (secondary)
	if (p->rid >= 0) { // with coordinate
		kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
		kputl(p->pos + 1, str); kputc('\t', str); // POS (SAM is 1-based)
		kputw(p->mapq, str); kputc('\t', str); // MAPQ
		if (p->n_cigar) { // aligned
			for (i = 0; i < p->n_cigar; ++i) {
				int c = p->cigar[i]&0xf;
				if (c == 3 || c == 4) c = which? 4 : 3; // use hard clipping for supplementary alignments
				kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str);
			}
		} else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true)
	} else kputsn("*\t0\t0\t*", 7, str); // without coordinte
	kputc('\t', str);
	// print the mate position if applicable
2013-03-12 11:01:51 +08:00
	if (m && m->rid >= 0) {
		if (p->rid == m->rid) kputc('=', str); // RNEXT: '=' when on the same contig
		else kputs(bns->anns[m->rid].name, str);
2013-02-23 05:38:48 +08:00
		kputc('\t', str);
2013-03-12 11:01:51 +08:00
		kputl(m->pos + 1, str); kputc('\t', str); // PNEXT
		if (p->rid == m->rid) { // TLEN: signed leftmost-to-rightmost distance
			int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) - 1 : 0);
			int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) - 1 : 0);
			if (m->n_cigar == 0 || p->n_cigar == 0) kputc('0', str);
			else kputl(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str);
		} else kputc('0', str);
2013-03-12 11:01:51 +08:00
	} else kputsn("*\t0\t0", 5, str);
	kputc('\t', str);
	// print SEQ and QUAL
	if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL
2013-02-25 02:23:43 +08:00
		kputsn("*\t*", 3, str);
	} else if (!p->is_rev) { // the forward strand
		int i, qb = 0, qe = s->l_seq;
		// for supplementary hits (which != 0), hard clips trim SEQ/QUAL
		if (p->n_cigar) {
			if (which && ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3)) qb += p->cigar[0]>>4;
			if (which && ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3)) qe -= p->cigar[p->n_cigar-1]>>4;
		}
		ks_resize(str, str->l + (qe - qb) + 1);
		for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
		kputc('\t', str);
		if (s->qual) { // printf qual
			ks_resize(str, str->l + (qe - qb) + 1);
			for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
			str->s[str->l] = 0;
		} else kputc('*', str);
	} else { // the reverse strand: emit the reverse complement
		int i, qb = 0, qe = s->l_seq;
		if (p->n_cigar) {
			if (which && ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3)) qe -= p->cigar[0]>>4;
			if (which && ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3)) qb += p->cigar[p->n_cigar-1]>>4;
		}
		ks_resize(str, str->l + (qe - qb) + 1);
		for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]];
		kputc('\t', str);
		if (s->qual) { // printf qual
			ks_resize(str, str->l + (qe - qb) + 1);
			for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i];
			str->s[str->l] = 0;
		} else kputc('*', str);
	}
	// print optional tags
	if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); }
2013-02-23 05:38:48 +08:00
	if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
	if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
	if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
2013-05-23 07:55:07 +08:00
	if (!(p->flag & 0x100)) { // not multi-hit
		// SA:Z: list every other non-secondary hit of this read
		for (i = 0; i < n; ++i)
2013-05-23 07:55:07 +08:00
			if (i != which && !(list[i].flag&0x100)) break;
		if (i < n) { // there are other primary hits; output them
			kputsn("\tSA:Z:", 6, str);
			for (i = 0; i < n; ++i) {
				const mem_aln_t *r = &list[i];
				int k;
2013-05-23 07:55:07 +08:00
				if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
				kputs(bns->anns[r->rid].name, str); kputc(',', str);
				kputl(r->pos+1, str); kputc(',', str);
				kputc("+-"[r->is_rev], str); kputc(',', str);
				for (k = 0; k < r->n_cigar; ++k) {
					kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str);
				}
				kputc(',', str); kputw(r->mapq, str);
				kputc(',', str); kputw(r->NM, str);
				kputc(';', str);
			}
		}
	}
2013-02-24 05:57:34 +08:00
	if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
	kputc('\n', str);
}
2013-02-08 03:36:18 +08:00
/************************
* Integrated interface *
************************/
2013-02-19 13:50:39 +08:00
int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
2013-02-09 02:43:15 +08:00
{
int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a;
double identity;
2013-02-09 03:46:57 +08:00
sub = a->csub > sub? a->csub : sub;
2013-02-23 03:47:57 +08:00
if (sub >= a->score) return 0;
2013-02-09 02:43:15 +08:00
l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb;
identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l;
if (a->score == 0) {
mapq = 0;
} else if (opt->mapQ_coef_len > 0) {
double tmp;
2013-09-07 00:37:38 +08:00
tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l);
tmp *= identity * identity;
2013-09-07 00:37:38 +08:00
mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499);
} else {
mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499);
mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq;
}
if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
2013-02-09 02:43:15 +08:00
if (mapq > 60) mapq = 60;
2013-02-09 06:20:44 +08:00
if (mapq < 0) mapq = 0;
2013-02-09 02:43:15 +08:00
return mapq;
}
2013-05-23 08:02:53 +08:00
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
2013-03-12 09:59:15 +08:00
// Convert the alignment regions of one single-end read into SAM records,
// stored in s->sam. extra_flag is OR-ed into every record's FLAG; m, when
// non-NULL, supplies the mate information for the SAM mate columns.
void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
{
	kstring_t str;
	kvec_t(mem_aln_t) alns;
	int i;
	kv_init(alns);
	str.l = str.m = 0; str.s = 0;
	for (i = 0; i < a->n; ++i) {
		mem_alnreg_t *reg = &a->a[i];
		mem_aln_t *t;
		if (reg->score < opt->T) continue; // below the reporting threshold
		if (reg->secondary >= 0) { // a secondary hit
			if (!(opt->flag&MEM_F_ALL)) continue; // not printing all hits
			if (reg->score < a->a[reg->secondary].score * .5) continue; // far worse than its primary
		}
		t = kv_pushp(mem_aln_t, alns);
		*t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, reg);
		t->flag |= extra_flag;
		if (reg->secondary >= 0) t->sub = -1; // don't output sub-optimal score
		if (i && reg->secondary < 0) // a supplementary alignment
			t->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
		if (i && t->mapq > alns.a[0].mapq) t->mapq = alns.a[0].mapq; // cap by the primary mapq
	}
	if (alns.n) {
		for (i = 0; i < alns.n; ++i)
			mem_aln2sam(bns, &str, s, alns.n, alns.a, i, m);
		for (i = 0; i < alns.n; ++i) free(alns.a[i].cigar);
		free(alns.a);
	} else { // nothing good enough; write one unmapped record
		mem_aln_t t;
		t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0);
		t.flag |= extra_flag;
		mem_aln2sam(bns, &str, s, 1, &t, 0, m);
	}
	s->sam = str.s;
}
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
2013-02-08 03:36:18 +08:00
{
int i;
2013-02-08 03:36:18 +08:00
mem_chain_v chn;
mem_alnreg_v regs;
for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
2013-02-25 02:09:29 +08:00
seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
chn = mem_chain(opt, bwt, bns->l_pac, l_seq, (uint8_t*)seq);
2013-02-08 03:36:18 +08:00
chn.n = mem_chain_flt(opt, chn.n, chn.a);
2013-02-24 05:57:34 +08:00
if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
kv_init(regs);
2013-02-08 03:36:18 +08:00
for (i = 0; i < chn.n; ++i) {
mem_chain_t *p = &chn.a[i];
int ret;
if (bwa_verbose >= 4) err_printf("===> Processing chain(%d) <===\n", i);
ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
2013-02-08 03:36:18 +08:00
free(chn.a[i].seeds);
}
free(chn.a);
regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun);
2013-02-08 03:36:18 +08:00
return regs;
2013-02-08 02:13:43 +08:00
}
// Same as mem_align1_core() except that this routine: 1) calls
// mem_mark_primary_se(); 2) leaves the caller's sequence untouched by
// working on a private copy.
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_)
{
	mem_alnreg_v ar;
	char *copy;
	copy = malloc(l_seq);
	memcpy(copy, seq_, l_seq); // core routine modifies the sequence in place
	ar = mem_align1_core(opt, bwt, bns, pac, l_seq, copy);
	mem_mark_primary_se(opt, ar.n, ar.a);
	free(copy);
	return ar;
}
// Convert an alignment region (mem_alnreg_t) into a finished alignment record
// (mem_aln_t): CIGAR, reference id/position, strand, mapq and score fields.
// The returned record owns a malloc'd .cigar the caller must free.
// This routine is only used for the API purpose
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
{
	mem_aln_t a;
	int i, w2, qb, qe, NM, score, is_rev;
	int64_t pos, rb, re;
	uint8_t *query;
	memset(&a, 0, sizeof(mem_aln_t));
	if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record
		a.rid = -1; a.pos = -1; a.flag |= 0x4; // 0x4: SAM "segment unmapped"
		return a;
	}
	qb = ar->qb, qe = ar->qe;
	rb = ar->rb, re = ar->re;
	query = malloc(l_query);
	for (i = 0; i < l_query; ++i) // convert to the nt4 encoding
		query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
	a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; // secondary hits get mapq 0
	if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment
	// presumably keeps the region from straddling the forward/reverse boundary
	// of the packed reference; a negative return is considered unrecoverable
	if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) {
		fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Abort. Sorry.\n", __func__);
		exit(1);
	}
	// band width: if the inferred width exceeds opt->w, cap it by the width
	// recorded in the region; otherwise fall back to the default opt->w
	w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r);
	if (bwa_verbose >= 4) fprintf(stderr, "Band width: infer=%d, opt=%d, alnreg=%d\n", w2, opt->w, ar->w);
	if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w;
	else w2 = opt->w;
	// generate the CIGAR; retry up to 3 times with a doubled band while the
	// global score falls short of the local score by more than one match
	i = 0; a.cigar = 0;
	do {
		free(a.cigar);
		a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
		if (bwa_verbose >= 4) fprintf(stderr, "Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc);
		w2 <<= 1;
	} while (++i < 3 && score < ar->truesc - opt->a);
	a.NM = NM;
	pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); // forward-strand coordinate
	a.is_rev = is_rev;
	if (a.n_cigar > 0) { // drop a leading or trailing deletion (op 2, 'D')
		if ((a.cigar[0]&0xf) == 2) {
			pos += a.cigar[0]>>4; // a leading 'D' shifts the mapping position
			--a.n_cigar;
			memmove(a.cigar, a.cigar + 1, a.n_cigar * 4);
		} else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar;
	}
	if (qb != 0 || qe != l_query) { // add clipping to CIGAR
		int clip5, clip3;
		// on the reverse strand the query-begin clip becomes the 3' clip
		clip5 = is_rev? l_query - qe : qb;
		clip3 = is_rev? qb : l_query - qe;
		a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2));
		if (clip5) {
			memmove(a.cigar+1, a.cigar, a.n_cigar * 4); // make room for the leading clip op
			a.cigar[0] = clip5<<4 | 3; // op 3 prints as 'S' in "MIDSH"
			++a.n_cigar;
		}
		if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | 3;
	}
	a.rid = bns_pos2rid(bns, pos);
	a.pos = pos - bns->anns[a.rid].offset; // position relative to its reference sequence
	a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
	free(query);
	return a;
}
2013-02-08 02:29:01 +08:00
// Shared, read-mostly context handed to the kt_for() worker callbacks.
typedef struct {
	const mem_opt_t *opt;    // alignment options (read-only)
	const bwt_t *bwt;        // FM-index of the reference
	const bntseq_t *bns;     // reference sequence annotations
	const uint8_t *pac;      // packed reference sequence
	const mem_pestat_t *pes; // insert-size stats; points at a 4-element array (see mem_process_seqs)
	bseq1_t *seqs;           // input reads; worker2 writes the SAM text into seqs[i].sam
	mem_alnreg_v *regs;      // per-read alignment regions, filled by worker1, consumed by worker2
} worker_t;
2013-02-08 02:29:01 +08:00
static void worker1(void *data, int i, int tid)
2013-02-08 02:29:01 +08:00
{
2013-02-08 03:36:18 +08:00
worker_t *w = (worker_t*)data;
if (!(w->opt->flag&MEM_F_PE)) {
w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
} else {
w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
}
2013-02-08 03:36:18 +08:00
}
static void worker2(void *data, int i, int tid)
2013-02-08 03:36:18 +08:00
{
2013-02-19 05:33:06 +08:00
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
2013-02-08 03:36:18 +08:00
worker_t *w = (worker_t*)data;
2013-02-19 05:33:06 +08:00
if (!(w->opt->flag&MEM_F_PE)) {
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a);
mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
free(w->regs[i].a);
2013-02-08 03:36:18 +08:00
} else {
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]);
free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
2013-02-08 03:36:18 +08:00
}
2013-02-08 02:29:01 +08:00
}
// Align a batch of n reads (or n/2 pairs when MEM_F_PE is set) and store the
// SAM output in seqs[i].sam. pes0, when non-NULL, supplies a precomputed
// insert-size distribution; otherwise it is inferred from the batch.
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0)
{
	extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);
	worker_t w;
	mem_alnreg_v *regs;
	mem_pestat_t pes[4];
	regs = malloc(n * sizeof(mem_alnreg_v));
	// Fix: the worker context is shared by all threads, so initialize it once;
	// the previous code redundantly re-initialized the same struct inside a
	// `for (i = 0; i < opt->n_threads; ++i)` loop.
	w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
	w.seqs = seqs; w.regs = regs;
	w.pes = &pes[0];
	kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
	if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
		if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0
		else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data
	}
	kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
	free(regs);
}