1825 lines
68 KiB
C
1825 lines
68 KiB
C
/* The MIT License
|
||
|
||
Copyright (c) 2018- Dana-Farber Cancer Institute
|
||
2009-2018 Broad Institute, Inc.
|
||
2008-2009 Genome Research Ltd. (GRL)
|
||
|
||
Permission is hereby granted, free of charge, to any person obtaining
|
||
a copy of this software and associated documentation files (the
|
||
"Software"), to deal in the Software without restriction, including
|
||
without limitation the rights to use, copy, modify, merge, publish,
|
||
distribute, sublicense, and/or sell copies of the Software, and to
|
||
permit persons to whom the Software is furnished to do so, subject to
|
||
the following conditions:
|
||
|
||
The above copyright notice and this permission notice shall be
|
||
included in all copies or substantial portions of the Software.
|
||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||
SOFTWARE.
|
||
*/
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
#include <stdio.h>
|
||
#include <assert.h>
|
||
#include <limits.h>
|
||
#include <math.h>
|
||
#include <immintrin.h>
|
||
#ifdef HAVE_PTHREAD
|
||
#include <pthread.h>
|
||
#endif
|
||
|
||
#include "kstring.h"
|
||
#include "bwamem.h"
|
||
#include "bntseq.h"
|
||
#include "ksw.h"
|
||
#include "kvec.h"
|
||
#include "ksort.h"
|
||
#include "utils.h"
|
||
#include "fmt_idx.h"
|
||
#include "profiling.h"
|
||
#include "debug.h"
|
||
|
||
#ifdef USE_MALLOC_WRAPPERS
|
||
# include "malloc_wrap.h"
|
||
#endif
|
||
|
||
/* Theory on probability and scoring *ungapped* alignment
|
||
*
|
||
* s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution
|
||
* s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate
|
||
*
|
||
* Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x
|
||
*
|
||
* If the matching score is x and mismatch penalty is -y, we can compute error rate e:
|
||
* e = .75 * exp[-log(4) * y/x]
|
||
*
|
||
* log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)}
|
||
* = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l)
|
||
*
|
||
* where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale:
|
||
* Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x)
|
||
*
|
||
*
|
||
* Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1)
|
||
* Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4)
|
||
*
|
||
* When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR)
|
||
*/
|
||
|
||
static const bntseq_t *global_bns = 0; // for debugging only
|
||
|
||
mem_opt_t *mem_opt_init()
|
||
{
|
||
mem_opt_t *o;
|
||
o = calloc(1, sizeof(mem_opt_t));
|
||
o->flag = 0;
|
||
o->a = 1; o->b = 4;
|
||
o->o_del = o->o_ins = 6;
|
||
o->e_del = o->e_ins = 1;
|
||
o->w = 100;
|
||
o->T = 30;
|
||
o->zdrop = 100;
|
||
o->pen_unpaired = 17;
|
||
o->pen_clip5 = o->pen_clip3 = 5;
|
||
|
||
o->max_mem_intv = 20;
|
||
|
||
o->min_seed_len = 19;
|
||
o->split_width = 10;
|
||
o->max_occ = 500;
|
||
o->max_chain_gap = 10000;
|
||
o->max_ins = 10000;
|
||
o->mask_level = 0.50;
|
||
o->drop_ratio = 0.50;
|
||
o->XA_drop_ratio = 0.80;
|
||
o->split_factor = 1.5;
|
||
o->chunk_size = 10000000;
|
||
o->n_threads = 1;
|
||
o->max_XA_hits = 5;
|
||
o->max_XA_hits_alt = 200;
|
||
o->max_matesw = 50;
|
||
o->mask_level_redun = 0.95;
|
||
o->min_chain_weight = 0;
|
||
o->max_chain_extend = 1<<30;
|
||
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
|
||
bwa_fill_scmat(o->a, o->b, o->mat);
|
||
return o;
|
||
}
|
||
|
||
/***************************
|
||
* Collection SA invervals *
|
||
***************************/
|
||
|
||
#define intv_lt(a, b) ((a).info < (b).info)
|
||
KSORT_INIT(mem_intv, bwtintv_t, intv_lt)
|
||
|
||
static void mem_collect_intv(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *a)
|
||
{
|
||
int i, k, x = 0, old_n;
|
||
int start_width = 1;
|
||
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
|
||
int max_seed_len = 0;
|
||
int start_N_num = 0, start_flag = 1;
|
||
a->mem.n = 0;
|
||
|
||
// 1. first pass: find all SMEMs
|
||
while (x < len) {
|
||
if (seq[x] < 4) {
|
||
start_flag = 0;
|
||
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
|
||
for (i = 0; i < a->mem1.n; ++i) {
|
||
bwtintv_t *p = &a->mem1.a[i];
|
||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||
max_seed_len = MAX(max_seed_len, slen);
|
||
if (slen >= opt->min_seed_len) {
|
||
kv_push(bwtintv_t, a->mem, *p);
|
||
}
|
||
}
|
||
//break; // for test full match time
|
||
} else {
|
||
++x;
|
||
if (start_flag) ++start_N_num;
|
||
}
|
||
}
|
||
|
||
#ifdef FILTER_FULL_MATCH
|
||
if (max_seed_len == len - start_N_num) goto collect_intv_end;
|
||
#endif
|
||
|
||
// 2. second pass: find MEMs inside a long SMEM
|
||
old_n = a->mem.n;
|
||
for (k = 0; k < old_n; ++k) {
|
||
bwtintv_t *p = &a->mem.a[k];
|
||
int start = p->info>>32, end = (int32_t)p->info;
|
||
if (end - start < split_len || p->x[2] > opt->split_width) continue;
|
||
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0]);
|
||
for (i = 0; i < a->mem1.n; ++i) {
|
||
bwtintv_t *p = &a->mem1.a[i];
|
||
int slen = (uint32_t)p->info - (p->info >> 32);
|
||
if (slen >= opt->min_seed_len) {
|
||
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||
}
|
||
}
|
||
}
|
||
|
||
// 3. third pass: LAST-like
|
||
if (opt->max_mem_intv > 0) {
|
||
x = 0;
|
||
while (x < len) {
|
||
if (seq[x] < 4) {
|
||
bwtintv_t m;
|
||
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||
if (m.x[2] > 0) {
|
||
kv_push(bwtintv_t, a->mem, m);
|
||
}
|
||
} else ++x;
|
||
}
|
||
}
|
||
|
||
#ifdef FILTER_FULL_MATCH
|
||
collect_intv_end:
|
||
#endif
|
||
// sort
|
||
ks_introsort(mem_intv, a->mem.n, a->mem.a);
|
||
}
|
||
|
||
/************
|
||
* Chaining *
|
||
************/
|
||
|
||
#include "kbtree.h"
|
||
|
||
#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
|
||
KBTREE_INIT(chn, mem_chain_t, chain_cmp)
|
||
|
||
// return 1 if the seed is merged into the chain
|
||
static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid)
|
||
{
|
||
//return 0;
|
||
int64_t qend, rend, x, y;
|
||
const mem_seed_t *last = &c->seeds[c->n-1];
|
||
qend = last->qbeg + last->len;
|
||
rend = last->rbeg + last->len;
|
||
if (seed_rid != c->rid) return 0; // different chr; request a new chain
|
||
if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
|
||
return 1; // contained seed; do nothing
|
||
if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand
|
||
|
||
//return 0; //到这里都一样的
|
||
x = p->qbeg - last->qbeg; // always non-negtive
|
||
y = p->rbeg - last->rbeg;
|
||
//fprintf(stderr, "x: %ld, y: %ld\n", x, y);
|
||
// 这里的代码是不稳定的,因为他只用当前的seed和chain里的last做比较,而chain里所有的seed都是无序的,他们的顺序跟处理seed的先后有关
|
||
if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain
|
||
if (c->n == c->m) {
|
||
c->m <<= 1;
|
||
c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t));
|
||
}
|
||
c->seeds[c->n++] = *p;
|
||
return 1;
|
||
}
|
||
return 0; // request to add a new chain
|
||
}
|
||
|
||
int mem_chain_weight(const mem_chain_t *c)
|
||
{
|
||
int64_t end;
|
||
int j, w = 0, tmp;
|
||
for (j = 0, end = 0; j < c->n; ++j) {
|
||
const mem_seed_t *s = &c->seeds[j];
|
||
if (s->qbeg >= end) w += s->len;
|
||
else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end;
|
||
end = end > s->qbeg + s->len? end : s->qbeg + s->len;
|
||
}
|
||
tmp = w; w = 0;
|
||
for (j = 0, end = 0; j < c->n; ++j) {
|
||
const mem_seed_t *s = &c->seeds[j];
|
||
if (s->rbeg >= end) w += s->len;
|
||
else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
|
||
end = end > s->rbeg + s->len? end : s->rbeg + s->len;
|
||
}
|
||
w = w < tmp? w : tmp;
|
||
return w < 1<<30? w : (1<<30)-1;
|
||
}
|
||
|
||
void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
|
||
{
|
||
int i, j;
|
||
for (i = 0; i < chn->n; ++i) {
|
||
mem_chain_t *p = &chn->a[i];
|
||
err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p));
|
||
for (j = 0; j < p->n; ++j) {
|
||
bwtint_t pos;
|
||
int is_rev;
|
||
pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev);
|
||
if (is_rev) pos -= p->seeds[j].len - 1;
|
||
err_printf("\t%d;%d;%d,%ld(%s:%c%ld)", p->seeds[j].score, p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[p->rid].name, "+-"[is_rev], (long)(pos - bns->anns[p->rid].offset) + 1);
|
||
}
|
||
err_putchar('\n');
|
||
}
|
||
}
|
||
|
||
#define uint64_lt(a, b) ((a) > (b))
|
||
KSORT_INIT(uint64_sort, uint64_t, uint64_lt)
|
||
mem_chain_v mem_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf)
|
||
{
|
||
int i, b, e, l_rep;
|
||
int64_t l_pac = bns->l_pac;
|
||
mem_chain_v chain;
|
||
kbtree_t(chn) *tree;
|
||
smem_aux_t *aux;
|
||
|
||
|
||
uint64_v v1, v2;
|
||
kv_init(v1);
|
||
kv_init(v2);
|
||
|
||
kv_init(chain);
|
||
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
|
||
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
||
|
||
aux = (smem_aux_t*)buf;
|
||
mem_collect_intv(opt, fmt, len, seq, aux);
|
||
|
||
#define CHECK_ADD_CHAIN(tmp, lower, upper) \
|
||
int rid, to_add = 0; \
|
||
mem_chain_t tmp, *lower, *upper; \
|
||
tmp.pos = s.rbeg; \
|
||
s.qbeg = p->info >> 32; \
|
||
s.score = s.len = slen; \
|
||
rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); \
|
||
if (rid < 0) \
|
||
continue; \
|
||
if (kb_size(tree)) \
|
||
{ \
|
||
kb_intervalp(chn, tree, &tmp, &lower, &upper); \
|
||
if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) \
|
||
to_add = 1; \
|
||
} \
|
||
else \
|
||
to_add = 1; \
|
||
if (to_add) \
|
||
{ \
|
||
tmp.n = 1; \
|
||
tmp.m = 4; \
|
||
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); \
|
||
tmp.seeds[0] = s; \
|
||
tmp.rid = rid; \
|
||
tmp.is_alt = !!bns->anns[rid].is_alt; \
|
||
kb_putp(chn, tree, &tmp); \
|
||
}
|
||
|
||
for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep
|
||
bwtintv_t *p = &aux->mem.a[i];
|
||
int sb = (p->info>>32), se = (uint32_t)p->info;
|
||
if (p->x[2] <= opt->max_occ) continue;
|
||
if (sb > e) l_rep += e - b, b = sb, e = se;
|
||
else e = e > se? e : se;
|
||
}
|
||
l_rep += e - b;
|
||
for (i = 0; i < aux->mem.n; ++i) {
|
||
bwtintv_t *p = &aux->mem.a[i];
|
||
int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
|
||
int64_t k;
|
||
|
||
if (p->num_match > 0) {
|
||
mem_seed_t s;
|
||
s.rbeg = p->rm[0].rs;
|
||
if (p->rm[0].reverse) s.rbeg = (fmt->l_pac << 1) - 1 - s.rbeg;
|
||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||
} else {
|
||
step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
|
||
//step = 1;
|
||
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
|
||
mem_seed_t s;
|
||
s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
||
|
||
//kv_push(uint64_t, v1, s.rbeg);
|
||
//fprintf(stderr, "%ld\t", s.rbeg);
|
||
//if (p->x[0] == 0)
|
||
// s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + p->x[2] - 1 - k);
|
||
//else
|
||
// s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
||
//kv_push(uint64_t, v2, s.rbeg);
|
||
//fprintf(stderr, "%ld\t", s.rbeg);
|
||
//s.rbeg = (fmt->l_pac << 1) - slen - fmt_sa(fmt, p->x[1] + k);
|
||
//fprintf(stderr, "%ld\n", s.rbeg);
|
||
//if (s.rbeg < fmt->l_pac) { // 在正链
|
||
// s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg;
|
||
//} else { // 在反向互补链
|
||
// s.rbeg = (fmt->l_pac << 1) - slen - s.rbeg;
|
||
//}
|
||
//s.rbeg = fmt_sa(fmt, p->x[0] + k);
|
||
|
||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||
}
|
||
//fprintf(stderr, "\n");
|
||
}
|
||
}
|
||
//fprintf(stderr, "split\n");
|
||
kv_resize(mem_chain_t, chain, kb_size(tree));
|
||
|
||
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
|
||
__kb_traverse(mem_chain_t, tree, traverse_func);
|
||
#undef traverse_func
|
||
|
||
for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len;
|
||
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
|
||
kb_destroy(chn, tree);
|
||
|
||
#if 0
|
||
for (i = 0; i < chain.n; ++i)
|
||
{
|
||
fprintf(gf[0], "chain-begin: %d\n", i);
|
||
int j;
|
||
fprintf(gf[0], "chain-%ld %d\n", chain.a[i].pos, chain.a[i].rid);
|
||
for (j = 0; j < chain.a[i].n; ++j) {
|
||
fprintf(gf[0], "chain-%d %ld %d\n", chain.a[i].seeds[j].qbeg, chain.a[i].seeds[j].rbeg, chain.a[i].seeds[j].len);
|
||
}
|
||
fprintf(gf[0], "chain-end\n");
|
||
}
|
||
fprintf(gf[0], "fun-end\n");
|
||
#endif
|
||
// ks_introsort(uint64_sort, v1.n, v1.a);
|
||
// ks_introsort(uint64_sort, v2.n, v2.a);
|
||
// for (i = 0; i < v1.n; ++i) {
|
||
// //fprintf(stderr, "%ld\t%ld\n", v1.a[i], v2.a[i]);
|
||
// if (v1.a[i] != v2.a[i]) {
|
||
// fprintf(stderr, "diff: %ld\t%ld\n", v1.a[i], v2.a[i]);
|
||
// }
|
||
// }
|
||
|
||
return chain;
|
||
}
|
||
|
||
/********************
|
||
* Filtering chains *
|
||
********************/
|
||
|
||
#define chn_beg(ch) ((ch).seeds->qbeg)
|
||
#define chn_end(ch) ((ch).seeds[(ch).n-1].qbeg + (ch).seeds[(ch).n-1].len)
|
||
|
||
#define flt_lt(a, b) ((a).w > (b).w)
|
||
KSORT_INIT(mem_flt, mem_chain_t, flt_lt)
|
||
|
||
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
|
||
{
|
||
int i, k;
|
||
kvec_t(int) chains = {0,0,0}; // this keeps int indices of the non-overlapping chains
|
||
if (n_chn == 0) return 0; // no need to filter
|
||
// compute the weight of each chain and drop chains with small weight
|
||
for (i = k = 0; i < n_chn; ++i) {
|
||
mem_chain_t *c = &a[i];
|
||
c->first = -1; c->kept = 0;
|
||
c->w = mem_chain_weight(c);
|
||
if (c->w < opt->min_chain_weight) free(c->seeds);
|
||
else a[k++] = *c;
|
||
}
|
||
n_chn = k;
|
||
ks_introsort(mem_flt, n_chn, a);
|
||
// pairwise chain comparisons
|
||
a[0].kept = 3;
|
||
kv_push(int, chains, 0);
|
||
for (i = 1; i < n_chn; ++i) {
|
||
int large_ovlp = 0;
|
||
for (k = 0; k < chains.n; ++k) {
|
||
int j = chains.a[k];
|
||
int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]);
|
||
int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]);
|
||
if (e_min > b_max && (!a[j].is_alt || a[i].is_alt)) { // have overlap; don't consider ovlp where the kept chain is ALT while the current chain is primary
|
||
int li = chn_end(a[i]) - chn_beg(a[i]);
|
||
int lj = chn_end(a[j]) - chn_beg(a[j]);
|
||
int min_l = li < lj? li : lj;
|
||
if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap
|
||
large_ovlp = 1;
|
||
if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate
|
||
if (a[i].w < a[j].w * opt->drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
if (k == chains.n) {
|
||
kv_push(int, chains, i);
|
||
a[i].kept = large_ovlp? 2 : 3;
|
||
}
|
||
}
|
||
for (i = 0; i < chains.n; ++i) {
|
||
mem_chain_t *c = &a[chains.a[i]];
|
||
if (c->first >= 0) a[c->first].kept = 1;
|
||
}
|
||
free(chains.a);
|
||
for (i = k = 0; i < n_chn; ++i) { // don't extend more than opt->max_chain_extend .kept=1/2 chains
|
||
if (a[i].kept == 0 || a[i].kept == 3) continue;
|
||
if (++k >= opt->max_chain_extend) break;
|
||
}
|
||
for (; i < n_chn; ++i)
|
||
if (a[i].kept < 3) a[i].kept = 0;
|
||
for (i = k = 0; i < n_chn; ++i) { // free discarded chains
|
||
mem_chain_t *c = &a[i];
|
||
if (c->kept == 0) free(c->seeds);
|
||
else a[k++] = a[i];
|
||
}
|
||
return k;
|
||
}
|
||
|
||
/******************************
|
||
* De-overlap single-end hits *
|
||
******************************/
|
||
|
||
#define alnreg_slt2(a, b) ((a).re < (b).re)
|
||
KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2)
|
||
|
||
#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
|
||
KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
|
||
|
||
#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && (a).hash < (b).hash))))
|
||
KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
|
||
|
||
#define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))))
|
||
KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2)
|
||
|
||
#define PATCH_MAX_R_BW 0.05f
|
||
#define PATCH_MIN_SC_RATIO 0.90f
|
||
|
||
int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *a, const mem_alnreg_t *b, int *_w)
|
||
{
|
||
int w, score, q_s, r_s;
|
||
double r;
|
||
if (bns == 0 || pac == 0 || query == 0) return 0;
|
||
assert(a->rid == b->rid && a->rb <= b->rb);
|
||
if (a->rb < bns->l_pac && b->rb >= bns->l_pac) return 0; // on different strands
|
||
if (a->qb >= b->qb || a->qe >= b->qe || a->re >= b->re) return 0; // not colinear
|
||
w = (a->re - b->rb) - (a->qe - b->qb); // required bandwidth
|
||
w = w > 0? w : -w; // l = abs(l)
|
||
r = (double)(a->re - b->rb) / (b->re - a->rb) - (double)(a->qe - b->qb) / (b->qe - a->qb); // relative bandwidth
|
||
r = r > 0.? r : -r; // r = fabs(r)
|
||
if (bwa_verbose >= 4)
|
||
printf("* potential hit merge between [%d,%d)<=>[%ld,%ld) and [%d,%d)<=>[%ld,%ld), @ %s; w=%d, r=%.4g\n",
|
||
a->qb, a->qe, (long)a->rb, (long)a->re, b->qb, b->qe, (long)b->rb, (long)b->re, bns->anns[a->rid].name, w, r);
|
||
if (a->re < b->rb || a->qe < b->qb) { // no overlap on query or on ref
|
||
if (w > opt->w<<1 || r >= PATCH_MAX_R_BW) return 0; // the bandwidth or the relative bandwidth is too large
|
||
} else if (w > opt->w<<2 || r >= PATCH_MAX_R_BW*2) return 0; // more permissive if overlapping on both ref and query
|
||
// global alignment
|
||
w += a->w + b->w;
|
||
w = w < opt->w<<2? w : opt->w<<2;
|
||
if (bwa_verbose >= 4) printf("* test potential hit merge with global alignment; w=%d\n", w);
|
||
bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w, bns->l_pac, pac, b->qe - a->qb, query + a->qb, a->rb, b->re, &score, 0, 0);
|
||
q_s = (int)((double)(b->qe - a->qb) / ((b->qe - b->qb) + (a->qe - a->qb)) * (b->score + a->score) + .499); // predicted score from query
|
||
r_s = (int)((double)(b->re - a->rb) / ((b->re - b->rb) + (a->re - a->rb)) * (b->score + a->score) + .499); // predicted score from ref
|
||
if (bwa_verbose >= 4) printf("* score=%d;(%d,%d)\n", score, q_s, r_s);
|
||
if ((double)score / (q_s > r_s? q_s : r_s) < PATCH_MIN_SC_RATIO) return 0;
|
||
*_w = w;
|
||
return score;
|
||
}
|
||
|
||
int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a)
|
||
{
|
||
int m, i, j;
|
||
if (n <= 1) return n;
|
||
ks_introsort(mem_ars2, n, a); // sort by the END position, not START!
|
||
for (i = 0; i < n; ++i) a[i].n_comp = 1;
|
||
for (i = 1; i < n; ++i) {
|
||
mem_alnreg_t *p = &a[i];
|
||
if (p->rid != a[i-1].rid || p->rb >= a[i-1].re + opt->max_chain_gap) continue; // then no need to go into the loop below
|
||
for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re + opt->max_chain_gap; --j) {
|
||
mem_alnreg_t *q = &a[j];
|
||
int64_t or, oq, mr, mq;
|
||
int score, w;
|
||
if (q->qe == q->qb) continue; // a[j] has been excluded
|
||
or = q->re - p->rb; // overlap length on the reference
|
||
oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
|
||
mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment
|
||
mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment
|
||
if (or > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant
|
||
if (p->score < q->score) {
|
||
p->qe = p->qb;
|
||
break;
|
||
} else q->qe = q->qb;
|
||
} else if (q->rb < p->rb && (score = mem_patch_reg(opt, bns, pac, query, q, p, &w)) > 0) { // then merge q into p
|
||
p->n_comp += q->n_comp + 1;
|
||
p->seedcov = p->seedcov > q->seedcov? p->seedcov : q->seedcov;
|
||
p->sub = p->sub > q->sub? p->sub : q->sub;
|
||
p->csub = p->csub > q->csub? p->csub : q->csub;
|
||
p->qb = q->qb, p->rb = q->rb;
|
||
p->truesc = p->score = score;
|
||
p->w = w;
|
||
q->qb = q->qe;
|
||
}
|
||
}
|
||
}
|
||
for (i = 0, m = 0; i < n; ++i) // exclude identical hits
|
||
if (a[i].qe > a[i].qb) {
|
||
if (m != i) a[m++] = a[i];
|
||
else ++m;
|
||
}
|
||
n = m;
|
||
ks_introsort(mem_ars, n, a);
|
||
for (i = 1; i < n; ++i) { // mark identical hits
|
||
if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb)
|
||
a[i].qe = a[i].qb;
|
||
}
|
||
for (i = 1, m = 1; i < n; ++i) // exclude identical hits
|
||
if (a[i].qe > a[i].qb) {
|
||
if (m != i) a[m++] = a[i];
|
||
else ++m;
|
||
}
|
||
return m;
|
||
}
|
||
|
||
typedef kvec_t(int) int_v;
|
||
|
||
static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z)
|
||
{ // similar to the loop in mem_chain_flt()
|
||
int i, k, tmp;
|
||
tmp = opt->a + opt->b;
|
||
tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp;
|
||
tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp;
|
||
z->n = 0;
|
||
kv_push(int, *z, 0);
|
||
for (i = 1; i < n; ++i) {
|
||
for (k = 0; k < z->n; ++k) {
|
||
int j = z->a[k];
|
||
int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
|
||
int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
|
||
if (e_min > b_max) { // have overlap
|
||
int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
|
||
if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
|
||
if (a[j].sub == 0) a[j].sub = a[i].score;
|
||
if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt))
|
||
++a[j].sub_n;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
if (k == z->n) kv_push(int, *z, i);
|
||
else a[i].secondary = z->a[k];
|
||
}
|
||
}
|
||
|
||
int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
|
||
{
|
||
int i, n_pri;
|
||
int_v z = {0,0,0};
|
||
if (n == 0) return 0;
|
||
for (i = n_pri = 0; i < n; ++i) {
|
||
a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i);
|
||
if (!a[i].is_alt) ++n_pri;
|
||
}
|
||
ks_introsort(mem_ars_hash, n, a);
|
||
mem_mark_primary_se_core(opt, n, a, &z);
|
||
for (i = 0; i < n; ++i) {
|
||
mem_alnreg_t *p = &a[i];
|
||
p->secondary_all = i; // keep the rank in the first round
|
||
if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
|
||
p->alt_sc = a[p->secondary].score;
|
||
}
|
||
if (n_pri >= 0 && n_pri < n) {
|
||
kv_resize(int, z, n);
|
||
if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
|
||
for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i;
|
||
for (i = 0; i < n; ++i) {
|
||
if (a[i].secondary >= 0) {
|
||
a[i].secondary_all = z.a[a[i].secondary];
|
||
if (a[i].is_alt) a[i].secondary = INT_MAX;
|
||
} else a[i].secondary_all = -1;
|
||
}
|
||
if (n_pri > 0) { // mark primary for hits to the primary assembly only
|
||
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
|
||
mem_mark_primary_se_core(opt, n_pri, a, &z);
|
||
}
|
||
} else {
|
||
for (i = 0; i < n; ++i)
|
||
a[i].secondary_all = a[i].secondary;
|
||
}
|
||
free(z.a);
|
||
return n_pri;
|
||
}
|
||
|
||
/*********************************
|
||
* Test if a seed is good enough *
|
||
*********************************/
|
||
|
||
#define MEM_SHORT_EXT 50
|
||
#define MEM_SHORT_LEN 200
|
||
|
||
#define MEM_HSP_COEF 1.1f
|
||
#define MEM_MINSC_COEF 5.5f
|
||
#define MEM_SEEDSW_COEF 0.05f
|
||
|
||
int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_seed_t *s, buf_t *buf)
|
||
{
|
||
int qb, qe, rid;
|
||
int64_t rb, re, mid, l_pac = bns->l_pac;
|
||
uint8_t *rseq = 0;
|
||
kswr_t x;
|
||
|
||
if (s->len >= MEM_SHORT_LEN) return -1; // the seed is longer than the max-extend; no need to do SW
|
||
qb = s->qbeg, qe = s->qbeg + s->len;
|
||
rb = s->rbeg, re = s->rbeg + s->len;
|
||
mid = (rb + re) >> 1;
|
||
qb -= MEM_SHORT_EXT; qb = qb > 0? qb : 0;
|
||
qe += MEM_SHORT_EXT; qe = qe < l_query? qe : l_query;
|
||
rb -= MEM_SHORT_EXT; rb = rb > 0? rb : 0;
|
||
re += MEM_SHORT_EXT; re = re < l_pac<<1? re : l_pac<<1;
|
||
if (rb < l_pac && l_pac < re) {
|
||
if (mid < l_pac) re = l_pac;
|
||
else rb = l_pac;
|
||
}
|
||
if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return -1; // the seed seems good enough; no need to do SW
|
||
|
||
rseq = bns_fetch_seq(bns, pac, &rb, mid, &re, &rid);
|
||
//bns_fetch_seq_no_alloc(bns, pac, &rb, mid, &re, &rid, &buf->m, &buf->addr); rseq = buf->addr;
|
||
|
||
x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0);
|
||
free(rseq);
|
||
return x.score;
|
||
}
|
||
|
||
void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, int n_chn, mem_chain_t *a, buf_t *buf)
|
||
{
|
||
double min_l = opt->min_chain_weight? MEM_HSP_COEF * opt->min_chain_weight : MEM_MINSC_COEF * log(l_query);
|
||
int i, j, k, min_HSP_score = (int)(opt->a * min_l + .499);
|
||
if (min_l > MEM_SEEDSW_COEF * l_query) return; // don't run the following for short reads
|
||
for (i = 0; i < n_chn; ++i) {
|
||
mem_chain_t *c = &a[i];
|
||
for (j = k = 0; j < c->n; ++j) {
|
||
mem_seed_t *s = &c->seeds[j];
|
||
s->score = mem_seed_sw(opt, bns, pac, l_query, query, s, buf);
|
||
if (s->score < 0 || s->score >= min_HSP_score) {
|
||
s->score = s->score < 0? s->len * opt->a : s->score;
|
||
c->seeds[k++] = *s;
|
||
}
|
||
}
|
||
c->n = k;
|
||
}
|
||
}
|
||
|
||
/****************************************
|
||
* Construct the alignment from a chain *
|
||
****************************************/
|
||
|
||
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
|
||
{
|
||
int l_del = (int)((double)(qlen * opt->a - opt->o_del) / opt->e_del + 1.);
|
||
int l_ins = (int)((double)(qlen * opt->a - opt->o_ins) / opt->e_ins + 1.);
|
||
int l = l_del > l_ins? l_del : l_ins;
|
||
l = l > 1? l : 1;
|
||
return l < opt->w<<1? l : opt->w<<1;
|
||
}
|
||
|
||
#define MAX_BAND_TRY 2
|
||
|
||
void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av, void *buf, int tid)
|
||
{
|
||
int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension
|
||
int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0;
|
||
const mem_seed_t *s;
|
||
uint8_t *rseq = 0;
|
||
uint64_t *srt;
|
||
smem_aux_t *aux = (smem_aux_t*)buf;
|
||
|
||
if (c->n == 0) return;
|
||
// get the max possible span
|
||
rmax[0] = l_pac<<1; rmax[1] = 0;
|
||
for (i = 0; i < c->n; ++i) {
|
||
int64_t b, e;
|
||
const mem_seed_t *t = &c->seeds[i];
|
||
b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg));
|
||
e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len));
|
||
rmax[0] = rmax[0] < b? rmax[0] : b;
|
||
rmax[1] = rmax[1] > e? rmax[1] : e;
|
||
if (t->len > max) max = t->len;
|
||
}
|
||
rmax[0] = rmax[0] > 0? rmax[0] : 0;
|
||
rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
|
||
if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
|
||
if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand
|
||
else rmax[0] = l_pac;
|
||
}
|
||
// retrieve the reference sequence
|
||
rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid);
|
||
//bns_fetch_seq_no_alloc(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid, &aux->seq_buf->m, &aux->seq_buf->addr); rseq = aux->seq_buf->addr;
|
||
|
||
assert(c->rid == rid);
|
||
|
||
srt = malloc(c->n * 8);
|
||
for (i = 0; i < c->n; ++i)
|
||
srt[i] = (uint64_t)c->seeds[i].score<<32 | i;
|
||
ks_introsort_64(c->n, srt);
|
||
|
||
for (k = c->n - 1; k >= 0; --k) {
|
||
mem_alnreg_t *a;
|
||
s = &c->seeds[(uint32_t)srt[k]];
|
||
|
||
for (i = 0; i < av->n; ++i) { // test whether extension has been made before
|
||
mem_alnreg_t *p = &av->a[i];
|
||
int64_t rd;
|
||
int qd, w, max_gap;
|
||
if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained
|
||
if (s->len - p->seedlen0 > .1 * l_query) continue; // this seed may give a better alignment
|
||
// qd: distance ahead of the seed on query; rd: on reference
|
||
qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
|
||
max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
|
||
w = max_gap < p->w? max_gap : p->w; // bounded by the band width
|
||
if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
|
||
// similar to the previous four lines, but this time we look at the region behind
|
||
qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
|
||
max_gap = cal_max_gap(opt, qd < rd? qd : rd);
|
||
w = max_gap < p->w? max_gap : p->w;
|
||
if (qd - rd < w && rd - qd < w) break;
|
||
}
|
||
if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln
|
||
if (bwa_verbose >= 4)
|
||
printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment [%d,%d) <=> [%ld,%ld)\n",
|
||
k, (long)s->len, (long)s->qbeg, (long)s->rbeg, av->a[i].qb, av->a[i].qe, (long)av->a[i].rb, (long)av->a[i].re);
|
||
for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain
|
||
const mem_seed_t *t;
|
||
if (srt[i] == 0) continue;
|
||
t = &c->seeds[(uint32_t)srt[i]];
|
||
if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping
|
||
if (s->qbeg <= t->qbeg && s->qbeg + s->len - t->qbeg >= s->len>>2 && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break;
|
||
if (t->qbeg <= s->qbeg && t->qbeg + t->len - s->qbeg >= s->len>>2 && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break;
|
||
}
|
||
if (i == c->n) { // no overlapping seeds; then skip extension
|
||
srt[k] = 0; // mark that seed extension has not been performed
|
||
continue;
|
||
}
|
||
if (bwa_verbose >= 4)
|
||
printf("** Seed(%d) might lead to a different alignment even though it is contained. Extension will be performed.\n", k);
|
||
}
|
||
|
||
a = kv_pushp(mem_alnreg_t, *av);
|
||
memset(a, 0, sizeof(mem_alnreg_t));
|
||
a->w = aw[0] = aw[1] = opt->w;
|
||
a->score = a->truesc = -1;
|
||
a->rid = c->rid;
|
||
|
||
if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name);
|
||
if (s->qbeg) { // left extension
|
||
int qle, tle, gtle, gscore;
|
||
tmp = s->rbeg - rmax[0];
|
||
#ifndef USE_AVX2
|
||
uint8_t *rs, *qs;
|
||
qs = malloc(s->qbeg);
|
||
for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
|
||
rs = malloc(tmp);
|
||
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
|
||
#endif
|
||
for (i = 0; i < MAX_BAND_TRY; ++i) {
|
||
int prev = a->score;
|
||
aw[0] = opt->w << i;
|
||
if (bwa_verbose >= 4) {
|
||
int j;
|
||
printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rseq[tmp - 1 - j]]); putchar('\n');
|
||
printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)query[s->qbeg - 1 - j]]); putchar('\n');
|
||
}
|
||
PROF_START(bsw);
|
||
#ifndef USE_AVX2
|
||
a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0], aux->sw_buf);
|
||
#else
|
||
a->score = ksw_extend2_avx2(s->qbeg, query, tmp, rseq, 1, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0], aux->sw_buf);
|
||
#endif
|
||
PROF_END(tprof[T_BSW][tid], bsw);
|
||
if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
|
||
if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
|
||
}
|
||
// check whether we prefer to reach the end of the query
|
||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip5) { // local extension
|
||
a->qb = s->qbeg - qle, a->rb = s->rbeg - tle;
|
||
a->truesc = a->score;
|
||
} else { // to-end extension
|
||
a->qb = 0, a->rb = s->rbeg - gtle;
|
||
a->truesc = gscore;
|
||
}
|
||
#ifndef USE_AVX2
|
||
free(qs); free(rs);
|
||
#endif
|
||
} else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
|
||
|
||
if (s->qbeg + s->len != l_query) { // right extension
|
||
int qle, tle, qe, re, gtle, gscore, sc0 = a->score;
|
||
qe = s->qbeg + s->len;
|
||
re = s->rbeg + s->len - rmax[0];
|
||
assert(re >= 0);
|
||
for (i = 0; i < MAX_BAND_TRY; ++i) {
|
||
int prev = a->score;
|
||
aw[1] = opt->w << i;
|
||
if (bwa_verbose >= 4) {
|
||
int j;
|
||
printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n');
|
||
printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n');
|
||
}
|
||
PROF_START(bsw);
|
||
#ifndef USE_AVX2
|
||
a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1], aux->sw_buf);
|
||
#else
|
||
a->score = ksw_extend2_avx2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 0, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1], aux->sw_buf);
|
||
#endif
|
||
PROF_END(tprof[T_BSW][tid], bsw);
|
||
if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
|
||
if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
|
||
}
|
||
// similar to the above
|
||
if (gscore <= 0 || gscore <= a->score - opt->pen_clip3) { // local extension
|
||
a->qe = qe + qle, a->re = rmax[0] + re + tle;
|
||
a->truesc += a->score - sc0;
|
||
} else { // to-end extension
|
||
a->qe = l_query, a->re = rmax[0] + re + gtle;
|
||
a->truesc += gscore - sc0;
|
||
}
|
||
} else a->qe = l_query, a->re = s->rbeg + s->len;
|
||
if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]);
|
||
|
||
// compute seedcov
|
||
for (i = 0, a->seedcov = 0; i < c->n; ++i) {
|
||
const mem_seed_t *t = &c->seeds[i];
|
||
if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
|
||
a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
|
||
}
|
||
a->w = aw[0] > aw[1]? aw[0] : aw[1];
|
||
a->seedlen0 = s->len;
|
||
|
||
a->frac_rep = c->frac_rep;
|
||
}
|
||
free(srt);
|
||
free(rseq);
|
||
}
|
||
|
||
/*****************************
|
||
* Basic hit->SAM conversion *
|
||
*****************************/
|
||
|
||
static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
|
||
{
|
||
int w;
|
||
if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps
|
||
w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 2.);
|
||
if (w < abs(l1 - l2)) w = abs(l1 - l2);
|
||
return w;
|
||
}
|
||
|
||
static inline int get_rlen(int n_cigar, const uint32_t *cigar)
|
||
{
|
||
int k, l;
|
||
for (k = l = 0; k < n_cigar; ++k) {
|
||
int op = cigar[k]&0xf;
|
||
if (op == 0 || op == 2)
|
||
l += cigar[k]>>4;
|
||
}
|
||
return l;
|
||
}
|
||
|
||
static inline void add_cigar(const mem_opt_t *opt, mem_aln_t *p, kstring_t *str, int which)
|
||
{
|
||
int i;
|
||
if (p->n_cigar) { // aligned
|
||
for (i = 0; i < p->n_cigar; ++i) {
|
||
int c = p->cigar[i]&0xf;
|
||
if (!(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt && (c == 3 || c == 4))
|
||
c = which? 4 : 3; // use hard clipping for supplementary alignments
|
||
kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str);
|
||
}
|
||
} else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true)
|
||
}
|
||
|
||
void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_)
|
||
{
|
||
int i, l_name;
|
||
mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert
|
||
|
||
if (m_) mtmp = *m_, m = &mtmp;
|
||
// set flag
|
||
p->flag |= m? 0x1 : 0; // is paired in sequencing
|
||
p->flag |= p->rid < 0? 0x4 : 0; // is mapped
|
||
p->flag |= m && m->rid < 0? 0x8 : 0; // is mate mapped
|
||
if (p->rid < 0 && m && m->rid >= 0) // copy mate to alignment
|
||
p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0;
|
||
if (m && m->rid < 0 && p->rid >= 0) // copy alignment to mate
|
||
m->rid = p->rid, m->pos = p->pos, m->is_rev = p->is_rev, m->n_cigar = 0;
|
||
p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand
|
||
p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand
|
||
|
||
// print up to CIGAR
|
||
l_name = strlen(s->name);
|
||
ks_resize(str, str->l + s->l_seq + l_name + (s->qual? s->l_seq : 0) + 20);
|
||
kputsn(s->name, l_name, str); kputc('\t', str); // QNAME
|
||
kputw((p->flag&0xffff) | (p->flag&0x10000? 0x100 : 0), str); kputc('\t', str); // FLAG
|
||
if (p->rid >= 0) { // with coordinate
|
||
kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
|
||
kputl(p->pos + 1, str); kputc('\t', str); // POS
|
||
kputw(p->mapq, str); kputc('\t', str); // MAPQ
|
||
add_cigar(opt, p, str, which);
|
||
} else kputsn("*\t0\t0\t*", 7, str); // without coordinte
|
||
kputc('\t', str);
|
||
|
||
// print the mate position if applicable
|
||
if (m && m->rid >= 0) {
|
||
if (p->rid == m->rid) kputc('=', str);
|
||
else kputs(bns->anns[m->rid].name, str);
|
||
kputc('\t', str);
|
||
kputl(m->pos + 1, str); kputc('\t', str);
|
||
if (p->rid == m->rid) {
|
||
int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) - 1 : 0);
|
||
int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) - 1 : 0);
|
||
if (m->n_cigar == 0 || p->n_cigar == 0) kputc('0', str);
|
||
else kputl(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str);
|
||
} else kputc('0', str);
|
||
} else kputsn("*\t0\t0", 5, str);
|
||
kputc('\t', str);
|
||
|
||
// print SEQ and QUAL
|
||
if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL
|
||
kputsn("*\t*", 3, str);
|
||
} else if (!p->is_rev) { // the forward strand
|
||
int i, qb = 0, qe = s->l_seq;
|
||
if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { // have cigar && not the primary alignment && not softclip all
|
||
if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4;
|
||
if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4;
|
||
}
|
||
ks_resize(str, str->l + (qe - qb) + 1);
|
||
for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
|
||
kputc('\t', str);
|
||
if (s->qual) { // printf qual
|
||
ks_resize(str, str->l + (qe - qb) + 1);
|
||
for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
|
||
str->s[str->l] = 0;
|
||
} else kputc('*', str);
|
||
} else { // the reverse strand
|
||
int i, qb = 0, qe = s->l_seq;
|
||
if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) {
|
||
if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4;
|
||
if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4;
|
||
}
|
||
ks_resize(str, str->l + (qe - qb) + 1);
|
||
for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]];
|
||
kputc('\t', str);
|
||
if (s->qual) { // printf qual
|
||
ks_resize(str, str->l + (qe - qb) + 1);
|
||
for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i];
|
||
str->s[str->l] = 0;
|
||
} else kputc('*', str);
|
||
}
|
||
|
||
// print optional tags
|
||
if (p->n_cigar) {
|
||
kputsn("\tNM:i:", 6, str); kputw(p->NM, str);
|
||
kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str);
|
||
}
|
||
if (m && m->n_cigar) { kputsn("\tMC:Z:", 6, str); add_cigar(opt, m, str, which); }
|
||
// if (m) { kputsn("\tMQ:i:", 6, str); kputw(m->mapq, str);}
|
||
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
|
||
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
|
||
if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
|
||
if (!(p->flag & 0x100)) { // not multi-hit
|
||
for (i = 0; i < n; ++i)
|
||
if (i != which && !(list[i].flag&0x100)) break;
|
||
if (i < n) { // there are other primary hits; output them
|
||
kputsn("\tSA:Z:", 6, str);
|
||
for (i = 0; i < n; ++i) {
|
||
const mem_aln_t *r = &list[i];
|
||
int k;
|
||
if (i == which || (r->flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
|
||
kputs(bns->anns[r->rid].name, str); kputc(',', str);
|
||
kputl(r->pos+1, str); kputc(',', str);
|
||
kputc("+-"[r->is_rev], str); kputc(',', str);
|
||
for (k = 0; k < r->n_cigar; ++k) {
|
||
kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str);
|
||
}
|
||
kputc(',', str); kputw(r->mapq, str);
|
||
kputc(',', str); kputw(r->NM, str);
|
||
kputc(';', str);
|
||
}
|
||
}
|
||
if (p->alt_sc > 0)
|
||
ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc);
|
||
}
|
||
if (p->XA) {
|
||
kputsn((opt->flag&MEM_F_XB)? "\tXB:Z:" : "\tXA:Z:", 6, str);
|
||
kputs(p->XA, str);
|
||
}
|
||
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
|
||
if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) {
|
||
int tmp;
|
||
kputsn("\tXR:Z:", 6, str);
|
||
tmp = str->l;
|
||
kputs(bns->anns[p->rid].anno, str);
|
||
for (i = tmp; i < str->l; ++i) // replace TAB in the comment to SPACE
|
||
if (str->s[i] == '\t') str->s[i] = ' ';
|
||
}
|
||
kputc('\n', str);
|
||
}
|
||
|
||
/************************
|
||
* Integrated interface *
|
||
************************/
|
||
|
||
int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
|
||
{
|
||
int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a;
|
||
double identity;
|
||
sub = a->csub > sub? a->csub : sub;
|
||
if (sub >= a->score) return 0;
|
||
l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb;
|
||
identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l;
|
||
if (a->score == 0) {
|
||
mapq = 0;
|
||
} else if (opt->mapQ_coef_len > 0) {
|
||
double tmp;
|
||
tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l);
|
||
tmp *= identity * identity;
|
||
mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499);
|
||
} else {
|
||
mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499);
|
||
mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq;
|
||
}
|
||
if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
|
||
if (mapq > 60) mapq = 60;
|
||
if (mapq < 0) mapq = 0;
|
||
mapq = (int)(mapq * (1. - a->frac_rep) + .499);
|
||
return mapq;
|
||
}
|
||
|
||
void mem_reorder_primary5(int T, mem_alnreg_v *a)
|
||
{
|
||
int k, n_pri = 0, left_st = INT_MAX, left_k = -1;
|
||
mem_alnreg_t t;
|
||
for (k = 0; k < a->n; ++k)
|
||
if (a->a[k].secondary < 0 && !a->a[k].is_alt && a->a[k].score >= T) ++n_pri;
|
||
if (n_pri <= 1) return; // only one alignment
|
||
for (k = 0; k < a->n; ++k) {
|
||
mem_alnreg_t *p = &a->a[k];
|
||
if (p->secondary >= 0 || p->is_alt || p->score < T) continue;
|
||
if (p->qb < left_st) left_st = p->qb, left_k = k;
|
||
}
|
||
assert(a->a[0].secondary < 0);
|
||
if (left_k == 0) return; // no need to reorder
|
||
t = a->a[0], a->a[0] = a->a[left_k], a->a[left_k] = t;
|
||
for (k = 1; k < a->n; ++k) { // update secondary and secondary_all
|
||
mem_alnreg_t *p = &a->a[k];
|
||
if (p->secondary == 0) p->secondary = left_k;
|
||
else if (p->secondary == left_k) p->secondary = 0;
|
||
if (p->secondary_all == 0) p->secondary_all = left_k;
|
||
else if (p->secondary_all == left_k) p->secondary_all = 0;
|
||
}
|
||
}
|
||
|
||
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
|
||
void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, seq_sam_t *ss)
|
||
{
|
||
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
|
||
// kstring_t str;
|
||
kvec_t(mem_aln_t) aa;
|
||
int k, l;
|
||
char **XA = 0;
|
||
|
||
if (!(opt->flag & MEM_F_ALL))
|
||
XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
|
||
kv_init(aa);
|
||
// str.l = str.m = 0; str.s = 0;
|
||
ss->sam.l = 0;
|
||
for (k = l = 0; k < a->n; ++k) {
|
||
mem_alnreg_t *p = &a->a[k];
|
||
mem_aln_t *q;
|
||
if (p->score < opt->T) continue;
|
||
if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue;
|
||
if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
|
||
q = kv_pushp(mem_aln_t, aa);
|
||
*q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
|
||
assert(q->rid >= 0); // this should not happen with the new code
|
||
q->XA = XA? XA[k] : 0;
|
||
q->flag |= extra_flag; // flag secondary
|
||
if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
|
||
if (l && p->secondary < 0) // if supplementary
|
||
q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
|
||
if (!(opt->flag & MEM_F_KEEP_SUPP_MAPQ) && l && !p->is_alt && q->mapq > aa.a[0].mapq)
|
||
q->mapq = aa.a[0].mapq; // lower mapq for supplementary mappings, unless -5 or -q is applied
|
||
++l;
|
||
}
|
||
if (aa.n == 0) { // no alignments good enough; then write an unaligned record
|
||
mem_aln_t t;
|
||
t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0);
|
||
t.flag |= extra_flag;
|
||
mem_aln2sam(opt, bns, &ss->sam, s, 1, &t, 0, m);
|
||
} else {
|
||
for (k = 0; k < aa.n; ++k)
|
||
mem_aln2sam(opt, bns, &ss->sam, s, aa.n, aa.a, k, m);
|
||
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
|
||
free(aa.a);
|
||
}
|
||
// s->sam = str.s;
|
||
if (XA) {
|
||
for (k = 0; k < a->n; ++k) free(XA[k]);
|
||
free(XA);
|
||
}
|
||
}
|
||
|
||
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf)
|
||
{
|
||
int i;
|
||
mem_chain_v chn;
|
||
mem_alnreg_v regs;
|
||
|
||
for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
|
||
seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
|
||
|
||
chn = mem_chain(opt, fmt, bns, l_seq, (uint8_t*)seq, buf);
|
||
chn.n = mem_chain_flt(opt, chn.n, chn.a);
|
||
mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t *)seq, chn.n, chn.a, ((smem_aux_t *)buf)->seq_buf);
|
||
if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
|
||
|
||
kv_init(regs);
|
||
for (i = 0; i < chn.n; ++i) {
|
||
mem_chain_t *p = &chn.a[i];
|
||
if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i);
|
||
mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, ®s, buf, 0);
|
||
free(chn.a[i].seeds);
|
||
}
|
||
free(chn.a);
|
||
regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a);
|
||
if (bwa_verbose >= 4) {
|
||
err_printf("* %ld chains remain after removing duplicated chains\n", regs.n);
|
||
for (i = 0; i < regs.n; ++i) {
|
||
mem_alnreg_t *p = ®s.a[i];
|
||
printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
|
||
}
|
||
}
|
||
for (i = 0; i < regs.n; ++i) {
|
||
mem_alnreg_t *p = ®s.a[i];
|
||
if (p->rid >= 0 && bns->anns[p->rid].is_alt)
|
||
p->is_alt = 1;
|
||
}
|
||
return regs;
|
||
}
|
||
|
||
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
|
||
{
|
||
mem_aln_t a;
|
||
int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD;
|
||
int64_t pos, rb, re;
|
||
uint8_t *query;
|
||
|
||
memset(&a, 0, sizeof(mem_aln_t));
|
||
if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record
|
||
a.rid = -1; a.pos = -1; a.flag |= 0x4;
|
||
return a;
|
||
}
|
||
qb = ar->qb, qe = ar->qe;
|
||
rb = ar->rb, re = ar->re;
|
||
query = malloc(l_query);
|
||
for (i = 0; i < l_query; ++i) // convert to the nt4 encoding
|
||
query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
|
||
a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0;
|
||
if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment
|
||
tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del);
|
||
w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins);
|
||
w2 = w2 > tmp? w2 : tmp;
|
||
if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w);
|
||
if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w;
|
||
i = 0; a.cigar = 0;
|
||
do {
|
||
free(a.cigar);
|
||
w2 = w2 < opt->w<<2? w2 : opt->w<<2;
|
||
a.cigar = bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
|
||
if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc);
|
||
if (score == last_sc || w2 == opt->w<<2) break; // it is possible that global alignment and local alignment give different scores
|
||
last_sc = score;
|
||
w2 <<= 1;
|
||
} while (++i < 3 && score < ar->truesc - opt->a);
|
||
l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1;
|
||
a.NM = NM;
|
||
pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
|
||
a.is_rev = is_rev;
|
||
if (a.n_cigar > 0) { // squeeze out leading or trailing deletions
|
||
if ((a.cigar[0]&0xf) == 2) {
|
||
pos += a.cigar[0]>>4;
|
||
--a.n_cigar;
|
||
memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD);
|
||
} else if ((a.cigar[a.n_cigar-1]&0xf) == 2) {
|
||
--a.n_cigar;
|
||
memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly
|
||
}
|
||
}
|
||
if (qb != 0 || qe != l_query) { // add clipping to CIGAR
|
||
int clip5, clip3;
|
||
clip5 = is_rev? l_query - qe : qb;
|
||
clip3 = is_rev? qb : l_query - qe;
|
||
a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD);
|
||
if (clip5) {
|
||
memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping
|
||
a.cigar[0] = clip5<<4 | 3;
|
||
++a.n_cigar;
|
||
}
|
||
if (clip3) {
|
||
memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping
|
||
a.cigar[a.n_cigar++] = clip3<<4 | 3;
|
||
}
|
||
}
|
||
a.rid = bns_pos2rid(bns, pos);
|
||
assert(a.rid == ar->rid);
|
||
a.pos = pos - bns->anns[a.rid].offset;
|
||
a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
|
||
a.is_alt = ar->is_alt; a.alt_sc = ar->alt_sc;
|
||
free(query);
|
||
return a;
|
||
}
|
||
|
||
static void worker1(void *data, int i, int tid)
|
||
{
|
||
mem_worker_t *w = (mem_worker_t*)data;
|
||
if (!(w->opt->flag&MEM_F_PE)) {
|
||
if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name);
|
||
w->regs[i] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]);
|
||
} else {
|
||
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name);
|
||
w->regs[i<<1|0] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]);
|
||
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name);
|
||
w->regs[i<<1|1] = mem_align1_core(w->opt, w->fmt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]);
|
||
}
|
||
}
|
||
|
||
static void worker2(void *data, int i, int tid)
|
||
{
|
||
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
|
||
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
|
||
mem_worker_t *w = (mem_worker_t*)data;
|
||
if (!(w->opt->flag&MEM_F_PE)) {
|
||
if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
|
||
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
|
||
if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]);
|
||
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, &w->sams[i]);
|
||
free(w->regs[i].a);
|
||
} else {
|
||
if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name);
|
||
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1], &w->sams[i<<1], tid);
|
||
free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
|
||
}
|
||
}
|
||
|
||
static smem_aux_t *smem_aux_init()
|
||
{
|
||
smem_aux_t *a;
|
||
a = calloc(1, sizeof(smem_aux_t));
|
||
a->tmpv[0] = calloc(1, sizeof(bwtintv_v));
|
||
a->tmpv[1] = calloc(1, sizeof(bwtintv_v));
|
||
a->sw_buf = calloc(1, sizeof(buf_t));
|
||
a->seq_buf = calloc(1, sizeof(buf_t));
|
||
return a;
|
||
}
|
||
|
||
static void smem_aux_destroy(smem_aux_t *a)
|
||
{
|
||
free(a->tmpv[0]->a); free(a->tmpv[0]);
|
||
free(a->tmpv[1]->a); free(a->tmpv[1]);
|
||
free(a->mem.a); free(a->mem1.a);
|
||
free(a);
|
||
}
|
||
|
||
// 初始化线程需要的数据
|
||
mem_worker_t *init_mem_worker(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac)
|
||
{
|
||
int i = opt->n_threads, j;
|
||
mem_worker_t *w = calloc(1, sizeof(mem_worker_t));
|
||
w->opt = opt; w->bns = bns; w->pac = pac; w->fmt = fmt;
|
||
w->calc_isize = 0; w->n = 0; w->regs = 0;
|
||
w->aux = malloc(i * sizeof(smem_aux_t));
|
||
w->smem_arr = malloc(i * sizeof(smem_v *));
|
||
w->chain_arr = malloc(i * sizeof(mem_chain_v *));
|
||
w->isize_arr = malloc(i * sizeof(uint64_v *));
|
||
|
||
for (i = 0; i < opt->n_threads; ++i) {
|
||
w->aux[i] = smem_aux_init();
|
||
w->smem_arr[i] = malloc(opt->batch_size * sizeof(smem_v));
|
||
w->chain_arr[i] = malloc(opt->batch_size * sizeof(mem_chain_v));
|
||
w->isize_arr[i] = calloc(4, sizeof(uint64_v));
|
||
for (j = 0; j < opt->batch_size; ++j) {
|
||
kv_init(w->smem_arr[i][j].mem);
|
||
kv_init(w->smem_arr[i][j].pos_arr);
|
||
kv_init(w->chain_arr[i][j]);
|
||
}
|
||
}
|
||
return w;
|
||
}
|
||
#include "khash.h"
|
||
KHASH_MAP_INIT_INT64(seed, uint64_t);
|
||
|
||
static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_v *smem, smem_aux_t *a, int tid)
|
||
{
|
||
int i, k, x = 0, old_n;
|
||
int start_width = 1;
|
||
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
|
||
int max_seed_len = 0;
|
||
int start_N_num = 0, start_flag = 1;
|
||
smem->mem.n = 0;
|
||
//int s1_num = 0;
|
||
// 1. first pass: find all SMEMs
|
||
PROF_START(seed_1);
|
||
// fprintf(gf[0], "read\n");
|
||
while (x < len) {
|
||
if (seq[x] < 4) {
|
||
start_flag = 0;
|
||
#ifdef DEBUG_FILE_OUTPUT
|
||
#ifdef COUNT_SEED_LENGTH
|
||
int last_x = x;
|
||
#endif
|
||
#endif
|
||
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
|
||
#ifdef DEBUG_FILE_OUTPUT
|
||
#ifdef COUNT_SEED_LENGTH
|
||
fprintf(gf[0], "%d\n", x - last_x);
|
||
#endif
|
||
#endif
|
||
for (i = 0; i < a->mem1.n; ++i) {
|
||
bwtintv_t *p = &a->mem1.a[i];
|
||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||
max_seed_len = MAX(max_seed_len, slen);
|
||
if (slen >= opt->min_seed_len) {
|
||
kv_push(bwtintv_t, smem->mem, *p);
|
||
}
|
||
}
|
||
#ifdef COUNT_SEED_LENGTH
|
||
break; // for test full match time
|
||
#endif
|
||
} else {
|
||
++x;
|
||
if (start_flag) ++start_N_num;
|
||
}
|
||
}
|
||
|
||
#ifdef SHOW_DATA_PERF
|
||
gdat[0] += 1; // read num ++
|
||
if (max_seed_len == len - start_N_num)
|
||
gdat[1] += 1; // seed-1 full match num ++
|
||
#endif
|
||
|
||
PROF_END(tprof[T_SEED_1][tid], seed_1);
|
||
|
||
#ifdef FILTER_FULL_MATCH
|
||
if (max_seed_len == len - start_N_num) {
|
||
|
||
#ifdef DEBUG_FILE_OUTPUT
|
||
if (start_N_num == 0) {
|
||
#ifdef GET_FULL_MATCH_READ
|
||
for (i = 0; i < len; ++i)
|
||
fprintf(gf[0], "%c", "ACGT"[seq[i]]);
|
||
fprintf(gf[0], "\n");
|
||
#endif
|
||
#ifdef SHOW_DATA_PERF
|
||
gdat[2]++;
|
||
if (gdat[2] % 100 == 0) {
|
||
fprintf(stderr, "reads num: %ld\n", gdat[2]);
|
||
}
|
||
#endif
|
||
}
|
||
#endif
|
||
//goto third_seed;
|
||
goto collect_intv_end;
|
||
}
|
||
#endif
|
||
bwtintv_v mem2 = {0};
|
||
// 2. second pass: find MEMs inside a long SMEM
|
||
uint32_v qev;
|
||
PROF_START(seed_2);
|
||
old_n = smem->mem.n;
|
||
//fprintf(gf[2], "seed-1\n");
|
||
for (k = 0; k < old_n; ++k) {
|
||
bwtintv_t *p = &smem->mem.a[k];
|
||
int start = p->info>>32, end = (int32_t)p->info;
|
||
if (end - start < split_len || p->x[2] > opt->split_width) continue;
|
||
//++ s1_num;
|
||
//fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", p->info >> 32, (int)p->info, p->x[0], p->x[1], p->x[2]);
|
||
// fprintf(gf[0], "start\n");
|
||
fmt_smem_2(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, p, &a->mem1, a->tmpv[0], qev);
|
||
for (i = 0; i < a->mem1.n; ++i) {
|
||
bwtintv_t *p = &a->mem1.a[i];
|
||
int slen = (uint32_t)p->info - (p->info >> 32);
|
||
// fprintf(gf[0], "%d\n", slen);
|
||
if (slen >= opt->min_seed_len) {
|
||
kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
|
||
kv_push(bwtintv_t, mem2, a->mem1.a[i]);
|
||
}
|
||
}
|
||
// fprintf(gf[0], "end\n");
|
||
}
|
||
//fprintf(gf[2], "seed-2\n");
|
||
PROF_END(tprof[T_SEED_2][tid], seed_2);
|
||
//if (s1_num > 1) {
|
||
//ks_introsort(mem_intv, mem2.n, mem2.a);
|
||
//for (i = 0; i < mem2.n; ++i) {
|
||
// fprintf(gf[2], "qs:%ld, qe:%d, x0:%ld, x1:%ld, x2:%ld\n", mem2.a[i].info >> 32, (int)mem2.a[i].info, mem2.a[i].x[0], mem2.a[i].x[1], mem2.a[i].x[2]);
|
||
//}
|
||
//fprintf(gf[2], "\n");
|
||
//}
|
||
|
||
// third_seed:
|
||
#if 1
|
||
// 3. third pass: LAST-like
|
||
PROF_START(seed_3);
|
||
if (opt->max_mem_intv > 0) {
|
||
x = 0;
|
||
while (x < len) {
|
||
if (seq[x] < 4) {
|
||
bwtintv_t m;
|
||
x = fmt_seed_strategy1(fmt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
|
||
if (m.x[2] > 0) {
|
||
// fprintf(stderr, "3: %ld %ld %ld\n", m.info >> 32, m.info & ((1L << 32) - 1), m.x[2]);
|
||
kv_push(bwtintv_t, smem->mem, m);
|
||
}
|
||
} else ++x;
|
||
}
|
||
}
|
||
PROF_END(tprof[T_SEED_3][tid], seed_3);
|
||
#endif
|
||
|
||
#ifdef FILTER_FULL_MATCH
|
||
collect_intv_end:
|
||
#endif
|
||
// sort
|
||
ks_introsort(mem_intv, smem->mem.n, smem->mem.a);
|
||
|
||
|
||
khash_t(str) * h;
|
||
//for (i = 0; i < smem->mem.n; ++i) {
|
||
// fprintf(stderr, "seed %d: %ld %ld %ld\n", i, smem->mem.a[i].x[0], smem->mem.a[i].x[1], smem->mem.a[i].x[2]);
|
||
//}
|
||
//fprintf(stderr, "split\n");
|
||
}
|
||
|
||
void find_smem(const mem_opt_t *opt, const FMTIndex *fmt, int len, const uint8_t *seq, smem_aux_t *aux, smem_v *smemv, int tid)
|
||
{
|
||
int i;
|
||
|
||
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
|
||
mem_collect_intv_batch(opt, fmt, len, seq, smemv, aux, tid);
|
||
smemv->pos_arr.n = 0;
|
||
for (i=0; i<smemv->mem.n; ++i) {
|
||
bwtintv_t *p = &smemv->mem.a[i];
|
||
int step, count; // seed length
|
||
int64_t k;
|
||
if (p->num_match > 0) {
|
||
uint64_t pos = p->rm[0].rs;
|
||
if (p->rm[0].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
|
||
kv_push(uint64_t, smemv->pos_arr, pos);
|
||
if (p->num_match > 1) {
|
||
uint64_t pos = p->rm[1].rs;
|
||
if (p->rm[1].reverse) pos = (fmt->l_pac << 1) - 1 - pos;
|
||
kv_push(uint64_t, smemv->pos_arr, pos);
|
||
}
|
||
} else {
|
||
step = p->x[2] > opt->max_occ ? p->x[2] / opt->max_occ : 1;
|
||
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count)
|
||
{
|
||
PROF_START(sal);
|
||
uint64_t pos = fmt_sa_offset(fmt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||
PROF_END(tprof[T_SAL][tid], sal);
|
||
kv_push(uint64_t, smemv->pos_arr, pos);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
void generate_chain(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, int len, const uint8_t *seq, smem_v *smemv, mem_chain_v *chain, int tid)
|
||
{
|
||
int i, j, b, e, l_rep;
|
||
int64_t l_pac = bns->l_pac;
|
||
kbtree_t(chn) * tree;
|
||
chain->n = 0;
|
||
|
||
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
|
||
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
||
|
||
#define CHECK_ADD_CHAIN(tmp, lower, upper) \
|
||
int rid, to_add = 0; \
|
||
mem_chain_t tmp, *lower, *upper; \
|
||
tmp.pos = s.rbeg; \
|
||
s.qbeg = p->info >> 32; \
|
||
s.score = s.len = slen; \
|
||
rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); \
|
||
if (rid < 0) \
|
||
continue; \
|
||
if (kb_size(tree)) \
|
||
{ \
|
||
kb_intervalp(chn, tree, &tmp, &lower, &upper); \
|
||
if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) \
|
||
to_add = 1; \
|
||
} \
|
||
else \
|
||
to_add = 1; \
|
||
if (to_add) \
|
||
{ \
|
||
tmp.n = 1; \
|
||
tmp.m = 4; \
|
||
tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); \
|
||
tmp.seeds[0] = s; \
|
||
tmp.rid = rid; \
|
||
tmp.is_alt = !!bns->anns[rid].is_alt; \
|
||
kb_putp(chn, tree, &tmp); \
|
||
}
|
||
|
||
for (i = 0, b = e = l_rep = 0; i < smemv->mem.n; ++i) { // compute frac_rep
|
||
bwtintv_t *p = &smemv->mem.a[i];
|
||
int sb = (p->info >> 32), se = (uint32_t)p->info;
|
||
if (p->x[2] <= opt->max_occ) continue;
|
||
if (sb > e) l_rep += e - b, b = sb, e = se;
|
||
else e = e > se ? e : se;
|
||
}
|
||
l_rep += e - b;
|
||
for (i = 0, j = 0; i < smemv->mem.n; ++i) {
|
||
bwtintv_t *p = &smemv->mem.a[i];
|
||
int step, count, slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||
int64_t k;
|
||
if (p->num_match > 0) {
|
||
mem_seed_t s;
|
||
s.rbeg = smemv->pos_arr.a[j++];
|
||
//fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info>>32, (uint32_t)p->info);
|
||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||
if (p->num_match > 1) {
|
||
mem_seed_t s;
|
||
s.rbeg = smemv->pos_arr.a[j++];
|
||
//fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info >> 32, (uint32_t)p->info);
|
||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||
}
|
||
} else {
|
||
step = p->x[2] > opt->max_occ ? p->x[2] / opt->max_occ : 1;
|
||
for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count)
|
||
{
|
||
mem_seed_t s;
|
||
uint64_t sa = smemv->pos_arr.a[j++];
|
||
uint64_t sa_idx = sa << 16 >> 16;
|
||
s.rbeg = (sa >> 48) + bwt_get_sa((uint8_t *)fmt->sa, sa_idx);
|
||
// fprintf(gf[1], "%ld, %ld, %d\n", s.rbeg, p->info >> 32, (uint32_t)p->info);
|
||
CHECK_ADD_CHAIN(tmp, lower, upper);
|
||
}
|
||
}
|
||
}
|
||
if (chain->m < kb_size(tree)) {
|
||
kv_resize(mem_chain_t, *chain, kb_size(tree));
|
||
}
|
||
|
||
#define traverse_func(p_) (chain->a[chain->n++] = *(p_))
|
||
__kb_traverse(mem_chain_t, tree, traverse_func);
|
||
#undef traverse_func
|
||
|
||
for (i = 0; i < chain->n; ++i) chain->a[i].frac_rep = (float)l_rep / len;
|
||
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
|
||
kb_destroy(chn, tree);
|
||
}
|
||
|
||
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
|
||
{
|
||
int64_t p2;
|
||
int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac);
|
||
p2 = r1 == r2 ? b2 : (l_pac << 1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand
|
||
*dist = p2 > b1 ? p2 - b1 : b1 - p2;
|
||
return (r1 == r2 ? 0 : 1) ^ (p2 > b1 ? 0 : 3);
|
||
}
|
||
|
||
static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
|
||
{
|
||
int j;
|
||
for (j = 1; j < r->n; ++j) { // choose unique alignment
|
||
int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb;
|
||
int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe;
|
||
if (e_min > b_max) { // have overlap
|
||
int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb;
|
||
if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap
|
||
}
|
||
}
|
||
return j < r->n? r->a[j].score : opt->min_seed_len * opt->a;
|
||
}
|
||
// mem主要流程
|
||
void mem_core_process(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t *bns, const uint8_t *pac,
|
||
bseq1_t *seq_arr, int nseq, smem_aux_t *aux, smem_v *smem_arr, mem_chain_v *chain_arr, mem_alnreg_v *reg_arr,
|
||
int calc_isize, int64_t l_pac, uint64_v *isize, int tid)
|
||
{
|
||
|
||
int i, j, l_seq;
|
||
mem_chain_v *chnp;
|
||
mem_alnreg_v *regp;
|
||
char *seq;
|
||
|
||
// 1. seeding
|
||
PROF_START(seed_all);
|
||
for (i = 0; i < nseq; ++i) {
|
||
seq = seq_arr[i].seq;
|
||
l_seq = seq_arr[i].l_seq;
|
||
for (j = 0; j < l_seq; ++j) {
|
||
seq[j] = seq[j] < 4 ? seq[j] : nst_nt4_table[(int)seq[j]];
|
||
}
|
||
find_smem(opt, fmt, l_seq, (uint8_t *)seq, aux, &smem_arr[i], tid);
|
||
}
|
||
PROF_END(tprof[T_SEED_ALL][tid], seed_all);
|
||
|
||
// 2. chain
|
||
PROF_START(chain_all);
|
||
for (i=0; i<nseq; ++i) {
|
||
seq = seq_arr[i].seq;
|
||
l_seq = seq_arr[i].l_seq;
|
||
chnp = chain_arr + i;
|
||
PROF_START(gen_chain);
|
||
generate_chain(opt, fmt, bns, l_seq, (uint8_t*)seq, &smem_arr[i], chnp, tid);
|
||
PROF_END(tprof[T_GEN_CHAIN][tid], gen_chain);
|
||
PROF_START(flt_chain);
|
||
chnp->n = mem_chain_flt(opt, chnp->n, chnp->a);
|
||
PROF_END(tprof[T_FLT_CHAIN][tid], flt_chain);
|
||
PROF_START(flt_chained_seeds);
|
||
mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chnp->n, chnp->a, aux->seq_buf);
|
||
PROF_END(tprof[T_FLT_CHANNED_SEEDS][tid], flt_chained_seeds);
|
||
if (bwa_verbose >= 4) mem_print_chain(bns, chnp);
|
||
}
|
||
PROF_END(tprof[T_CHAIN_ALL][tid], chain_all);
|
||
|
||
// 3. align
|
||
PROF_START(aln_all);
|
||
for (i=0; i<nseq; ++i) {
|
||
seq = seq_arr[i].seq;
|
||
l_seq = seq_arr[i].l_seq;
|
||
chnp = chain_arr + i;
|
||
regp = reg_arr + i;
|
||
kv_init(*regp);
|
||
|
||
for(j=0; j<chnp->n; ++j) {
|
||
mem_chain_t *p = &chnp->a[j];
|
||
if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", j);
|
||
mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, regp, aux, tid);
|
||
free(chnp->a[j].seeds);
|
||
}
|
||
|
||
free(chnp->a); chnp->m = 0; chnp->a = 0;
|
||
regp->n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t *)seq, regp->n, regp->a);
|
||
if (bwa_verbose >= 4) {
|
||
err_printf("* %ld chains remain after removing duplicated chains\n", regp->n);
|
||
for (j = 0; j < regp->n; ++j) {
|
||
mem_alnreg_t *p = ®p->a[j];
|
||
printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
|
||
}
|
||
}
|
||
for (j = 0; j < regp->n; ++j) {
|
||
mem_alnreg_t *p = ®p->a[j];
|
||
if (p->rid >= 0 && bns->anns[p->rid].is_alt)
|
||
p->is_alt = 1;
|
||
}
|
||
}
|
||
PROF_END(tprof[T_ALN_ALL][tid], aln_all);
|
||
|
||
#if 1
|
||
// 4. calc insert size
|
||
|
||
#define MIN_RATIO 0.8
|
||
if (calc_isize) {
|
||
PROF_START(ins_size);
|
||
for (i = 0; i < nseq>>1; ++i) {
|
||
int dir;
|
||
int64_t is;
|
||
mem_alnreg_v *r[2];
|
||
r[0] = (mem_alnreg_v*)®_arr[i<<1|0];
|
||
r[1] = (mem_alnreg_v*)®_arr[i<<1|1];
|
||
if (r[0]->n == 0 || r[1]->n == 0) continue;
|
||
if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
|
||
if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
|
||
if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr
|
||
dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
|
||
if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
|
||
}
|
||
PROF_END(tprof[T_INS_SIZE][tid], ins_size);
|
||
}
|
||
#endif
|
||
}
|
||
// 找smem,生成chain,然后align
|
||
static void worker_smem_align(void *data, int i, int tid)
|
||
{
|
||
mem_worker_t *w = (mem_worker_t *)data;
|
||
int start = i * w->opt->batch_size;
|
||
int end = MIN(start + w->opt->batch_size, w->n_reads);
|
||
mem_core_process(w->opt, w->fmt, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start, w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid);
|
||
}
|
||
|
||
static void worker_sam(void *data, int i, int tid)
|
||
{
|
||
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
|
||
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
|
||
extern void mem_matesw_batch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t *s, mem_alnreg_v *a, int data_size);
|
||
mem_worker_t *w = (mem_worker_t*)data;
|
||
int start = i*w->opt->batch_size;
|
||
int end = MIN(start + w->opt->batch_size, w->n_reads);
|
||
|
||
if (!(w->opt->flag&MEM_F_PE)) {
|
||
for (i=start; i<end; ++i) {
|
||
if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
|
||
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
|
||
if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]);
|
||
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, &w->sams[i]);
|
||
free(w->regs[i].a);
|
||
}
|
||
} else {
|
||
if (!(w->opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
|
||
mem_matesw_batch(w->opt, w->bns, w->pac, w->pes, &w->seqs[start], &w->regs[start], end-start);
|
||
}
|
||
|
||
for (i=start; i<end; i+=2) {
|
||
if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i].name);
|
||
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed+i)>>1, &w->seqs[i], &w->regs[i], &w->sams[i], tid);
|
||
free(w->regs[i].a); free(w->regs[i+1].a);
|
||
}
|
||
}
|
||
}
|
||
|
||
void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, seq_sam_t *sams)
|
||
{
|
||
extern void kt_for(int n_threads, void (*func)(void *, int, int), void *data, int n);
|
||
PROF_START(mem_prepare);
|
||
mem_pestat_t pes[4];
|
||
int batch_size = opt->batch_size; // 对于pair-end数据,必须是2的整数倍,因为要保证pair-end数据在同一个线程里
|
||
int n_batch = (n + batch_size - 1) / batch_size;
|
||
double ctime, rtime;
|
||
ctime = cputime(); rtime = realtime();
|
||
global_bns = w->bns;
|
||
|
||
w->opt = opt;
|
||
if (w->n < n) {
|
||
w->n = n;
|
||
w->regs = realloc(w->regs, n * sizeof(mem_alnreg_v));
|
||
}
|
||
w->seqs = seqs; w->n_processed = n_processed; w->sams = sams;
|
||
w->n_reads = n; w->pes = &pes[0];
|
||
|
||
#if 1
|
||
if ((opt->flag & MEM_F_PE) && !pes0) { // infer insert sizes if not provided
|
||
int i, j;
|
||
w->calc_isize = 1;
|
||
for (i = 0; i < opt->n_threads; ++i)
|
||
for (j = 0; j < 4; ++j)
|
||
w->isize_arr[i][j].n = 0;
|
||
}
|
||
#endif
|
||
PROF_END(gprof[G_MEM_PREPARE], mem_prepare);
|
||
|
||
PROF_START(kernel);
|
||
// kt_for(opt->n_threads, worker1, w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
|
||
kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions
|
||
PROF_END(gprof[G_MEM_KERNEL], kernel);
|
||
|
||
PROF_START(pestat);
|
||
if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
|
||
if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0
|
||
else mem_pestat(opt, w->bns->l_pac, n, w->isize_arr, pes); // otherwise, infer the insert size distribution from data
|
||
//else mem_pestat_old(opt, w->bns->l_pac, n, w->regs, pes);
|
||
}
|
||
PROF_END(gprof[G_MEM_PESTAT], pestat);
|
||
|
||
PROF_START(mem_sam);
|
||
kt_for(opt->n_threads, worker2, w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
|
||
//kt_for(opt->n_threads, worker_sam, w, n_batch);
|
||
PROF_END(gprof[G_MEM_SAM], mem_sam);
|
||
|
||
//free(w.regs);
|
||
if (bwa_verbose >= 3)
|
||
fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime);
|
||
}
|