r461: randomize repetitive hits

This commit is contained in:
Heng Li 2017-10-04 13:05:18 -04:00
parent cf55c84056
commit 2a1e738a94
6 changed files with 52 additions and 25 deletions

37
hit.c
View File

@ -3,6 +3,7 @@
#include <math.h>
#include "mmpriv.h"
#include "kalloc.h"
#include "khash.h"
static inline void mm_cal_fuzzy_len(mm_reg1_t *r, const mm128_t *a)
{
@ -36,7 +37,19 @@ static inline void mm_reg_set_coor(mm_reg1_t *r, int32_t qlen, const mm128_t *a)
mm_cal_fuzzy_len(r, a);
}
mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a) // convert chains to hits
static inline uint64_t hash64(uint64_t key)
{
key = (~key + (key << 21));
key = key ^ key >> 24;
key = ((key + (key << 3)) + (key << 8));
key = key ^ key >> 14;
key = ((key + (key << 2)) + (key << 4));
key = key ^ key >> 28;
key = (key + (key << 31));
return key;
}
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a) // convert chains to hits
{
mm128_t *z, tmp;
mm_reg1_t *r;
@ -47,7 +60,9 @@ mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a) //
// sort by score
z = (mm128_t*)kmalloc(km, n_u * 16);
for (i = k = 0; i < n_u; ++i) {
z[i].x = u[i] >> 32;
uint32_t h;
h = (uint32_t)hash64((hash64(a[k].x) + hash64(a[k].y)) ^ hash);
z[i].x = u[i] ^ h; // u[i] -- higher 32 bits: chain score; lower 32 bits: number of seeds in the chain
z[i].y = (uint64_t)k << 32 | (int32_t)u[i];
k += (int32_t)u[i];
}
@ -61,7 +76,8 @@ mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a) //
mm_reg1_t *ri = &r[i];
ri->id = i;
ri->parent = MM_PARENT_UNSET;
ri->score = z[i].x;
ri->score = z[i].x >> 32;
ri->hash = (uint32_t)z[i].x;
ri->cnt = (int32_t)z[i].y;
ri->as = z[i].y >> 32;
mm_reg_set_coor(ri, qlen, a);
@ -124,24 +140,25 @@ void mm_set_parent(void *km, float mask_level, int n, mm_reg1_t *r, int sub_diff
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r)
{
int32_t i, n_aux, n = *n_regs;
uint64_t *aux;
mm128_t *aux;
mm_reg1_t *t;
if (n <= 1) return;
aux = (uint64_t*)kmalloc(km, n * 8);
aux = (mm128_t*)kmalloc(km, n * 16);
t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
for (i = n_aux = 0; i < n; ++i) {
if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
assert(r[i].p);
aux[n_aux++] = (uint64_t)r[i].p->dp_max << 32 | i;
aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash;
aux[n_aux++].y = i;
} else if (r[i].p) {
free(r[i].p);
r[i].p = 0;
}
}
radix_sort_64(aux, aux + n_aux);
radix_sort_128x(aux, aux + n_aux);
for (i = n_aux - 1; i >= 0; --i)
t[n_aux - 1 - i] = r[(int32_t)aux[i]];
t[n_aux - 1 - i] = r[aux[i].y];
memcpy(r, t, sizeof(mm_reg1_t) * n_aux);
*n_regs = n_aux;
kfree(km, aux);
@ -294,7 +311,7 @@ void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs_, mm_r
}
}
mm_seg_t *mm_seg_gen(void *km, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
{
int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;
mm_seg_t *seg;
@ -340,7 +357,7 @@ mm_seg_t *mm_seg_gen(void *km, int n_segs, const int *qlens, int n_regs0, const
}
}
for (s = 0; s < n_segs; ++s) {
regs[s] = mm_gen_regs(km, qlens[s], seg[s].n_u, seg[s].u, seg[s].a);
regs[s] = mm_gen_regs(km, hash, qlens[s], seg[s].n_u, seg[s].u, seg[s].a);
n_regs[s] = seg[s].n_u;
for (i = 0; i < n_regs[s]; ++i)
regs[s][i].seg_split = 1;

8
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.2-r460-dirty"
#define MM_VERSION "2.2-r461-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -37,8 +37,9 @@ static struct option long_options[] = {
{ "cost-non-gt-ag", required_argument, 0, 0 },
{ "no-sam-sq", no_argument, 0, 0 },
{ "sr", no_argument, 0, 0 },
{ "multi-seg", optional_argument, 0, 0 },
{ "multi", optional_argument, 0, 0 },
{ "no-long-join", no_argument, 0, 0 },
{ "seed", required_argument, 0, 0 },
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
@ -118,7 +119,8 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq
else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr
else if (c == 0 && long_idx ==15) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
else if (c == 0 && long_idx ==14) { // --multi-seg
else if (c == 0 && long_idx ==16) opt.seed = atoi(optarg); // --seed
else if (c == 0 && long_idx ==14) { // --multi
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
opt.flag |= MM_F_MULTI_SEG;
else opt.flag &= ~MM_F_MULTI_SEG;

13
map.c
View File

@ -7,10 +7,12 @@
#include "sdust.h"
#include "mmpriv.h"
#include "bseq.h"
#include "khash.h"
void mm_mapopt_init(mm_mapopt_t *opt)
{
memset(opt, 0, sizeof(mm_mapopt_t));
opt->seed = 11;
opt->mid_occ_frac = 2e-4f;
opt->sdust_thres = 0; // no SDUST masking
@ -243,7 +245,7 @@ static void chain_post(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int
if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, opt->max_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs);
if (!(opt->flag & MM_F_SPLICE) && !(opt->flag & MM_F_SR) && !(opt->flag & MM_F_NO_LJOIN))
mm_join_long(km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
mm_join_long(km, opt, qlen, n_regs, regs, a);
}
}
@ -262,6 +264,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k
void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
int i, j, max_gap_ref, rep_len, qlen_sum, n_regs0, rechain = 0;
uint32_t hash;
int64_t n_a;
uint64_t *u;
mm128_t *a;
@ -272,6 +275,10 @@ void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char *
if (qlen_sum == 0 || n_segs <= 0 || n_segs > MM_MAX_SEG) return;
hash = qname? __ac_X31_hash_string(qname) : 0;
hash ^= __ac_Wang_hash(qlen_sum) + __ac_Wang_hash(opt->seed);
hash = __ac_Wang_hash(hash);
collect_minimizers(opt, mi, n_segs, qlens, seqs, b);
a = collect_seed_hits(opt, opt->mid_occ, mi, qname, qlen_sum, &n_a, &rep_len, b);
radix_sort_128x(a, a + n_a);
@ -308,7 +315,7 @@ void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char *
}
}
regs0 = mm_gen_regs(b->km, qlen_sum, n_regs0, u, a);
regs0 = mm_gen_regs(b->km, hash, qlen_sum, n_regs0, u, a);
if (mm_dbg_flag & MM_DBG_PRINT_SEED)
for (j = 0; j < n_regs0; ++j)
@ -324,7 +331,7 @@ void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char *
n_regs[0] = n_regs0, regs[0] = regs0;
} else {
mm_seg_t *seg;
seg = mm_seg_gen(b->km, n_segs, qlens, n_regs0, regs0, n_regs, regs, a);
seg = mm_seg_gen(b->km, hash, n_segs, qlens, n_regs0, regs0, n_regs, regs, a);
free(regs0);
for (i = 0; i < n_segs; ++i) {
mm_set_parent(b->km, opt->mask_level, n_regs[i], regs[i], opt->a * 2 + opt->b);

View File

@ -69,7 +69,7 @@ typedef struct {
int32_t as; // offset in the a[] array (for internal uses only)
int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate)
uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // mapQ; split pattern; if SAM primary; number of suboptimal mappings
uint64_t hash;
uint32_t hash, dummy;
mm_extra_t *p;
} mm_reg1_t;
@ -81,6 +81,7 @@ typedef struct {
} mm_idxopt_t;
typedef struct {
int seed;
int sdust_thres; // score threshold for SDUST; 0 to disable
int flag; // see MM_F_* macros

View File

@ -66,7 +66,7 @@ int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, int qlen, int n_u, uint64_t *u, mm128_t *a);
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a);
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a);
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs);
int mm_set_sam_pri(int n, mm_reg1_t *r);
@ -78,7 +78,7 @@ void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_re
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r);
void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len);
mm_seg_t *mm_seg_gen(void *km, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a);
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a);
void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);

12
pe.c
View File

@ -94,15 +94,15 @@ void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc
q = a[last[a[i].rev]].r;
if (r->rid != q->rid || r->rs - q->re > max_gap_ref) continue;
for (j = last[a[i].rev]; j >= 0; --j) {
int64_t sum;
int64_t score;
if (a[j].rev != a[i].rev || a[j].s == a[i].s) continue;
q = a[j].r;
if (r->rid != q->rid || r->rs - q->re > max_gap_ref) break;
if (r->p->dp_max + q->p->dp_max < dp_thres) continue;
sum = r->p->dp_max + q->p->dp_max;
if (sum > max)
max = sum, max_idx[a[j].s] = j, max_idx[a[i].s] = i;
kv_push(uint64_t, km, sc, sum);
score = (int64_t)(r->p->dp_max + q->p->dp_max) | (r->hash + q->hash);
if (score > max)
max = score, max_idx[a[j].s] = j, max_idx[a[i].s] = i;
kv_push(uint64_t, km, sc, score);
}
} else { // forward first read or reverse second read
last[a[i].rev] = i;
@ -138,7 +138,7 @@ void mm_pair(void *km, int max_gap_ref, int pe_bonus, int sub_diff, int match_sc
if (sc.n == 1) {
if (r[0]->mapq < 2) r[0]->mapq = 2;
if (r[1]->mapq < 2) r[1]->mapq = 2;
} else if (max > sc.a[sc.n - 2]) {
} else if (max>>32 > sc.a[sc.n - 2]>>32) {
if (r[0]->mapq < 1) r[0]->mapq = 1;
if (r[1]->mapq < 1) r[1]->mapq = 1;
}