r445: pair coordinate adjustment working

Next: mapq adjustment, which will be tricky...
This commit is contained in:
Heng Li 2017-09-27 14:33:36 -04:00
parent a349d85280
commit 7e0d70bfd3
5 changed files with 76 additions and 14 deletions

2
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.2-r444-dirty"
#define MM_VERSION "2.2-r446-dirty"
#ifdef __linux__
#include <sys/resource.h>

8
map.c
View File

@ -13,7 +13,6 @@ void mm_mapopt_init(mm_mapopt_t *opt)
memset(opt, 0, sizeof(mm_mapopt_t));
opt->mid_occ_frac = 2e-4f;
opt->sdust_thres = 0; // no SDUST masking
opt->pe_ori = 0; // FF
opt->min_cnt = 3;
opt->min_chain_score = 40;
@ -35,6 +34,9 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->min_dp_max = opt->min_chain_score * opt->a;
opt->min_ksw_len = 200;
opt->mini_batch_size = 200000000;
opt->pe_ori = 0; // FF
opt->pe_bonus = 33;
}
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
@ -236,7 +238,7 @@ static void chain_post(const mm_mapopt_t *opt, const mm_idx_t *mi, void *km, int
if (!(opt->flag & MM_F_AVA)) { // don't choose primary mapping(s) for read overlap
mm_set_parent(km, opt->mask_level, *n_regs, regs, opt->a * 2 + opt->b);
if (n_segs <= 1) mm_select_sub(km, opt->pri_ratio, mi->k*2, opt->best_n, n_regs, regs);
else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, opt->max_gap_ref, mi->k*2, opt->best_n * n_segs, n_segs, qlens, n_regs, regs);
else mm_select_sub_multi(km, opt->pri_ratio, 0.2f, 0.7f, opt->max_gap_ref, mi->k*2, opt->best_n, n_segs, qlens, n_regs, regs);
if (!(opt->flag & MM_F_SPLICE) && !(opt->flag & MM_F_SR))
mm_join_long(km, opt, qlen, n_regs, regs, a); // TODO: this can be applied to all-vs-all in principle
}
@ -304,6 +306,8 @@ void mm_map_multi(const mm_idx_t *mi, int n_segs, const int *qlens, const char *
mm_set_mapq(n_regs[i], regs[i], opt->min_chain_score, opt->a, rep_len);
}
mm_seg_free(b->km, n_segs, seg);
if (n_segs == 2 && opt->pe_ori >= 0)
mm_pair(b->km, max_gap_ref, opt->pe_bonus, qlens, n_regs, regs);
}
kfree(b->km, a);

View File

@ -68,6 +68,7 @@ typedef struct {
int32_t as; // offset in the a[] array (for internal uses only)
int32_t fuzzy_mlen, fuzzy_blen; // seeded exact match length; seeded alignment block length (approximate)
uint32_t mapq:8, split:2, sam_pri:1, n_sub:21; // mapQ; split pattern; if SAM primary; number of suboptimal mappings
uint64_t hash;
mm_extra_t *p;
} mm_reg1_t;
@ -81,7 +82,6 @@ typedef struct {
typedef struct {
int sdust_thres; // score threshold for SDUST; 0 to disable
int flag; // see MM_F_* macros
int pe_ori;
int bw; // bandwidth
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
@ -102,6 +102,8 @@ typedef struct {
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
int min_ksw_len;
int pe_ori, pe_bonus;
float mid_occ_frac; // only used by mm_mapopt_update(); see below
int32_t mid_occ; // ignore seeds with occurrences above this threshold
int mini_batch_size; // size of a batch of query bases to process in parallel

View File

@ -80,6 +80,7 @@ void mm_set_mapq(int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, in
mm_seg_t *mm_seg_gen(void *km, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a);
void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
void mm_pair(void *km, int max_gap_ref, int dp_bonus, const int *qlens, int *n_regs, mm_reg1_t **regs);
#ifdef __cplusplus
}

75
pe.c
View File

@ -1,5 +1,6 @@
#include <stdlib.h>
#include "mmpriv.h"
#include "kalloc.h"
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r)
{
@ -43,29 +44,83 @@ void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int
#include "ksort.h"
typedef struct {
int s;
int s, rev;
uint64_t key;
mm_reg1_t *r;
} pair_arr_t;
#define sort_key_pair(a) ((a).key)
KRADIX_SORT_INIT(pair, pair_arr_t, sort_key_pair, 8)
/*
void mm_pair(void *km, int max_gap_ref, int *n_regs, mm_reg1_t **regs)
void mm_pair(void *km, int max_gap_ref, int pe_bonus, const int *qlens, int *n_regs, mm_reg1_t **regs)
{
int i, j, s, n, last = -1;
int i, j, s, n, last[2], dp_thres, segs = 0, n_pairs = 0, max_idx[2];
int64_t max, max2;
pair_arr_t *a;
a = (pair_arr_t*)kmalloc(km, (n_regs[0] + n_regs[1]) * sizeof(pair_arr_t));
for (s = n = 0; s < 2; ++s)
for (s = n = 0, dp_thres = 0; s < 2; ++s) {
int max = 0;
for (i = 0; i < n_regs[s]; ++i) {
a[n].s = s;
a[n].r = &regs[s][i];
a[n++].key = (uint64_t)a[n].r->rid << 32 | a[n].r->rs;
a[n].rev = a[n].r->rev;
a[n].key = (uint64_t)a[n].r->rid << 32 | a[n].r->rs<<1 | (s^a[n].rev);
max = max > a[n].r->p->dp_max? max : a[n].r->p->dp_max;
++n;
segs |= 1<<s;
}
radix_sort_pair(n, a);
for (i = 0; i < n; ++i) {
int pre_dir = seg_rev? !a[i].r->rev : a[i].r->rev;
dp_thres += max;
}
if (segs != 3) {
kfree(km, a); // only one end is mapped
return;
}
dp_thres -= pe_bonus;
if (dp_thres < 0) dp_thres = 0;
radix_sort_pair(a, a + n);
max = max2 = -1;
max_idx[0] = max_idx[1] = -1;
last[0] = last[1] = -1;
for (i = 0; i < n; ++i) {
if (a[i].key & 1) { // reverse first read or forward second read
mm_reg1_t *q, *r;
if (last[a[i].rev] < 0) continue;
r = a[i].r;
q = a[last[a[i].rev]].r;
if (r->rid != q->rid || r->rs - q->re > max_gap_ref) continue;
for (j = last[a[i].rev]; j >= 0; --j) {
int sum;
if (a[j].rev != a[i].rev || a[j].s == a[i].s) continue;
q = a[j].r;
if (r->rid != q->rid || r->rs - q->re > max_gap_ref) break;
if (r->p->dp_max + q->p->dp_max < dp_thres) continue;
++n_pairs;
sum = r->p->dp_max + q->p->dp_max;
if (sum > max) {
max_idx[a[j].s] = j, max_idx[a[i].s] = i;
max2 = max;
max = sum;
} else if (sum > max2)
max2 = sum;
}
} else { // forward first read or reverse second read
last[a[i].rev] = i;
}
}
if (max > 0) {
for (s = 0; s < 2; ++s) {
mm_reg1_t *r = a[max_idx[s]].r;
if (r->id != r->parent) {
mm_reg1_t *p = &regs[s][r->parent];
for (i = 0; i < n_regs[s]; ++i)
if (regs[s][i].parent == p->id)
regs[s][i].parent = r->id;
}
}
}
kfree(km, a);
}
*/