r618: heuristics to avoid tiny terminal exons
This commit is contained in:
parent
824712a4ee
commit
98a6e52c06
49
align.c
49
align.c
|
|
@ -1,6 +1,7 @@
|
|||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include "minimap.h"
|
||||
#include "mmpriv.h"
|
||||
#include "ksw2.h"
|
||||
|
|
@ -341,6 +342,48 @@ static void mm_max_stretch(const mm_mapopt_t *opt, const mm_reg1_t *r, const mm1
|
|||
*as = max_i, *cnt = max_len;
|
||||
}
|
||||
|
||||
static int mm_seed_ext_score(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a)
|
||||
{
|
||||
uint8_t *qseq, *tseq;
|
||||
int q_span = a->y>>32&0xff, qs, qe, rs, re, rid, score, q_off, t_off, ext_len = opt->anchor_ext_len;
|
||||
void *qp;
|
||||
rid = a->x<<1>>33;
|
||||
re = (uint32_t)a->x + 1, rs = re - q_span;
|
||||
qe = (uint32_t)a->y + 1, qs = qe - q_span;
|
||||
rs = rs - ext_len > 0? rs - ext_len : 0;
|
||||
qs = qs - ext_len > 0? qs - ext_len : 0;
|
||||
re = re + ext_len < mi->seq[rid].len? re + ext_len : mi->seq[rid].len;
|
||||
qe = qe + ext_len < qlen? qe + ext_len : qlen;
|
||||
tseq = (uint8_t*)kmalloc(km, re - rs);
|
||||
mm_idx_getseq(mi, rid, rs, re, tseq);
|
||||
qseq = qseq0[a->x>>63] + qs;
|
||||
qp = ksw_ll_qinit(km, 2, qe - qs, qseq, 5, mat);
|
||||
score = ksw_ll_i16(qp, re - rs, tseq, opt->q, opt->e, &q_off, &t_off);
|
||||
kfree(km, tseq);
|
||||
kfree(km, qp);
|
||||
return score;
|
||||
}
|
||||
|
||||
static void mm_fix_bad_ends_splice(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const mm_reg1_t *r, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a, int *as1, int *cnt1)
|
||||
{ // this assumes a very crude k-mer based mode; it is not necessary to use a good model just for filtering bounary exons
|
||||
int score;
|
||||
double log_gap;
|
||||
*as1 = r->as, *cnt1 = r->cnt;
|
||||
if (r->cnt < 3) return;
|
||||
log_gap = log((int32_t)a[r->as + 1].x - (int32_t)a[r->as].x);
|
||||
if ((a[r->as].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
|
||||
score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as]);
|
||||
if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift) // a more exact format is "score < log_4(gap) + shift"
|
||||
++(*as1), --(*cnt1);
|
||||
}
|
||||
log_gap = log((int32_t)a[r->as + r->cnt - 1].x - (int32_t)a[r->as + r->cnt - 2].x);
|
||||
if ((a[r->as + r->cnt - 1].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
|
||||
score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as + r->cnt - 1]);
|
||||
if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift)
|
||||
--(*cnt1);
|
||||
}
|
||||
}
|
||||
|
||||
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag)
|
||||
{
|
||||
int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
|
||||
|
|
@ -365,9 +408,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
re = (int32_t)a[as1+cnt1-1].x + 1;
|
||||
qe = (int32_t)a[as1+cnt1-1].y + 1;
|
||||
} else {
|
||||
if (!is_splice)
|
||||
if (is_splice) {
|
||||
mm_fix_bad_ends_splice(km, opt, mi, r, mat, qlen, qseq0, a, &as1, &cnt1);
|
||||
} else {
|
||||
mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
|
||||
else as1 = r->as, cnt1 = r->cnt;
|
||||
}
|
||||
mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
|
||||
mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
|
||||
mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);
|
||||
|
|
|
|||
6
esterr.c
6
esterr.c
|
|
@ -44,7 +44,11 @@ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const
|
|||
r->div = -1.0f;
|
||||
if (r->cnt == 0) continue;
|
||||
st = en = get_mini_idx(qlen, r->rev? &a[r->as + r->cnt - 1] : &a[r->as], n, mini_pos);
|
||||
assert(st >= 0);
|
||||
if (st < 0) {
|
||||
if (mm_verbose >= 2)
|
||||
fprintf(stderr, "[WARNING] logic inconsistency in mm_est_err(). Please contact the developer.\n");
|
||||
continue;
|
||||
}
|
||||
l_ref = mi->seq[r->rid].len;
|
||||
for (k = 1, j = st + 1, n_match = 1; j < n && k < r->cnt; ++j) {
|
||||
int32_t x;
|
||||
|
|
|
|||
4
main.c
4
main.c
|
|
@ -6,7 +6,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "getopt.h"
|
||||
|
||||
#define MM_VERSION "2.5-r617-dirty"
|
||||
#define MM_VERSION "2.5-r618-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -44,6 +44,7 @@ static struct option long_options[] = {
|
|||
{ "no-pairing", no_argument, 0, 0 },
|
||||
{ "splice-flank", optional_argument, 0, 0 },
|
||||
{ "idx-no-seq", no_argument, 0, 0 },
|
||||
{ "end-seed-pen", required_argument, 0, 0 }, // 21
|
||||
{ "help", no_argument, 0, 'h' },
|
||||
{ "max-intron-len", required_argument, 0, 'G' },
|
||||
{ "version", no_argument, 0, 'V' },
|
||||
|
|
@ -140,6 +141,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus
|
||||
else if (c == 0 && long_idx ==18) opt.flag |= MM_F_INDEPEND_SEG; // --no-pairing
|
||||
else if (c == 0 && long_idx ==20) ipt.flag |= MM_I_NO_SEQ; // --idx-no-seq
|
||||
else if (c == 0 && long_idx ==21) opt.anchor_ext_shift = atoi(optarg); // --end-seed-pen
|
||||
else if (c == 0 && long_idx == 14) { // --frag
|
||||
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
|
||||
opt.flag |= MM_F_FRAG_MODE;
|
||||
|
|
|
|||
1
map.c
1
map.c
|
|
@ -36,6 +36,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
|
|||
opt->end_bonus = -1;
|
||||
opt->min_dp_max = opt->min_chain_score * opt->a;
|
||||
opt->min_ksw_len = 200;
|
||||
opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6;
|
||||
opt->mini_batch_size = 500000000;
|
||||
|
||||
opt->pe_ori = 0; // FF
|
||||
|
|
|
|||
|
|
@ -116,6 +116,7 @@ typedef struct {
|
|||
int end_bonus;
|
||||
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
|
||||
int min_ksw_len;
|
||||
int anchor_ext_len, anchor_ext_shift;
|
||||
|
||||
int pe_ori, pe_bonus;
|
||||
|
||||
|
|
|
|||
12
minimap2.1
12
minimap2.1
|
|
@ -272,6 +272,18 @@ However, the SIRV control does not honor this trend
|
|||
on SIRV data, please add
|
||||
.B --splice-flank=no
|
||||
to the command line.
|
||||
.TP
|
||||
.BI --end-seed-pen \ INT
|
||||
Drop a terminal anchor if
|
||||
.IR s <log( g )+ INT ,
|
||||
where
|
||||
.I s
|
||||
is the local alignment score around the anchor and
|
||||
.I g
|
||||
the length of the terminal gap in the chain. This option is only effective
|
||||
with
|
||||
.BR --splice .
|
||||
It helps to avoid tiny terminal exons. [6]
|
||||
.SS Input/output options
|
||||
.TP 10
|
||||
.B -a
|
||||
|
|
|
|||
Loading…
Reference in New Issue