r618: heuristics to avoid tiny terminal exons

This commit is contained in:
Heng Li 2017-12-10 21:52:07 -05:00
parent 824712a4ee
commit 98a6e52c06
6 changed files with 69 additions and 4 deletions

49
align.c
View File

@ -1,6 +1,7 @@
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include "minimap.h"
#include "mmpriv.h"
#include "ksw2.h"
@ -341,6 +342,48 @@ static void mm_max_stretch(const mm_mapopt_t *opt, const mm_reg1_t *r, const mm1
*as = max_i, *cnt = max_len;
}
static int mm_seed_ext_score(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a)
{
uint8_t *qseq, *tseq;
int q_span = a->y>>32&0xff, qs, qe, rs, re, rid, score, q_off, t_off, ext_len = opt->anchor_ext_len;
void *qp;
rid = a->x<<1>>33;
re = (uint32_t)a->x + 1, rs = re - q_span;
qe = (uint32_t)a->y + 1, qs = qe - q_span;
rs = rs - ext_len > 0? rs - ext_len : 0;
qs = qs - ext_len > 0? qs - ext_len : 0;
re = re + ext_len < mi->seq[rid].len? re + ext_len : mi->seq[rid].len;
qe = qe + ext_len < qlen? qe + ext_len : qlen;
tseq = (uint8_t*)kmalloc(km, re - rs);
mm_idx_getseq(mi, rid, rs, re, tseq);
qseq = qseq0[a->x>>63] + qs;
qp = ksw_ll_qinit(km, 2, qe - qs, qseq, 5, mat);
score = ksw_ll_i16(qp, re - rs, tseq, opt->q, opt->e, &q_off, &t_off);
kfree(km, tseq);
kfree(km, qp);
return score;
}
static void mm_fix_bad_ends_splice(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, const mm_reg1_t *r, const int8_t mat[25], int qlen, uint8_t *qseq0[2], const mm128_t *a, int *as1, int *cnt1)
{ // this assumes a very crude k-mer based mode; it is not necessary to use a good model just for filtering bounary exons
int score;
double log_gap;
*as1 = r->as, *cnt1 = r->cnt;
if (r->cnt < 3) return;
log_gap = log((int32_t)a[r->as + 1].x - (int32_t)a[r->as].x);
if ((a[r->as].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as]);
if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift) // a more exact format is "score < log_4(gap) + shift"
++(*as1), --(*cnt1);
}
log_gap = log((int32_t)a[r->as + r->cnt - 1].x - (int32_t)a[r->as + r->cnt - 2].x);
if ((a[r->as + r->cnt - 1].y>>32&0xff) < log_gap + opt->anchor_ext_shift) {
score = mm_seed_ext_score(km, opt, mi, mat, qlen, qseq0, &a[r->as + r->cnt - 1]);
if ((double)score / mat[0] < log_gap + opt->anchor_ext_shift)
--(*cnt1);
}
}
static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, uint8_t *qseq0[2], mm_reg1_t *r, mm_reg1_t *r2, int n_a, mm128_t *a, ksw_extz_t *ez, int splice_flag)
{
int is_sr = !!(opt->flag & MM_F_SR), is_splice = !!(opt->flag & MM_F_SPLICE);
@ -365,9 +408,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
re = (int32_t)a[as1+cnt1-1].x + 1;
qe = (int32_t)a[as1+cnt1-1].y + 1;
} else {
if (!is_splice)
if (is_splice) {
mm_fix_bad_ends_splice(km, opt, mi, r, mat, qlen, qseq0, a, &as1, &cnt1);
} else {
mm_fix_bad_ends(r, a, opt->bw, &as1, &cnt1);
else as1 = r->as, cnt1 = r->cnt;
}
mm_filter_bad_seeds(km, as1, cnt1, a, 10, 40, opt->max_gap>>1, 10);
mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);

View File

@ -44,7 +44,11 @@ void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const
r->div = -1.0f;
if (r->cnt == 0) continue;
st = en = get_mini_idx(qlen, r->rev? &a[r->as + r->cnt - 1] : &a[r->as], n, mini_pos);
assert(st >= 0);
if (st < 0) {
if (mm_verbose >= 2)
fprintf(stderr, "[WARNING] logic inconsistency in mm_est_err(). Please contact the developer.\n");
continue;
}
l_ref = mi->seq[r->rid].len;
for (k = 1, j = st + 1, n_match = 1; j < n && k < r->cnt; ++j) {
int32_t x;

4
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.5-r617-dirty"
#define MM_VERSION "2.5-r618-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -44,6 +44,7 @@ static struct option long_options[] = {
{ "no-pairing", no_argument, 0, 0 },
{ "splice-flank", optional_argument, 0, 0 },
{ "idx-no-seq", no_argument, 0, 0 },
{ "end-seed-pen", required_argument, 0, 0 }, // 21
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
@ -140,6 +141,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus
else if (c == 0 && long_idx ==18) opt.flag |= MM_F_INDEPEND_SEG; // --no-pairing
else if (c == 0 && long_idx ==20) ipt.flag |= MM_I_NO_SEQ; // --idx-no-seq
else if (c == 0 && long_idx ==21) opt.anchor_ext_shift = atoi(optarg); // --end-seed-pen
else if (c == 0 && long_idx == 14) { // --frag
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
opt.flag |= MM_F_FRAG_MODE;

1
map.c
View File

@ -36,6 +36,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->end_bonus = -1;
opt->min_dp_max = opt->min_chain_score * opt->a;
opt->min_ksw_len = 200;
opt->anchor_ext_len = 20, opt->anchor_ext_shift = 6;
opt->mini_batch_size = 500000000;
opt->pe_ori = 0; // FF

View File

@ -116,6 +116,7 @@ typedef struct {
int end_bonus;
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
int min_ksw_len;
int anchor_ext_len, anchor_ext_shift;
int pe_ori, pe_bonus;

View File

@ -272,6 +272,18 @@ However, the SIRV control does not honor this trend
on SIRV data, please add
.B --splice-flank=no
to the command line.
.TP
.BI --end-seed-pen \ INT
Drop a terminal anchor if
.IR s <log( g )+ INT ,
where
.I s
is the local alignment score around the anchor and
.I g
the length of the terminal gap in the chain. This option is only effective
with
.BR --splice .
It helps to avoid tiny terminal exons. [6]
.SS Input/output options
.TP 10
.B -a