r537: model the next base to GT/AG

[PMID:18688272] shows that the base following GT tends to be A or G (i.e. R) in
both human and yeast, and that the base preceeding AG tends to be C or T (i.e.
Y). In the new model, we pay no cost to GTr..yAG, but we pay half of the cost
if there is no r or y. This improves the junction accuracy when mapping to
human and mouse and decreases the accuacy when mapping to SIRV. My guess is
that SIRV does not honor this trend. Need to investigate in future.

Also in this commit, --cost-non-gt-ag is aliased to -C. The default is changed
to 9 instead of 5. I also added --splice-flank to enable the above model. This
may become the default once I confirm my hypothesis on SIRV.
This commit is contained in:
Heng Li 2017-10-28 00:25:01 -04:00
parent afc2f2e84b
commit 79b0caca95
7 changed files with 32 additions and 20 deletions

View File

@ -381,6 +381,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (is_splice) {
if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
if (opt->flag & MM_F_SPLICE_FLANK) extra_flag |= KSW_EZ_SPLICE_FLANK;
}
/* Look for the start and end of regions to perform DP. This sounds easy

1
ksw2.h
View File

@ -14,6 +14,7 @@
#define KSW_EZ_REV_CIGAR 0x80 // reverse CIGAR in the output
#define KSW_EZ_SPLICE_FOR 0x100
#define KSW_EZ_SPLICE_REV 0x200
#define KSW_EZ_SPLICE_FLANK 0x400
#ifdef __cplusplus
extern "C" {

View File

@ -111,19 +111,23 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
// set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272
memset(donor, -noncan, tlen_ * 16);
for (t = 0; t < tlen - 2; ++t) {
int is_can = 0; // is a canonical site
if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) is_can = 1;
if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) is_can = 1;
if (is_can) ((int8_t*)donor)[t] = 0;
for (t = 0; t < tlen - 4; ++t) {
int can_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG
if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) can_type = 1; // GTr...
if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) can_type = 1; // CTr...
if (can_type && (target[t+3] == 0 || target[t+3] == 2)) can_type = 2;
if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost;
if (can_type) ((int8_t*)donor)[t] = 0;
}
memset(acceptor, -noncan, tlen_ * 16);
for (t = 2; t < tlen; ++t) {
int is_can = 0;
if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) is_can = 1;
if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) is_can = 1;
if (is_can) ((int8_t*)acceptor)[t] = 0;
int can_type = 0;
if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) can_type = 1; // ...yAG
if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) can_type = 1; // ...yAC
if (can_type && (target[t-2] == 1 || target[t-2] == 3)) can_type = 2;
if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost;
}
}

13
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.3-r536-dirty"
#define MM_VERSION "2.3-r537-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -34,7 +34,7 @@ static struct option long_options[] = {
{ "min-dp-len", required_argument, 0, 0 },
{ "print-aln-seq", no_argument, 0, 0 },
{ "splice", no_argument, 0, 0 },
{ "cost-non-gt-ag", required_argument, 0, 0 },
{ "cost-non-gt-ag", required_argument, 0, 'C' },
{ "no-long-join", no_argument, 0, 0 },
{ "sr", no_argument, 0, 0 },
{ "frag", optional_argument, 0, 0 },
@ -42,6 +42,7 @@ static struct option long_options[] = {
{ "cs", optional_argument, 0, 0 },
{ "end-bonus", required_argument, 0, 0 },
{ "no-pairing", no_argument, 0, 0 },
{ "splice-flank", optional_argument, 0, 0 },
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
@ -66,7 +67,7 @@ static inline int64_t mm_parse_num(const char *str)
int main(int argc, char *argv[])
{
const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:L";
const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:LC:";
mm_mapopt_t opt;
mm_idxopt_t ipt;
int i, c, n_threads = 3, long_idx;
@ -117,6 +118,7 @@ int main(int argc, char *argv[])
else if (c == 'B') opt.b = atoi(optarg);
else if (c == 'z') opt.zdrop = atoi(optarg);
else if (c == 's') opt.min_dp_max = atoi(optarg);
else if (c == 'C') opt.noncan = atoi(optarg);
else if (c == 'I') ipt.batch_size = mm_parse_num(optarg);
else if (c == 'K') opt.mini_batch_size = (int)mm_parse_num(optarg);
else if (c == 'R') rg = optarg;
@ -132,7 +134,6 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr
else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus
@ -156,6 +157,10 @@ int main(int argc, char *argv[])
} else if (mm_verbose >= 2) {
fprintf(stderr, "[WARNING]\033[1;31m --cs only takes 'short' or 'long'. Invalid values are assumed to be 'short'.\033[0m\n");
}
} else if (c == 0 && long_idx == 19) { // --splice-flank
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
opt.flag |= MM_F_SPLICE_FLANK;
else opt.flag &= ~MM_F_SPLICE_FLANK;
} else if (c == 'S') {
opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG;
if (mm_verbose >= 2)

2
map.c
View File

@ -108,7 +108,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV;
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
mo->noncan = 5;
mo->noncan = 9;
mo->zdrop = 200;
} else return -1;
return 0;

View File

@ -23,6 +23,7 @@
#define MM_F_2_IO_THREADS 0x8000
#define MM_F_LONG_CIGAR 0x10000
#define MM_F_INDEPEND_SEG 0x20000
#define MM_F_SPLICE_FLANK 0x40000
#define MM_IDX_MAGIC "MMI\2"

View File

@ -1,4 +1,4 @@
.TH minimap2 1 "22 October 2017" "minimap2-2.2-dirty (r531)" "Bioinformatics tools"
.TH minimap2 1 "27 October 2017" "minimap2-2.3-dirty (r537)" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@ -219,6 +219,9 @@ costs
.RI min{ O1 + k * E1 , O2 + k * E2 }.
In the splice mode, the second gap penalties are not used.
.TP
.BI -C \ INT
Cost for a non-canonical GT-AG splicing [0]
.TP
.BI -z \ INT
Break an alignment if the running score drops too quickly along the diagonal of
the DP matrix (diagonal X-drop, or Z-drop) [400]. Increasing the value improves
@ -239,9 +242,6 @@ both strands;
.BR n :
no attempt to match GT-AG [n]
.TP
.BI --cost-non-gt-ag \ INT
Cost of non-canonical splicing sites [0].
.TP
.BI --end-bonus \ INT
Score bonus when alignment extends to the end of the query sequence [10].
.SS Input/output options
@ -371,8 +371,8 @@ is that this preset is not using HPC minimizers.
.B splice
Long-read spliced alignment
.RB ( -k15
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag
.BR 5 ).
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200
.BR -ub ).
In the splice mode, 1) long deletions are taken as introns and represented as
the
.RB ` N '