r537: model the next base to GT/AG
[PMID:18688272] shows that the base following GT tends to be A or G (i.e. R) in both human and yeast, and that the base preceeding AG tends to be C or T (i.e. Y). In the new model, we pay no cost to GTr..yAG, but we pay half of the cost if there is no r or y. This improves the junction accuracy when mapping to human and mouse and decreases the accuacy when mapping to SIRV. My guess is that SIRV does not honor this trend. Need to investigate in future. Also in this commit, --cost-non-gt-ag is aliased to -C. The default is changed to 9 instead of 5. I also added --splice-flank to enable the above model. This may become the default once I confirm my hypothesis on SIRV.
This commit is contained in:
parent
afc2f2e84b
commit
79b0caca95
1
align.c
1
align.c
|
|
@ -381,6 +381,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
if (is_splice) {
|
||||
if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
|
||||
if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
|
||||
if (opt->flag & MM_F_SPLICE_FLANK) extra_flag |= KSW_EZ_SPLICE_FLANK;
|
||||
}
|
||||
|
||||
/* Look for the start and end of regions to perform DP. This sounds easy
|
||||
|
|
|
|||
1
ksw2.h
1
ksw2.h
|
|
@ -14,6 +14,7 @@
|
|||
#define KSW_EZ_REV_CIGAR 0x80 // reverse CIGAR in the output
|
||||
#define KSW_EZ_SPLICE_FOR 0x100
|
||||
#define KSW_EZ_SPLICE_REV 0x200
|
||||
#define KSW_EZ_SPLICE_FLANK 0x400
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
|
|||
|
|
@ -111,19 +111,23 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
|
|||
|
||||
// set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
|
||||
if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
|
||||
int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272
|
||||
memset(donor, -noncan, tlen_ * 16);
|
||||
for (t = 0; t < tlen - 2; ++t) {
|
||||
int is_can = 0; // is a canonical site
|
||||
if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) is_can = 1;
|
||||
if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) is_can = 1;
|
||||
if (is_can) ((int8_t*)donor)[t] = 0;
|
||||
for (t = 0; t < tlen - 4; ++t) {
|
||||
int can_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG
|
||||
if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) can_type = 1; // GTr...
|
||||
if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) can_type = 1; // CTr...
|
||||
if (can_type && (target[t+3] == 0 || target[t+3] == 2)) can_type = 2;
|
||||
if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost;
|
||||
if (can_type) ((int8_t*)donor)[t] = 0;
|
||||
}
|
||||
memset(acceptor, -noncan, tlen_ * 16);
|
||||
for (t = 2; t < tlen; ++t) {
|
||||
int is_can = 0;
|
||||
if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) is_can = 1;
|
||||
if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) is_can = 1;
|
||||
if (is_can) ((int8_t*)acceptor)[t] = 0;
|
||||
int can_type = 0;
|
||||
if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) can_type = 1; // ...yAG
|
||||
if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) can_type = 1; // ...yAC
|
||||
if (can_type && (target[t-2] == 1 || target[t-2] == 3)) can_type = 2;
|
||||
if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
13
main.c
13
main.c
|
|
@ -6,7 +6,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "getopt.h"
|
||||
|
||||
#define MM_VERSION "2.3-r536-dirty"
|
||||
#define MM_VERSION "2.3-r537-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -34,7 +34,7 @@ static struct option long_options[] = {
|
|||
{ "min-dp-len", required_argument, 0, 0 },
|
||||
{ "print-aln-seq", no_argument, 0, 0 },
|
||||
{ "splice", no_argument, 0, 0 },
|
||||
{ "cost-non-gt-ag", required_argument, 0, 0 },
|
||||
{ "cost-non-gt-ag", required_argument, 0, 'C' },
|
||||
{ "no-long-join", no_argument, 0, 0 },
|
||||
{ "sr", no_argument, 0, 0 },
|
||||
{ "frag", optional_argument, 0, 0 },
|
||||
|
|
@ -42,6 +42,7 @@ static struct option long_options[] = {
|
|||
{ "cs", optional_argument, 0, 0 },
|
||||
{ "end-bonus", required_argument, 0, 0 },
|
||||
{ "no-pairing", no_argument, 0, 0 },
|
||||
{ "splice-flank", optional_argument, 0, 0 },
|
||||
{ "help", no_argument, 0, 'h' },
|
||||
{ "max-intron-len", required_argument, 0, 'G' },
|
||||
{ "version", no_argument, 0, 'V' },
|
||||
|
|
@ -66,7 +67,7 @@ static inline int64_t mm_parse_num(const char *str)
|
|||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:L";
|
||||
const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:LC:";
|
||||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
int i, c, n_threads = 3, long_idx;
|
||||
|
|
@ -117,6 +118,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'B') opt.b = atoi(optarg);
|
||||
else if (c == 'z') opt.zdrop = atoi(optarg);
|
||||
else if (c == 's') opt.min_dp_max = atoi(optarg);
|
||||
else if (c == 'C') opt.noncan = atoi(optarg);
|
||||
else if (c == 'I') ipt.batch_size = mm_parse_num(optarg);
|
||||
else if (c == 'K') opt.mini_batch_size = (int)mm_parse_num(optarg);
|
||||
else if (c == 'R') rg = optarg;
|
||||
|
|
@ -132,7 +134,6 @@ int main(int argc, char *argv[])
|
|||
else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len
|
||||
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
|
||||
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
|
||||
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
|
||||
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_LJOIN; // --no-long-join
|
||||
else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr
|
||||
else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus
|
||||
|
|
@ -156,6 +157,10 @@ int main(int argc, char *argv[])
|
|||
} else if (mm_verbose >= 2) {
|
||||
fprintf(stderr, "[WARNING]\033[1;31m --cs only takes 'short' or 'long'. Invalid values are assumed to be 'short'.\033[0m\n");
|
||||
}
|
||||
} else if (c == 0 && long_idx == 19) { // --splice-flank
|
||||
if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0)
|
||||
opt.flag |= MM_F_SPLICE_FLANK;
|
||||
else opt.flag &= ~MM_F_SPLICE_FLANK;
|
||||
} else if (c == 'S') {
|
||||
opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG;
|
||||
if (mm_verbose >= 2)
|
||||
|
|
|
|||
2
map.c
2
map.c
|
|
@ -108,7 +108,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
|||
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV;
|
||||
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
|
||||
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
|
||||
mo->noncan = 5;
|
||||
mo->noncan = 9;
|
||||
mo->zdrop = 200;
|
||||
} else return -1;
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@
|
|||
#define MM_F_2_IO_THREADS 0x8000
|
||||
#define MM_F_LONG_CIGAR 0x10000
|
||||
#define MM_F_INDEPEND_SEG 0x20000
|
||||
#define MM_F_SPLICE_FLANK 0x40000
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
|
|
|
|||
12
minimap2.1
12
minimap2.1
|
|
@ -1,4 +1,4 @@
|
|||
.TH minimap2 1 "22 October 2017" "minimap2-2.2-dirty (r531)" "Bioinformatics tools"
|
||||
.TH minimap2 1 "27 October 2017" "minimap2-2.3-dirty (r537)" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
minimap2 - mapping and alignment between collections of DNA sequences
|
||||
|
|
@ -219,6 +219,9 @@ costs
|
|||
.RI min{ O1 + k * E1 , O2 + k * E2 }.
|
||||
In the splice mode, the second gap penalties are not used.
|
||||
.TP
|
||||
.BI -C \ INT
|
||||
Cost for a non-canonical GT-AG splicing [0]
|
||||
.TP
|
||||
.BI -z \ INT
|
||||
Break an alignment if the running score drops too quickly along the diagonal of
|
||||
the DP matrix (diagonal X-drop, or Z-drop) [400]. Increasing the value improves
|
||||
|
|
@ -239,9 +242,6 @@ both strands;
|
|||
.BR n :
|
||||
no attempt to match GT-AG [n]
|
||||
.TP
|
||||
.BI --cost-non-gt-ag \ INT
|
||||
Cost of non-canonical splicing sites [0].
|
||||
.TP
|
||||
.BI --end-bonus \ INT
|
||||
Score bonus when alignment extends to the end of the query sequence [10].
|
||||
.SS Input/output options
|
||||
|
|
@ -371,8 +371,8 @@ is that this preset is not using HPC minimizers.
|
|||
.B splice
|
||||
Long-read spliced alignment
|
||||
.RB ( -k15
|
||||
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag
|
||||
.BR 5 ).
|
||||
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200
|
||||
.BR -ub ).
|
||||
In the splice mode, 1) long deletions are taken as introns and represented as
|
||||
the
|
||||
.RB ` N '
|
||||
|
|
|
|||
Loading…
Reference in New Issue