From 79b0caca95c0587626b1943566cd3daf187ddd24 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 28 Oct 2017 00:25:01 -0400 Subject: [PATCH] r537: model the next base to GT/AG [PMID:18688272] shows that the base following GT tends to be A or G (i.e. R) in both human and yeast, and that the base preceeding AG tends to be C or T (i.e. Y). In the new model, we pay no cost to GTr..yAG, but we pay half of the cost if there is no r or y. This improves the junction accuracy when mapping to human and mouse and decreases the accuacy when mapping to SIRV. My guess is that SIRV does not honor this trend. Need to investigate in future. Also in this commit, --cost-non-gt-ag is aliased to -C. The default is changed to 9 instead of 5. I also added --splice-flank to enable the above model. This may become the default once I confirm my hypothesis on SIRV. --- align.c | 1 + ksw2.h | 1 + ksw2_exts2_sse.c | 22 +++++++++++++--------- main.c | 13 +++++++++---- map.c | 2 +- minimap.h | 1 + minimap2.1 | 12 ++++++------ 7 files changed, 32 insertions(+), 20 deletions(-) diff --git a/align.c b/align.c index 10fe689..73890da 100644 --- a/align.c +++ b/align.c @@ -381,6 +381,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (is_splice) { if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR; if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV; + if (opt->flag & MM_F_SPLICE_FLANK) extra_flag |= KSW_EZ_SPLICE_FLANK; } /* Look for the start and end of regions to perform DP. This sounds easy diff --git a/ksw2.h b/ksw2.h index e82dd82..dcea9f8 100644 --- a/ksw2.h +++ b/ksw2.h @@ -14,6 +14,7 @@ #define KSW_EZ_REV_CIGAR 0x80 // reverse CIGAR in the output #define KSW_EZ_SPLICE_FOR 0x100 #define KSW_EZ_SPLICE_REV 0x200 +#define KSW_EZ_SPLICE_FLANK 0x400 #ifdef __cplusplus extern "C" { diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 82a0369..83a4fe5 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -111,19 +111,23 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin // set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding! if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) { + int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272 memset(donor, -noncan, tlen_ * 16); - for (t = 0; t < tlen - 2; ++t) { - int is_can = 0; // is a canonical site - if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) is_can = 1; - if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) is_can = 1; - if (is_can) ((int8_t*)donor)[t] = 0; + for (t = 0; t < tlen - 4; ++t) { + int can_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG + if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) can_type = 1; // GTr... + if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) can_type = 1; // CTr... + if (can_type && (target[t+3] == 0 || target[t+3] == 2)) can_type = 2; + if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost; + if (can_type) ((int8_t*)donor)[t] = 0; } memset(acceptor, -noncan, tlen_ * 16); for (t = 2; t < tlen; ++t) { - int is_can = 0; - if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) is_can = 1; - if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) is_can = 1; - if (is_can) ((int8_t*)acceptor)[t] = 0; + int can_type = 0; + if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) can_type = 1; // ...yAG + if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) can_type = 1; // ...yAC + if (can_type && (target[t-2] == 1 || target[t-2] == 3)) can_type = 2; + if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost; } } diff --git a/main.c b/main.c index 63cbb2c..b9e0add 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.3-r536-dirty" +#define MM_VERSION "2.3-r537-dirty" #ifdef __linux__ #include @@ -34,7 +34,7 @@ static struct option long_options[] = { { "min-dp-len", required_argument, 0, 0 }, { "print-aln-seq", no_argument, 0, 0 }, { "splice", no_argument, 0, 0 }, - { "cost-non-gt-ag", required_argument, 0, 0 }, + { "cost-non-gt-ag", required_argument, 0, 'C' }, { "no-long-join", no_argument, 0, 0 }, { "sr", no_argument, 0, 0 }, { "frag", optional_argument, 0, 0 }, @@ -42,6 +42,7 @@ static struct option long_options[] = { { "cs", optional_argument, 0, 0 }, { "end-bonus", required_argument, 0, 0 }, { "no-pairing", no_argument, 0, 0 }, + { "splice-flank", optional_argument, 0, 0 }, { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -66,7 +67,7 @@ static inline int64_t mm_parse_num(const char *str) int main(int argc, char *argv[]) { - const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:L"; + const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:LC:"; mm_mapopt_t opt; mm_idxopt_t ipt; int i, c, n_threads = 3, long_idx; @@ -117,6 +118,7 @@ int main(int argc, char *argv[]) else if (c == 'B') opt.b = atoi(optarg); else if (c == 'z') opt.zdrop = atoi(optarg); else if (c == 's') opt.min_dp_max = atoi(optarg); + else if (c == 'C') opt.noncan = atoi(optarg); else if (c == 'I') ipt.batch_size = mm_parse_num(optarg); else if (c == 'K') opt.mini_batch_size = (int)mm_parse_num(optarg); else if (c == 'R') rg = optarg; @@ -132,7 +134,6 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice - else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_LJOIN; // --no-long-join else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus @@ -156,6 +157,10 @@ int main(int argc, char *argv[]) } else if (mm_verbose >= 2) { fprintf(stderr, "[WARNING]\033[1;31m --cs only takes 'short' or 'long'. Invalid values are assumed to be 'short'.\033[0m\n"); } + } else if (c == 0 && long_idx == 19) { // --splice-flank + if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) + opt.flag |= MM_F_SPLICE_FLANK; + else opt.flag &= ~MM_F_SPLICE_FLANK; } else if (c == 'S') { opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; if (mm_verbose >= 2) diff --git a/map.c b/map.c index a31bceb..f99626d 100644 --- a/map.c +++ b/map.c @@ -108,7 +108,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV; mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; - mo->noncan = 5; + mo->noncan = 9; mo->zdrop = 200; } else return -1; return 0; diff --git a/minimap.h b/minimap.h index 058c1e3..17e411c 100644 --- a/minimap.h +++ b/minimap.h @@ -23,6 +23,7 @@ #define MM_F_2_IO_THREADS 0x8000 #define MM_F_LONG_CIGAR 0x10000 #define MM_F_INDEPEND_SEG 0x20000 +#define MM_F_SPLICE_FLANK 0x40000 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index 0064d77..b7d7177 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "22 October 2017" "minimap2-2.2-dirty (r531)" "Bioinformatics tools" +.TH minimap2 1 "27 October 2017" "minimap2-2.3-dirty (r537)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -219,6 +219,9 @@ costs .RI min{ O1 + k * E1 , O2 + k * E2 }. In the splice mode, the second gap penalties are not used. .TP +.BI -C \ INT +Cost for a non-canonical GT-AG splicing [0] +.TP .BI -z \ INT Break an alignment if the running score drops too quickly along the diagonal of the DP matrix (diagonal X-drop, or Z-drop) [400]. Increasing the value improves @@ -239,9 +242,6 @@ both strands; .BR n : no attempt to match GT-AG [n] .TP -.BI --cost-non-gt-ag \ INT -Cost of non-canonical splicing sites [0]. -.TP .BI --end-bonus \ INT Score bonus when alignment extends to the end of the query sequence [10]. .SS Input/output options @@ -371,8 +371,8 @@ is that this preset is not using HPC minimizers. .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag -.BR 5 ). +.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 +.BR -ub ). In the splice mode, 1) long deletions are taken as introns and represented as the .RB ` N '