diff --git a/align.c b/align.c index 10fe689..73890da 100644 --- a/align.c +++ b/align.c @@ -381,6 +381,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (is_splice) { if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR; if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV; + if (opt->flag & MM_F_SPLICE_FLANK) extra_flag |= KSW_EZ_SPLICE_FLANK; } /* Look for the start and end of regions to perform DP. This sounds easy diff --git a/ksw2.h b/ksw2.h index e82dd82..dcea9f8 100644 --- a/ksw2.h +++ b/ksw2.h @@ -14,6 +14,7 @@ #define KSW_EZ_REV_CIGAR 0x80 // reverse CIGAR in the output #define KSW_EZ_SPLICE_FOR 0x100 #define KSW_EZ_SPLICE_REV 0x200 +#define KSW_EZ_SPLICE_FLANK 0x400 #ifdef __cplusplus extern "C" { diff --git a/ksw2_exts2_sse.c b/ksw2_exts2_sse.c index 82a0369..83a4fe5 100644 --- a/ksw2_exts2_sse.c +++ b/ksw2_exts2_sse.c @@ -111,19 +111,23 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin // set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding! if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) { + int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272 memset(donor, -noncan, tlen_ * 16); - for (t = 0; t < tlen - 2; ++t) { - int is_can = 0; // is a canonical site - if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) is_can = 1; - if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) is_can = 1; - if (is_can) ((int8_t*)donor)[t] = 0; + for (t = 0; t < tlen - 4; ++t) { + int can_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG + if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) can_type = 1; // GTr... + if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) can_type = 1; // CTr... + if (can_type && (target[t+3] == 0 || target[t+3] == 2)) can_type = 2; + if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost; + if (can_type) ((int8_t*)donor)[t] = 0; } memset(acceptor, -noncan, tlen_ * 16); for (t = 2; t < tlen; ++t) { - int is_can = 0; - if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) is_can = 1; - if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) is_can = 1; - if (is_can) ((int8_t*)acceptor)[t] = 0; + int can_type = 0; + if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) can_type = 1; // ...yAG + if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) can_type = 1; // ...yAC + if (can_type && (target[t-2] == 1 || target[t-2] == 3)) can_type = 2; + if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost; } } diff --git a/main.c b/main.c index 63cbb2c..b9e0add 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.3-r536-dirty" +#define MM_VERSION "2.3-r537-dirty" #ifdef __linux__ #include @@ -34,7 +34,7 @@ static struct option long_options[] = { { "min-dp-len", required_argument, 0, 0 }, { "print-aln-seq", no_argument, 0, 0 }, { "splice", no_argument, 0, 0 }, - { "cost-non-gt-ag", required_argument, 0, 0 }, + { "cost-non-gt-ag", required_argument, 0, 'C' }, { "no-long-join", no_argument, 0, 0 }, { "sr", no_argument, 0, 0 }, { "frag", optional_argument, 0, 0 }, @@ -42,6 +42,7 @@ static struct option long_options[] = { { "cs", optional_argument, 0, 0 }, { "end-bonus", required_argument, 0, 0 }, { "no-pairing", no_argument, 0, 0 }, + { "splice-flank", optional_argument, 0, 0 }, { "help", no_argument, 0, 'h' }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, @@ -66,7 +67,7 @@ static inline int64_t mm_parse_num(const char *str) int main(int argc, char *argv[]) { - const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:L"; + const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:LC:"; mm_mapopt_t opt; mm_idxopt_t ipt; int i, c, n_threads = 3, long_idx; @@ -117,6 +118,7 @@ int main(int argc, char *argv[]) else if (c == 'B') opt.b = atoi(optarg); else if (c == 'z') opt.zdrop = atoi(optarg); else if (c == 's') opt.min_dp_max = atoi(optarg); + else if (c == 'C') opt.noncan = atoi(optarg); else if (c == 'I') ipt.batch_size = mm_parse_num(optarg); else if (c == 'K') opt.mini_batch_size = (int)mm_parse_num(optarg); else if (c == 'R') rg = optarg; @@ -132,7 +134,6 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice - else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_LJOIN; // --no-long-join else if (c == 0 && long_idx ==13) opt.flag |= MM_F_SR; // --sr else if (c == 0 && long_idx ==17) opt.end_bonus = atoi(optarg); // --end-bonus @@ -156,6 +157,10 @@ int main(int argc, char *argv[]) } else if (mm_verbose >= 2) { fprintf(stderr, "[WARNING]\033[1;31m --cs only takes 'short' or 'long'. Invalid values are assumed to be 'short'.\033[0m\n"); } + } else if (c == 0 && long_idx == 19) { // --splice-flank + if (optarg == 0 || strcmp(optarg, "yes") == 0 || strcmp(optarg, "y") == 0) + opt.flag |= MM_F_SPLICE_FLANK; + else opt.flag &= ~MM_F_SPLICE_FLANK; } else if (c == 'S') { opt.flag |= MM_F_OUT_CS | MM_F_CIGAR | MM_F_OUT_CS_LONG; if (mm_verbose >= 2) diff --git a/map.c b/map.c index a31bceb..f99626d 100644 --- a/map.c +++ b/map.c @@ -108,7 +108,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV; mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000; mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0; - mo->noncan = 5; + mo->noncan = 9; mo->zdrop = 200; } else return -1; return 0; diff --git a/minimap.h b/minimap.h index 058c1e3..17e411c 100644 --- a/minimap.h +++ b/minimap.h @@ -23,6 +23,7 @@ #define MM_F_2_IO_THREADS 0x8000 #define MM_F_LONG_CIGAR 0x10000 #define MM_F_INDEPEND_SEG 0x20000 +#define MM_F_SPLICE_FLANK 0x40000 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index 0064d77..b7d7177 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "22 October 2017" "minimap2-2.2-dirty (r531)" "Bioinformatics tools" +.TH minimap2 1 "27 October 2017" "minimap2-2.3-dirty (r537)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -219,6 +219,9 @@ costs .RI min{ O1 + k * E1 , O2 + k * E2 }. In the splice mode, the second gap penalties are not used. .TP +.BI -C \ INT +Cost for a non-canonical GT-AG splicing [0] +.TP .BI -z \ INT Break an alignment if the running score drops too quickly along the diagonal of the DP matrix (diagonal X-drop, or Z-drop) [400]. Increasing the value improves @@ -239,9 +242,6 @@ both strands; .BR n : no attempt to match GT-AG [n] .TP -.BI --cost-non-gt-ag \ INT -Cost of non-canonical splicing sites [0]. -.TP .BI --end-bonus \ INT Score bonus when alignment extends to the end of the query sequence [10]. .SS Input/output options @@ -371,8 +371,8 @@ is that this preset is not using HPC minimizers. .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag -.BR 5 ). +.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 +.BR -ub ). In the splice mode, 1) long deletions are taken as introns and represented as the .RB ` N '