From b5f5929bf9923f7ef69add35b059e9525e456d96 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 13 Aug 2017 21:37:51 -0400 Subject: [PATCH] r296: expose splicing related options to CLI --- align.c | 15 ++++++++++----- main.c | 20 ++++++++++++++++---- minimap.h | 20 +++++++++++--------- minimap2.1 | 27 +++++++++++++++++---------- 4 files changed, 54 insertions(+), 28 deletions(-) diff --git a/align.c b/align.c index cca69cf..b4c00b1 100644 --- a/align.c +++ b/align.c @@ -136,7 +136,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr); } if (opt->flag & MM_F_SPLICE) - ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag|KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV, ez); + ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag, ez); else if (opt->q == opt->q2 && opt->e == opt->e2) ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez); else @@ -249,7 +249,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int { int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1; uint8_t *tseq, *qseq; - int32_t i, l, bw, dropped = 0, rs0, re0, qs0, qe0; + int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0; int32_t rs, re, qs, qe; int32_t rs1, qs1, re1, qe1; int8_t mat[25]; @@ -266,6 +266,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs); mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe); + if (opt->flag & MM_F_SPLICE) { + if (opt->flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR; + if (opt->flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV; + } + // compute rs0 and qs0 if (r->split && as1 > 0) { mm_adjust_minier(mi, qseq0, &a[as1-1], &rs0, &qs0); @@ -298,7 +303,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int mm_idx_getseq(mi, rid, rs0, rs, tseq); mm_seq_rev(qs - qs0, qseq); mm_seq_rev(rs - rs0, tseq); - mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); + mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; @@ -320,7 +325,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int bw1 = qe - qs > re - rs? qe - qs : re - rs; qseq = &qseq0[rev][qs]; mm_idx_getseq(mi, rid, rs, re, tseq); - mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, KSW_EZ_APPROX_MAX, ez); + mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez); if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop)) mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez); if (ez->n_cigar > 0) @@ -345,7 +350,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int if (!dropped && qe < qe0 && re < re0) { // right extension qseq = &qseq0[rev][qe]; mm_idx_getseq(mi, rid, re, re0, tseq); - mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, KSW_EZ_EXTZ_ONLY, ez); + mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY, ez); if (ez->n_cigar > 0) { mm_append_cigar(r, ez->n_cigar, ez->cigar); r->p->dp_score += ez->max; diff --git a/main.c b/main.c index 42dafd1..f387f2d 100644 --- a/main.c +++ b/main.c @@ -8,7 +8,7 @@ #include "minimap.h" #include "mmpriv.h" -#define MM_VERSION "2.0-r295-dirty" +#define MM_VERSION "2.0-r296-dirty" void liftrlimit() { @@ -32,6 +32,7 @@ static struct option long_options[] = { { "min-dp-len", required_argument, 0, 0 }, { "print-aln-seq", no_argument, 0, 0 }, { "splice", no_argument, 0, 0 }, + { "cost-non-gt-ag", required_argument, 0, 0 }, { "max-intron-len", required_argument, 0, 'G' }, { "version", no_argument, 0, 'V' }, { "min-count", required_argument, 0, 'n' }, @@ -67,7 +68,7 @@ int main(int argc, char *argv[]) mm_realtime0 = realtime(); mm_mapopt_init(&opt); - while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) { + while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:", long_options, &long_idx)) >= 0) { if (c == 'w') w = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1; @@ -105,9 +106,19 @@ int main(int argc, char *argv[]) else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice + else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag else if (c == 'V') { puts(MM_VERSION); return 0; + } else if (c == 'u') { + if (*optarg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV; + else if (*optarg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV; + else if (*optarg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR; + else if (*optarg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); + else { + fprintf(stderr, "[E::%s] unrecognized cDNA direction\n", __func__); + return 1; + } } else if (c == 'O') { opt.q = opt.q2 = strtol(optarg, &s, 10); if (*s == ',') opt.q2 = strtol(s + 1, &s, 10); @@ -139,7 +150,7 @@ int main(int argc, char *argv[]) opt.min_dp_max = 200; } else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) { k = 15, w = 5; - opt.flag |= MM_F_SPLICE; + opt.flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV; opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000; opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0; opt.noncan = 4; @@ -181,6 +192,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop); fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); + fprintf(stderr, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); fprintf(stderr, " Input/Output:\n"); fprintf(stderr, " -Q ignore base quality in the input\n"); fprintf(stderr, " -a output in the SAM format (PAF by default)\n"); @@ -198,7 +210,7 @@ int main(int argc, char *argv[]) fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n"); fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n"); fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n"); - fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n"); + fprintf(stderr, " splice: long-read spliced alignment (see minimap2.1 for details)\n"); fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n"); return 1; } diff --git a/minimap.h b/minimap.h index c145919..bce2448 100644 --- a/minimap.h +++ b/minimap.h @@ -5,16 +5,18 @@ #include #include -#define MM_IDX_DEF_B 14 +#define MM_IDX_DEF_B 14 -#define MM_F_NO_SELF 0x01 -#define MM_F_AVA 0x02 -#define MM_F_CIGAR 0x04 -#define MM_F_OUT_SAM 0x08 -#define MM_F_NO_QUAL 0x10 -#define MM_F_OUT_CG 0x20 -#define MM_F_OUT_CS 0x40 -#define MM_F_SPLICE 0x80 +#define MM_F_NO_SELF 0x001 +#define MM_F_AVA 0x002 +#define MM_F_CIGAR 0x004 +#define MM_F_OUT_SAM 0x008 +#define MM_F_NO_QUAL 0x010 +#define MM_F_OUT_CG 0x020 +#define MM_F_OUT_CS 0x040 +#define MM_F_SPLICE 0x080 +#define MM_F_SPLICE_FOR 0x100 +#define MM_F_SPLICE_REV 0x200 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index ee8efb5..5b8aee3 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "12 August 2017" "minimap2-2.0-r290-dirty" "Bioinformatics tools" +.TH minimap2 1 "13 August 2017" "minimap2-2.0-r296-dirty" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -205,6 +205,18 @@ the contiguity of the alignment at the cost of poor alignment in the middle Minimal peak DP alignment score to output [40]. The peak score is computed from the final CIGAR. It is the score of the max scoring segment in the alignment and may be different from the total alignment score. +.TP +.BI -u \ CHAR +How to find canonical splicing sites GT-AG - +.BR f : +transcript strand; +.BR b : +both strands; +.BR n : +no attempt to match GT-AG [n] +.TP +.BI --cost-non-gt-ag \ INT +Cost of non-canonical splicing sites [0]. .SS Input/output options .TP 10 .B -Q @@ -302,11 +314,10 @@ is that this preset is not using HPC minimizers. .B splice Long-read spliced alignment .RB ( -k15 -.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -.BR -z200 ). -As of now, minimap2 only finds approximate exon boundaries. The true boundaries -are usually within 10bp around the reported positions. In the splice mode, -1) long deletions are taken as introns and represented as the +.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag +.BR 4 ). +In the splice mode, 1) long deletions are taken as introns and represented as +the .RB ` N ' CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap costs are different during chaining; 4) the computation of the @@ -387,10 +398,6 @@ Minimap2 does not work well with Illumina short reads as of now. * Minimap2 requires SSE2 instructions to compile. It is possible to add non-SSE2 support, but it would make minimap2 slower by several times. -.TP -* -In the splice mode, minimap2 is unable to find the precise exon boundaries. -The true bounraries are usually within 10bp around the reported locations. .SH SEE ALSO .PP miniasm(1), minimap(1), bwa(1).