r296: expose splicing related options to CLI

This commit is contained in:
Heng Li 2017-08-13 21:37:51 -04:00
parent 28f86688ab
commit b5f5929bf9
4 changed files with 54 additions and 28 deletions

15
align.c
View File

@ -136,7 +136,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr); fputc('\n', stderr);
}
if (opt->flag & MM_F_SPLICE)
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag|KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV, ez);
ksw_exts2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, opt->q2, opt->noncan, opt->zdrop, flag, ez);
else if (opt->q == opt->q2 && opt->e == opt->e2)
ksw_extz2_sse(km, qlen, qseq, tlen, tseq, 5, mat, opt->q, opt->e, w, opt->zdrop, flag, ez);
else
@ -249,7 +249,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
{
int32_t rid = a[r->as].x<<1>>33, rev = a[r->as].x>>63, as1, cnt1;
uint8_t *tseq, *qseq;
int32_t i, l, bw, dropped = 0, rs0, re0, qs0, qe0;
int32_t i, l, bw, dropped = 0, extra_flag = 0, rs0, re0, qs0, qe0;
int32_t rs, re, qs, qe;
int32_t rs1, qs1, re1, qe1;
int8_t mat[25];
@ -266,6 +266,11 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
mm_adjust_minier(mi, qseq0, &a[as1], &rs, &qs);
mm_adjust_minier(mi, qseq0, &a[as1 + cnt1 - 1], &re, &qe);
if (opt->flag & MM_F_SPLICE) {
if (opt->flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
if (opt->flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
}
// compute rs0 and qs0
if (r->split && as1 > 0) {
mm_adjust_minier(mi, qseq0, &a[as1-1], &rs0, &qs0);
@ -298,7 +303,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
mm_idx_getseq(mi, rid, rs0, rs, tseq);
mm_seq_rev(qs - qs0, qseq);
mm_seq_rev(rs - rs0, tseq);
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar);
r->p->dp_score += ez->max;
@ -320,7 +325,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
bw1 = qe - qs > re - rs? qe - qs : re - rs;
qseq = &qseq0[rev][qs];
mm_idx_getseq(mi, rid, rs, re, tseq);
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, KSW_EZ_APPROX_MAX, ez);
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, extra_flag|KSW_EZ_APPROX_MAX, ez);
if (mm_check_zdrop(qseq, tseq, ez->n_cigar, ez->cigar, mat, opt->q, opt->e, opt->zdrop))
mm_align_pair(km, opt, qe - qs, qseq, re - rs, tseq, mat, bw1, 0, ez);
if (ez->n_cigar > 0)
@ -345,7 +350,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (!dropped && qe < qe0 && re < re0) { // right extension
qseq = &qseq0[rev][qe];
mm_idx_getseq(mi, rid, re, re0, tseq);
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, KSW_EZ_EXTZ_ONLY, ez);
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar);
r->p->dp_score += ez->max;

20
main.c
View File

@ -8,7 +8,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r295-dirty"
#define MM_VERSION "2.0-r296-dirty"
void liftrlimit()
{
@ -32,6 +32,7 @@ static struct option long_options[] = {
{ "min-dp-len", required_argument, 0, 0 },
{ "print-aln-seq", no_argument, 0, 0 },
{ "splice", no_argument, 0, 0 },
{ "cost-non-gt-ag", required_argument, 0, 0 },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
{ "min-count", required_argument, 0, 'n' },
@ -67,7 +68,7 @@ int main(int argc, char *argv[])
mm_realtime0 = realtime();
mm_mapopt_init(&opt);
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Q", long_options, &long_idx)) >= 0) {
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:", long_options, &long_idx)) >= 0) {
if (c == 'w') w = atoi(optarg), idx_par_set = 1;
else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
else if (c == 'H') is_hpc = 1, idx_par_set = 1;
@ -105,9 +106,19 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx == 8) opt.min_ksw_len = atoi(optarg); // --min-dp-len
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
else if (c == 'V') {
puts(MM_VERSION);
return 0;
} else if (c == 'u') {
if (*optarg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV;
else if (*optarg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV;
else if (*optarg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR;
else if (*optarg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV);
else {
fprintf(stderr, "[E::%s] unrecognized cDNA direction\n", __func__);
return 1;
}
} else if (c == 'O') {
opt.q = opt.q2 = strtol(optarg, &s, 10);
if (*s == ',') opt.q2 = strtol(s + 1, &s, 10);
@ -139,7 +150,7 @@ int main(int argc, char *argv[])
opt.min_dp_max = 200;
} else if (strcmp(optarg, "splice") == 0 || strcmp(optarg, "cdna") == 0) {
k = 15, w = 5;
opt.flag |= MM_F_SPLICE;
opt.flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV;
opt.max_gap = 2000, opt.max_gap_ref = opt.bw = 200000;
opt.a = 1, opt.b = 2, opt.q = 2, opt.e = 1, opt.q2 = 32, opt.e2 = 0;
opt.noncan = 4;
@ -181,6 +192,7 @@ int main(int argc, char *argv[])
fprintf(stderr, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop);
fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
fprintf(stderr, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
fprintf(stderr, " Input/Output:\n");
fprintf(stderr, " -Q ignore base quality in the input\n");
fprintf(stderr, " -a output in the SAM format (PAF by default)\n");
@ -198,7 +210,7 @@ int main(int argc, char *argv[])
fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
fprintf(stderr, " splice: -k15 -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 (long-read spliced aln)\n");
fprintf(stderr, " splice: long-read spliced alignment (see minimap2.1 for details)\n");
fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
return 1;
}

View File

@ -5,16 +5,18 @@
#include <stdio.h>
#include <sys/types.h>
#define MM_IDX_DEF_B 14
#define MM_IDX_DEF_B 14
#define MM_F_NO_SELF 0x01
#define MM_F_AVA 0x02
#define MM_F_CIGAR 0x04
#define MM_F_OUT_SAM 0x08
#define MM_F_NO_QUAL 0x10
#define MM_F_OUT_CG 0x20
#define MM_F_OUT_CS 0x40
#define MM_F_SPLICE 0x80
#define MM_F_NO_SELF 0x001
#define MM_F_AVA 0x002
#define MM_F_CIGAR 0x004
#define MM_F_OUT_SAM 0x008
#define MM_F_NO_QUAL 0x010
#define MM_F_OUT_CG 0x020
#define MM_F_OUT_CS 0x040
#define MM_F_SPLICE 0x080
#define MM_F_SPLICE_FOR 0x100
#define MM_F_SPLICE_REV 0x200
#define MM_IDX_MAGIC "MMI\2"

View File

@ -1,4 +1,4 @@
.TH minimap2 1 "12 August 2017" "minimap2-2.0-r290-dirty" "Bioinformatics tools"
.TH minimap2 1 "13 August 2017" "minimap2-2.0-r296-dirty" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@ -205,6 +205,18 @@ the contiguity of the alignment at the cost of poor alignment in the middle
Minimal peak DP alignment score to output [40]. The peak score is computed from
the final CIGAR. It is the score of the max scoring segment in the alignment
and may be different from the total alignment score.
.TP
.BI -u \ CHAR
How to find canonical splicing sites GT-AG -
.BR f :
transcript strand;
.BR b :
both strands;
.BR n :
no attempt to match GT-AG [n]
.TP
.BI --cost-non-gt-ag \ INT
Cost of non-canonical splicing sites [0].
.SS Input/output options
.TP 10
.B -Q
@ -302,11 +314,10 @@ is that this preset is not using HPC minimizers.
.B splice
Long-read spliced alignment
.RB ( -k15
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0
.BR -z200 ).
As of now, minimap2 only finds approximate exon boundaries. The true boundaries
are usually within 10bp around the reported positions. In the splice mode,
1) long deletions are taken as introns and represented as the
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag
.BR 4 ).
In the splice mode, 1) long deletions are taken as introns and represented as
the
.RB ` N '
CIGAR operator; 2) long insertions are disabled; 3) deletion and insertion gap
costs are different during chaining; 4) the computation of the
@ -387,10 +398,6 @@ Minimap2 does not work well with Illumina short reads as of now.
*
Minimap2 requires SSE2 instructions to compile. It is possible to add
non-SSE2 support, but it would make minimap2 slower by several times.
.TP
*
In the splice mode, minimap2 is unable to find the precise exon boundaries.
The true bounraries are usually within 10bp around the reported locations.
.SH SEE ALSO
.PP
miniasm(1), minimap(1), bwa(1).