r539: use --splice-flank=yes by default
In human/mouse, the GTr..yAG pattern occurs in 91/92% of all GT-AG introns. Modeling r..y clearly leads to higher accuracy. However, in SIRV, this percentage is reduced to ~60%, so the default "--splice --splice-flank=yes" leads to lower accuracy there. If someone benchmarks minimap2 on SIRV, this would be bad, but minimap2 is developed for practical applications, not for benchmarks. I will live with that.
This commit is contained in:
parent
f22a94e868
commit
192217a10c
2
main.c
2
main.c
|
|
@ -6,7 +6,7 @@
|
||||||
#include "mmpriv.h"
|
#include "mmpriv.h"
|
||||||
#include "getopt.h"
|
#include "getopt.h"
|
||||||
|
|
||||||
#define MM_VERSION "2.3-r538-dirty"
|
#define MM_VERSION "2.3-r539-dirty"
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
|
|
|
||||||
2
map.c
2
map.c
|
|
@ -105,7 +105,7 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
||||||
mo->mini_batch_size = 50000000;
|
mo->mini_batch_size = 50000000;
|
||||||
} else if (strcmp(preset, "splice") == 0 || strcmp(preset, "cdna") == 0) {
|
} else if (strcmp(preset, "splice") == 0 || strcmp(preset, "cdna") == 0) {
|
||||||
io->is_hpc = 0, io->k = 15, io->w = 5;
|
io->is_hpc = 0, io->k = 15, io->w = 5;
|
||||||
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV;
|
mo->flag |= MM_F_SPLICE | MM_F_SPLICE_FOR | MM_F_SPLICE_REV | MM_F_SPLICE_FLANK;
|
||||||
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
|
mo->max_gap = 2000, mo->max_gap_ref = mo->bw = 200000;
|
||||||
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
|
mo->a = 1, mo->b = 2, mo->q = 2, mo->e = 1, mo->q2 = 32, mo->e2 = 0;
|
||||||
mo->noncan = 9;
|
mo->noncan = 9;
|
||||||
|
|
|
||||||
30
minimap2.1
30
minimap2.1
|
|
@ -220,7 +220,9 @@ costs
|
||||||
In the splice mode, the second gap penalties are not used.
|
In the splice mode, the second gap penalties are not used.
|
||||||
.TP
|
.TP
|
||||||
.BI -C \ INT
|
.BI -C \ INT
|
||||||
Cost for a non-canonical GT-AG splicing [0]
|
Cost for a non-canonical GT-AG splicing (effective with
|
||||||
|
.BR --splice )
|
||||||
|
[0]
|
||||||
.TP
|
.TP
|
||||||
.BI -z \ INT
|
.BI -z \ INT
|
||||||
Break an alignment if the running score drops too quickly along the diagonal of
|
Break an alignment if the running score drops too quickly along the diagonal of
|
||||||
|
|
@ -243,7 +245,25 @@ both strands;
|
||||||
no attempt to match GT-AG [n]
|
no attempt to match GT-AG [n]
|
||||||
.TP
|
.TP
|
||||||
.BI --end-bonus \ INT
|
.BI --end-bonus \ INT
|
||||||
Score bonus when alignment extends to the end of the query sequence [10].
|
Score bonus when alignment extends to the end of the query sequence [0].
|
||||||
|
.TP
|
||||||
|
.BR --splice-flank [= yes | no ]
|
||||||
|
Assume the next base to a
|
||||||
|
.B GT
|
||||||
|
donor site tends to be A/G (91% in human and 92% in mouse) and the preceding
|
||||||
|
base to a
|
||||||
|
.B AG
|
||||||
|
acceptor tends to be C/T [yes with
|
||||||
|
.BR --splice ].
|
||||||
|
This trend is evolutionarily conserved, all the way to S. cerevisiae
|
||||||
|
(PMID:18688272). Specifying this option generally leads to higher junction
|
||||||
|
accuracy by several percent, so it is applied by default with
|
||||||
|
.BR --splice .
|
||||||
|
However, the SIRV control does not honor this trend
|
||||||
|
(only ~60%), so this option reduces accuracy there. If you are benchmarking minimap2
|
||||||
|
on SIRV data, please add
|
||||||
|
.B --splice-flank=no
|
||||||
|
to the command line.
|
||||||
.SS Input/output options
|
.SS Input/output options
|
||||||
.TP 10
|
.TP 10
|
||||||
.B -a
|
.B -a
|
||||||
|
|
@ -261,7 +281,7 @@ the real CIGAR in memory.
|
||||||
.TP
|
.TP
|
||||||
.BI -R \ STR
|
.BI -R \ STR
|
||||||
SAM read group line in a format like
|
SAM read group line in a format like
|
||||||
.RB @RG\\\\tID:foo\\\\tSM:bar
|
.B @RG\\\\tID:foo\\\\tSM:bar
|
||||||
[].
|
[].
|
||||||
.TP
|
.TP
|
||||||
.B -c
|
.B -c
|
||||||
|
|
@ -371,8 +391,8 @@ is that this preset is not using HPC minimizers.
|
||||||
.B splice
|
.B splice
|
||||||
Long-read spliced alignment
|
Long-read spliced alignment
|
||||||
.RB ( -k15
|
.RB ( -k15
|
||||||
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200
|
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -C9 -z200 -ub
|
||||||
.BR -ub ).
|
.BR --splice-flank=yes ).
|
||||||
In the splice mode, 1) long deletions are taken as introns and represented as
|
In the splice mode, 1) long deletions are taken as introns and represented as
|
||||||
the
|
the
|
||||||
.RB ` N '
|
.RB ` N '
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue