Released minimap2-2.3 (r531)
This commit is contained in:
parent
1dd221ad82
commit
306e4541f8
34
NEWS.md
34
NEWS.md
|
|
@ -1,3 +1,37 @@
|
|||
Release 2.3-r531 (22 October 2017)
|
||||
----------------------------------
|
||||
|
||||
This release come with many improvements and bug fixes:
|
||||
|
||||
* The **sr** preset now supports paired-end short-read alignment. Minimap2 is
|
||||
3-4 times as fast as BWA-MEM, but is slightly less accurate on simulated
|
||||
reads.
|
||||
|
||||
* Meticulous improvements to assembly-to-assembly alignment (special thanks to
|
||||
Alexey Gurevich from the QUAST team): a) apply a small penalty to matches
|
||||
between ambiguous bases; b) reduce missing alignments due to spurious
|
||||
overlaps; c) introduce the short form of the `cs` tag, an improvement to the
|
||||
SAM MD tag.
|
||||
|
||||
* Make sure gaps are always left-aligned.
|
||||
|
||||
* Recognize `U` bases from Oxford Nanopore Direct RNA-seq (#33).
|
||||
|
||||
* Fixed slightly wrong chaining score. Fixed slightly inaccurate coordinates
|
||||
for split alignment.
|
||||
|
||||
* Fixed multiple reported bugs: 1) wrong reference name for inversion
|
||||
alignment (#30); 2) redundant SQ lines when multiple query files are
|
||||
specified (#39); 3) non-functioning option `-K` (#36).
|
||||
|
||||
This release has implemented all the major features I planned five months ago,
|
||||
with the addition of spliced long-read alignment. The next couple of releases
|
||||
will focus on fine tuning of base algorithms.
|
||||
|
||||
(2.3: 22 October 2017, r531)
|
||||
|
||||
|
||||
|
||||
Release 2.2-r409 (17 September 2017)
|
||||
------------------------------------
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
[](https://github.com/lh3/minimap2/releases)
|
||||
[](https://github.com/lh3/minimap2/releases)
|
||||
[](https://anaconda.org/bioconda/minimap2)
|
||||
[](https://pypi.python.org/pypi/mappy)
|
||||
[](https://pypi.python.org/pypi/mappy)
|
||||
|
|
|
|||
25
main.c
25
main.c
|
|
@ -6,7 +6,7 @@
|
|||
#include "mmpriv.h"
|
||||
#include "getopt.h"
|
||||
|
||||
#define MM_VERSION "2.2-r526-dirty"
|
||||
#define MM_VERSION "2.3-r531"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -68,7 +68,7 @@ int main(int argc, char *argv[])
|
|||
const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:L";
|
||||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
int i, c, n_threads = 3, long_idx, max_gap_ref = 0;
|
||||
int i, c, n_threads = 3, long_idx;
|
||||
char *fnw = 0, *rg = 0, *s;
|
||||
FILE *fp_help = stderr;
|
||||
mm_idx_reader_t *idx_rdr;
|
||||
|
|
@ -98,7 +98,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 't') n_threads = atoi(optarg);
|
||||
else if (c == 'v') mm_verbose = atoi(optarg);
|
||||
else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg);
|
||||
else if (c == 'G') max_gap_ref = (int)mm_parse_num(optarg);
|
||||
else if (c == 'G') mm_mapopt_max_intron_len(&opt, (int)mm_parse_num(optarg));
|
||||
else if (c == 'F') opt.max_frag_len = (int)mm_parse_num(optarg);
|
||||
else if (c == 'i') opt.min_iden = atof(optarg);
|
||||
else if (c == 'N') opt.best_n = atoi(optarg);
|
||||
|
|
@ -174,7 +174,7 @@ int main(int argc, char *argv[])
|
|||
else if (*optarg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR; // match CT-AC (reverse complement of GT-AG)
|
||||
else if (*optarg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); // don't try to match the GT-AG signal
|
||||
else {
|
||||
fprintf(stderr, "[E::%s] unrecognized cDNA direction\n", __func__);
|
||||
fprintf(stderr, "[ERROR]\033[1;31m unrecognized cDNA direction\033[0m\n");
|
||||
return 1;
|
||||
}
|
||||
} else if (c == 'O') {
|
||||
|
|
@ -185,14 +185,9 @@ int main(int argc, char *argv[])
|
|||
if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
|
||||
}
|
||||
}
|
||||
if (max_gap_ref > 0) {
|
||||
opt.max_gap_ref = max_gap_ref;
|
||||
if (opt.flag & MM_F_SPLICE) opt.bw = max_gap_ref; // in the splice mode, this also changes the bandwidth
|
||||
}
|
||||
if ((opt.flag & MM_F_OUT_SAM) && (opt.flag & MM_F_OUT_CS_LONG)) {
|
||||
opt.flag &= ~MM_F_OUT_CS_LONG;
|
||||
if (mm_verbose >= 2)
|
||||
fprintf(stderr, "[WARNING]\033[1;31m in SAM, only the short form of the cs tag is outputted.\033[0m\n");
|
||||
if ((opt.flag & MM_F_SPLICE) && (opt.flag & MM_F_FRAG_MODE)) {
|
||||
fprintf(stderr, "[ERROR]\033[1;31m --splice and --frag should not be specified at the same time.\033[0m\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (argc == optind || fp_help == stdout) {
|
||||
|
|
@ -207,8 +202,8 @@ int main(int argc, char *argv[])
|
|||
fprintf(fp_help, " Mapping:\n");
|
||||
fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
|
||||
fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
|
||||
fprintf(fp_help, " -G NUM max reference skip/intron length [-xsplice:200k]\n");
|
||||
fprintf(fp_help, " -F NUM max fragment length in the fragment mode [-xsr:800]\n");
|
||||
fprintf(fp_help, " -G NUM max intron length (effective with -xsplice; changing -r) [200k]\n");
|
||||
fprintf(fp_help, " -F NUM max fragment length (effective with -xsr or in the fragment mode) [800]\n");
|
||||
fprintf(fp_help, " -r NUM bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
|
||||
fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
|
||||
fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
|
||||
|
|
@ -228,7 +223,7 @@ int main(int argc, char *argv[])
|
|||
fprintf(fp_help, " Input/Output:\n");
|
||||
fprintf(fp_help, " -a output in the SAM format (PAF by default)\n");
|
||||
fprintf(fp_help, " -Q don't output base quality in SAM\n");
|
||||
fprintf(fp_help, " -L write CIGAR with >65535 ops in the CG tag (compatible with future tools)\n");
|
||||
fprintf(fp_help, " -L write CIGAR with >65535 ops at the CG tag\n");
|
||||
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
|
||||
fprintf(fp_help, " -c output CIGAR in PAF\n");
|
||||
fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
|
||||
|
|
|
|||
6
map.c
6
map.c
|
|
@ -52,6 +52,12 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
|
|||
fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ);
|
||||
}
|
||||
|
||||
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len)
|
||||
{
|
||||
if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0)
|
||||
opt->max_gap_ref = opt->bw = max_intron_len;
|
||||
}
|
||||
|
||||
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
|
||||
{
|
||||
if (preset == 0) {
|
||||
|
|
|
|||
|
|
@ -161,6 +161,8 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
|
|||
*/
|
||||
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
|
||||
|
||||
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len);
|
||||
|
||||
/**
|
||||
* Initialize an index reader
|
||||
*
|
||||
|
|
|
|||
63
minimap2.1
63
minimap2.1
|
|
@ -1,4 +1,4 @@
|
|||
.TH minimap2 1 "17 October 2017" "minimap2-2.2-dirty (r518)" "Bioinformatics tools"
|
||||
.TH minimap2 1 "22 October 2017" "minimap2-2.2-dirty (r531)" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
minimap2 - mapping and alignment between collections of DNA sequences
|
||||
|
|
@ -126,7 +126,7 @@ Stop chain enlongation if there are no minimizers in
|
|||
[10000].
|
||||
.TP
|
||||
.BI -r \ INT
|
||||
Bandwidth used in chaining and DP-based alignment [1000]. This option
|
||||
Bandwidth used in chaining and DP-based alignment [500]. This option
|
||||
approximately controls the maximum gap size.
|
||||
.TP
|
||||
.BI -n \ INT
|
||||
|
|
@ -148,7 +148,7 @@ diagonal minimizer hits will also be suppressed.
|
|||
.TP
|
||||
.BI -p \ FLOAT
|
||||
Minimal secondary-to-primary score ratio to output secondary mappings [0.8].
|
||||
Between two chains overlaping over half of the shorter chain (controled by
|
||||
Between two chains overlaping over half of the shorter chain (controlled by
|
||||
.BR --mask-level ),
|
||||
the chain with a lower score is secondary to the chain with a higher score.
|
||||
If the ratio of the scores is below
|
||||
|
|
@ -163,10 +163,16 @@ secondary alignments [5]. This option has no effect when
|
|||
is applied.
|
||||
.TP
|
||||
.BI -G \ NUM
|
||||
Maximal intron length in the splice mode [200k]. This option also changes the
|
||||
bandwidth to
|
||||
Maximum gap on the reference (effective with
|
||||
.BR -xsplice / --splice ).
|
||||
This option also changes the chaining and alignment band width to
|
||||
.IR NUM .
|
||||
Increasing this option slows down spliced alignment.
|
||||
Increasing this option slows down spliced alignment. [200k]
|
||||
.TP
|
||||
.BI -F \ NUM
|
||||
Maximum fragment length (aka insert size; effective with
|
||||
.BR -xsr / --frag)
|
||||
[800]
|
||||
.TP
|
||||
.BI --max-chain-skip \ INT
|
||||
A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming
|
||||
|
|
@ -175,6 +181,23 @@ option makes minimap2 exits the inner loop if it repeatedly sees seeds already
|
|||
on chains. Set
|
||||
.I INT
|
||||
to a large number to switch off this heurstics.
|
||||
.TP
|
||||
.B --no-long-join
|
||||
Disable the long gap patching heuristic. When this option is applied, the
|
||||
maximum alignment gap is mostly controlled by
|
||||
.BR -r .
|
||||
.TP
|
||||
.B --splice
|
||||
Enable the splice alignment mode.
|
||||
.TP
|
||||
.B --sr
|
||||
Enable short-read alignment heuristics. In the short-read mode, minimap2
|
||||
applies a second round of chaining with a higher minimizer occurrence threshold
|
||||
if no good chain is found. In addition, minimap2 attempts to patch gaps between
|
||||
seeds with ungapped alignment.
|
||||
.TP
|
||||
.BR --frag [= no | yes ]
|
||||
Whether to enable the fragment mode [no]
|
||||
.SS Alignment options
|
||||
.TP 10
|
||||
.BI -A \ INT
|
||||
|
|
@ -194,6 +217,7 @@ Gap extension penalty [2,1]. A gap of length
|
|||
.I k
|
||||
costs
|
||||
.RI min{ O1 + k * E1 , O2 + k * E2 }.
|
||||
In the splice mode, the second gap penalties are not used.
|
||||
.TP
|
||||
.BI -z \ INT
|
||||
Break an alignment if the running score drops too quickly along the diagonal of
|
||||
|
|
@ -217,6 +241,9 @@ no attempt to match GT-AG [n]
|
|||
.TP
|
||||
.BI --cost-non-gt-ag \ INT
|
||||
Cost of non-canonical splicing sites [0].
|
||||
.TP
|
||||
.BI --end-bonus \ INT
|
||||
Score bonus when alignment extends to the end of the query sequence [10].
|
||||
.SS Input/output options
|
||||
.TP 10
|
||||
.B -a
|
||||
|
|
@ -226,9 +253,15 @@ by default.
|
|||
.B -Q
|
||||
Ignore base quality in the input file.
|
||||
.TP
|
||||
.B -L
|
||||
Write CIGAR with >65535 operators at the CG tag. Older tools are unable to
|
||||
convert alignments with >65535 CIGAR ops to BAM. This option makes minimap2 SAM
|
||||
compatible with older tools. Newer tools recognizes this tag and reconstruct
|
||||
the real CIGAR in memory.
|
||||
.TP
|
||||
.BI -R \ STR
|
||||
SAM read group line in a format like
|
||||
.B @RG\\\\tID:foo\\\\tSM:bar
|
||||
.RB @RG\\\\tID:foo\\\\tSM:bar
|
||||
[].
|
||||
.TP
|
||||
.B -c
|
||||
|
|
@ -249,6 +282,11 @@ is given,
|
|||
.I short
|
||||
is assumed. [none]
|
||||
.TP
|
||||
.BI --seed \ INT
|
||||
Integer seed for randomizing equally best hits. Minimap2 hashes
|
||||
.I INT
|
||||
and read name when choosing between equally best hits. [11]
|
||||
.TP
|
||||
.BI -t \ INT
|
||||
Number of threads [3]. Minimap2 uses at most three threads when indexing target
|
||||
sequences, and uses up to
|
||||
|
|
@ -271,6 +309,9 @@ K/M/G/k/m/g suffix is accepted. A large
|
|||
helps load balancing in the multi-threading mode, at the cost of increased
|
||||
memory.
|
||||
.TP
|
||||
.BR --secondary [= yes | no ]
|
||||
Whether to output secondary alignments [yes]
|
||||
.TP
|
||||
.B --version
|
||||
Print version number to stdout
|
||||
.SS Preset options
|
||||
|
|
@ -343,9 +384,9 @@ tag ignores introns to demote hits to pseudogenes.
|
|||
.B sr
|
||||
Short single-end reads without splicing
|
||||
.RB ( -k21
|
||||
.B -w11 -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g200
|
||||
.B -2K50m --frag
|
||||
.BR --sr ).
|
||||
.B -w11 --sr --frag -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20
|
||||
.B -s40 -g200 -2K50m
|
||||
.BR --secondary=no ).
|
||||
.RE
|
||||
.SS Miscellaneous options
|
||||
.TP 10
|
||||
|
|
@ -358,7 +399,7 @@ multi-threading mode.
|
|||
.B --print-qname
|
||||
Print query names to stderr, mostly to see which query is crashing minimap2.
|
||||
.TP
|
||||
.B --print-seed
|
||||
.B --print-seeds
|
||||
Print seed positions to stderr, for debugging only.
|
||||
.SH OUTPUT FORMAT
|
||||
.PP
|
||||
|
|
|
|||
Loading…
Reference in New Issue