diff --git a/NEWS.md b/NEWS.md index 566ae2a..078d244 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,37 @@ +Release 2.3-r531 (22 October 2017) +---------------------------------- + +This release come with many improvements and bug fixes: + + * The **sr** preset now supports paired-end short-read alignment. Minimap2 is + 3-4 times as fast as BWA-MEM, but is slightly less accurate on simulated + reads. + + * Meticulous improvements to assembly-to-assembly alignment (special thanks to + Alexey Gurevich from the QUAST team): a) apply a small penalty to matches + between ambiguous bases; b) reduce missing alignments due to spurious + overlaps; c) introduce the short form of the `cs` tag, an improvement to the + SAM MD tag. + + * Make sure gaps are always left-aligned. + + * Recognize `U` bases from Oxford Nanopore Direct RNA-seq (#33). + + * Fixed slightly wrong chaining score. Fixed slightly inaccurate coordinates + for split alignment. + + * Fixed multiple reported bugs: 1) wrong reference name for inversion + alignment (#30); 2) redundant SQ lines when multiple query files are + specified (#39); 3) non-functioning option `-K` (#36). + +This release has implemented all the major features I planned five months ago, +with the addition of spliced long-read alignment. The next couple of releases +will focus on fine tuning of base algorithms. + +(2.3: 22 October 2017, r531) + + + Release 2.2-r409 (17 September 2017) ------------------------------------ diff --git a/README.md b/README.md index d945f0b..f79a254 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Release](https://img.shields.io/badge/Release-v2.2-blue.svg?style=flat)](https://github.com/lh3/minimap2/releases) +[![Release](https://img.shields.io/badge/Release-v2.3-blue.svg?style=flat)](https://github.com/lh3/minimap2/releases) [![BioConda](https://img.shields.io/conda/vn/bioconda/minimap2.svg?style=flat)](https://anaconda.org/bioconda/minimap2) [![PyPI](https://img.shields.io/pypi/v/mappy.svg?style=flat)](https://pypi.python.org/pypi/mappy) [![Python Version](https://img.shields.io/pypi/pyversions/mappy.svg?style=flat)](https://pypi.python.org/pypi/mappy) diff --git a/main.c b/main.c index 4b4a869..0d02673 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "getopt.h" -#define MM_VERSION "2.2-r526-dirty" +#define MM_VERSION "2.3-r531" #ifdef __linux__ #include @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:L"; mm_mapopt_t opt; mm_idxopt_t ipt; - int i, c, n_threads = 3, long_idx, max_gap_ref = 0; + int i, c, n_threads = 3, long_idx; char *fnw = 0, *rg = 0, *s; FILE *fp_help = stderr; mm_idx_reader_t *idx_rdr; @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) else if (c == 't') n_threads = atoi(optarg); else if (c == 'v') mm_verbose = atoi(optarg); else if (c == 'g') opt.max_gap = (int)mm_parse_num(optarg); - else if (c == 'G') max_gap_ref = (int)mm_parse_num(optarg); + else if (c == 'G') mm_mapopt_max_intron_len(&opt, (int)mm_parse_num(optarg)); else if (c == 'F') opt.max_frag_len = (int)mm_parse_num(optarg); else if (c == 'i') opt.min_iden = atof(optarg); else if (c == 'N') opt.best_n = atoi(optarg); @@ -174,7 +174,7 @@ int main(int argc, char *argv[]) else if (*optarg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR; // match CT-AC (reverse complement of GT-AG) else if (*optarg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); // don't try to match the GT-AG signal else { - fprintf(stderr, "[E::%s] unrecognized cDNA direction\n", __func__); + fprintf(stderr, "[ERROR]\033[1;31m unrecognized cDNA direction\033[0m\n"); return 1; } } else if (c == 'O') { @@ -185,14 +185,9 @@ int main(int argc, char *argv[]) if (*s == ',') opt.e2 = strtol(s + 1, &s, 10); } } - if (max_gap_ref > 0) { - opt.max_gap_ref = max_gap_ref; - if (opt.flag & MM_F_SPLICE) opt.bw = max_gap_ref; // in the splice mode, this also changes the bandwidth - } - if ((opt.flag & MM_F_OUT_SAM) && (opt.flag & MM_F_OUT_CS_LONG)) { - opt.flag &= ~MM_F_OUT_CS_LONG; - if (mm_verbose >= 2) - fprintf(stderr, "[WARNING]\033[1;31m in SAM, only the short form of the cs tag is outputted.\033[0m\n"); + if ((opt.flag & MM_F_SPLICE) && (opt.flag & MM_F_FRAG_MODE)) { + fprintf(stderr, "[ERROR]\033[1;31m --splice and --frag should not be specified at the same time.\033[0m\n"); + return 1; } if (argc == optind || fp_help == stdout) { @@ -207,8 +202,8 @@ int main(int argc, char *argv[]) fprintf(fp_help, " Mapping:\n"); fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac); fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap); - fprintf(fp_help, " -G NUM max reference skip/intron length [-xsplice:200k]\n"); - fprintf(fp_help, " -F NUM max fragment length in the fragment mode [-xsr:800]\n"); + fprintf(fp_help, " -G NUM max intron length (effective with -xsplice; changing -r) [200k]\n"); + fprintf(fp_help, " -F NUM max fragment length (effective with -xsr or in the fragment mode) [800]\n"); fprintf(fp_help, " -r NUM bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw); fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt); fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score); @@ -228,7 +223,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " Input/Output:\n"); fprintf(fp_help, " -a output in the SAM format (PAF by default)\n"); fprintf(fp_help, " -Q don't output base quality in SAM\n"); - fprintf(fp_help, " -L write CIGAR with >65535 ops in the CG tag (compatible with future tools)\n"); + fprintf(fp_help, " -L write CIGAR with >65535 ops at the CG tag\n"); fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n"); fprintf(fp_help, " -c output CIGAR in PAF\n"); fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n"); diff --git a/map.c b/map.c index 04d3ad8..35efa6d 100644 --- a/map.c +++ b/map.c @@ -52,6 +52,12 @@ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi) fprintf(stderr, "[M::%s::%.3f*%.2f] mid_occ = %d\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), opt->mid_occ); } +void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len) +{ + if ((opt->flag & MM_F_SPLICE) && max_intron_len > 0) + opt->max_gap_ref = opt->bw = max_intron_len; +} + int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo) { if (preset == 0) { diff --git a/minimap.h b/minimap.h index 86cb7a2..e1dbf88 100644 --- a/minimap.h +++ b/minimap.h @@ -161,6 +161,8 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo); */ void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi); +void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len); + /** * Initialize an index reader * diff --git a/minimap2.1 b/minimap2.1 index b1a2837..0064d77 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "17 October 2017" "minimap2-2.2-dirty (r518)" "Bioinformatics tools" +.TH minimap2 1 "22 October 2017" "minimap2-2.2-dirty (r531)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -126,7 +126,7 @@ Stop chain enlongation if there are no minimizers in [10000]. .TP .BI -r \ INT -Bandwidth used in chaining and DP-based alignment [1000]. This option +Bandwidth used in chaining and DP-based alignment [500]. This option approximately controls the maximum gap size. .TP .BI -n \ INT @@ -148,7 +148,7 @@ diagonal minimizer hits will also be suppressed. .TP .BI -p \ FLOAT Minimal secondary-to-primary score ratio to output secondary mappings [0.8]. -Between two chains overlaping over half of the shorter chain (controled by +Between two chains overlaping over half of the shorter chain (controlled by .BR --mask-level ), the chain with a lower score is secondary to the chain with a higher score. If the ratio of the scores is below @@ -163,10 +163,16 @@ secondary alignments [5]. This option has no effect when is applied. .TP .BI -G \ NUM -Maximal intron length in the splice mode [200k]. This option also changes the -bandwidth to +Maximum gap on the reference (effective with +.BR -xsplice / --splice ). +This option also changes the chaining and alignment band width to .IR NUM . -Increasing this option slows down spliced alignment. +Increasing this option slows down spliced alignment. [200k] +.TP +.BI -F \ NUM +Maximum fragment length (aka insert size; effective with +.BR -xsr / --frag) +[800] .TP .BI --max-chain-skip \ INT A heuristics that stops chaining early [50]. Minimap2 uses dynamic programming @@ -175,6 +181,23 @@ option makes minimap2 exits the inner loop if it repeatedly sees seeds already on chains. Set .I INT to a large number to switch off this heurstics. +.TP +.B --no-long-join +Disable the long gap patching heuristic. When this option is applied, the +maximum alignment gap is mostly controlled by +.BR -r . +.TP +.B --splice +Enable the splice alignment mode. +.TP +.B --sr +Enable short-read alignment heuristics. In the short-read mode, minimap2 +applies a second round of chaining with a higher minimizer occurrence threshold +if no good chain is found. In addition, minimap2 attempts to patch gaps between +seeds with ungapped alignment. +.TP +.BR --frag [= no | yes ] +Whether to enable the fragment mode [no] .SS Alignment options .TP 10 .BI -A \ INT @@ -194,6 +217,7 @@ Gap extension penalty [2,1]. A gap of length .I k costs .RI min{ O1 + k * E1 , O2 + k * E2 }. +In the splice mode, the second gap penalties are not used. .TP .BI -z \ INT Break an alignment if the running score drops too quickly along the diagonal of @@ -217,6 +241,9 @@ no attempt to match GT-AG [n] .TP .BI --cost-non-gt-ag \ INT Cost of non-canonical splicing sites [0]. +.TP +.BI --end-bonus \ INT +Score bonus when alignment extends to the end of the query sequence [10]. .SS Input/output options .TP 10 .B -a @@ -226,9 +253,15 @@ by default. .B -Q Ignore base quality in the input file. .TP +.B -L +Write CIGAR with >65535 operators at the CG tag. Older tools are unable to +convert alignments with >65535 CIGAR ops to BAM. This option makes minimap2 SAM +compatible with older tools. Newer tools recognizes this tag and reconstruct +the real CIGAR in memory. +.TP .BI -R \ STR SAM read group line in a format like -.B @RG\\\\tID:foo\\\\tSM:bar +.RB @RG\\\\tID:foo\\\\tSM:bar []. .TP .B -c @@ -249,6 +282,11 @@ is given, .I short is assumed. [none] .TP +.BI --seed \ INT +Integer seed for randomizing equally best hits. Minimap2 hashes +.I INT +and read name when choosing between equally best hits. [11] +.TP .BI -t \ INT Number of threads [3]. Minimap2 uses at most three threads when indexing target sequences, and uses up to @@ -271,6 +309,9 @@ K/M/G/k/m/g suffix is accepted. A large helps load balancing in the multi-threading mode, at the cost of increased memory. .TP +.BR --secondary [= yes | no ] +Whether to output secondary alignments [yes] +.TP .B --version Print version number to stdout .SS Preset options @@ -343,9 +384,9 @@ tag ignores introns to demote hits to pseudogenes. .B sr Short single-end reads without splicing .RB ( -k21 -.B -w11 -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g200 -.B -2K50m --frag -.BR --sr ). +.B -w11 --sr --frag -A2 -B8 -O12,32 -E2,1 -r50 -p.5 -N20 -f1000,5000 -n2 -m20 +.B -s40 -g200 -2K50m +.BR --secondary=no ). .RE .SS Miscellaneous options .TP 10 @@ -358,7 +399,7 @@ multi-threading mode. .B --print-qname Print query names to stderr, mostly to see which query is crashing minimap2. .TP -.B --print-seed +.B --print-seeds Print seed positions to stderr, for debugging only. .SH OUTPUT FORMAT .PP diff --git a/setup.py b/setup.py index ab72226..e5f6bc3 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def readme(): setup( name = 'mappy', - version = '2.2', + version = '2.3', url = 'https://github.com/lh3/minimap2', description = 'Minimap2 python binding', long_description = readme(),