From 2338e887d96d64d55e5aa13763687d7838f1c489 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 1 Jul 2017 11:25:54 -0400 Subject: [PATCH] finished the first draft of manpage --- chain.c | 2 +- main.c | 8 +-- minimap2.1 | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 152 insertions(+), 9 deletions(-) diff --git a/chain.c b/chain.c index e2be85c..4cefb1e 100644 --- a/chain.c +++ b/chain.c @@ -50,7 +50,7 @@ int mm_chain_dp(int max_dist, int bw, int max_skip, int min_cnt, int min_sc, int dd = dr > dq? dr - dq : dq - dr; if (dd > bw) continue; sc = dq > q_span && dr > q_span? q_span : dq < dr? dq : dr; - sc = f[j] + sc - (dd? ilog2_32(dd) : 0); + sc = f[j] + sc - (dd? ilog2_32(dd) : 0); // TODO: consider to also penalize the shortest distance if (sc > max_f) max_f = sc, max_j = j; } if (max_j >= 0) f[i] = max_f, p[i] = max_j; diff --git a/main.c b/main.c index dea1204..25579e7 100644 --- a/main.c +++ b/main.c @@ -135,14 +135,14 @@ int main(int argc, char *argv[]) fprintf(stderr, " -d FILE dump index to FILE []\n"); fprintf(stderr, " Mapping:\n"); fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac); - fprintf(stderr, " -g INT split a mapping if there are no minimizers in INT-bp [%d]\n", opt.max_gap); - fprintf(stderr, " -r INT bandwidth [%d]\n", opt.bw); - fprintf(stderr, " -n INT minimal number of minimizers [%d]\n", opt.min_cnt); + fprintf(stderr, " -g INT stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap); + fprintf(stderr, " -r INT bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw); + fprintf(stderr, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt); fprintf(stderr, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score); // fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy 
fprintf(stderr, " -S skip self and dual mappings (for the all-vs-all mode)\n"); fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio); - fprintf(stderr, " -D FLOAT min fraction of seed matches [%g]\n", opt.min_seedcov_ratio); + fprintf(stderr, " -D FLOAT min fraction of minimizer matches [%g]\n", opt.min_seedcov_ratio); fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); fprintf(stderr, " ava10k: -Hk19 -Sw5 -p0 -m100 -D.05 (PacBio/ONT all-vs-all read mapping)\n"); fprintf(stderr, " map10k: -Hk19 (PacBio/ONT vs reference mapping)\n"); diff --git a/minimap2.1 b/minimap2.1 index fdc2e45..7bf9d4e 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -8,6 +8,13 @@ minimap2 - mapping and alignment between collections of DNA sequences * Indexing the target sequences (optional): .RS 4 minimap2 +.RB [ -x +.IR preset ] +.B -d +.I target.mmi +.I target.fa +.br +minimap2 .RB [ -H ] .RB [ -k .IR kmer ] @@ -74,18 +81,15 @@ SAM format. .TP 10 .BI -k \ INT Minimizer k-mer length [17] - .TP .BI -w \ INT Minimizer window size [2/3 of k-mer length]. A minimizer is the smallest k-mer in a window of w consecutive k-mers. - .TP .B -H Use homopolymer-compressed (HPC) minimizers. An HPC sequence is constructed by contracting homopolymer runs to a single base. An HPC minimizer is a minimizer on the HPC sequence. - .TP .BI -I \ NUM Load at most @@ -100,7 +104,6 @@ multiple times to map it against each batch of target sequences. .I NUM may be ending with k/K/m/M/g/G. NB: mapping quality is incorrect given a multi-part index. - .TP .BI -d \ FILE Save the minimizer index of @@ -116,8 +119,125 @@ to Ignore top .I FLOAT fraction of most frequent minimizers [0.0002] +.TP +.BI -g \ INT +Stop chain elongation if there are no minimizers in +.IR INT -bp +[10000]. +.TP +.BI -r \ INT +Bandwidth used in chaining and DP-based alignment [1000]. This option +approximately controls the maximum gap size. 
+.TP +.BI -n \ INT +Discard chains consisting of +.RI < INT +number of minimizers [3] +.TP +.BI -m \ INT +Discard chains with chaining score +.RI < INT +[40]. Chaining score equals the approximate number of matching bases (exact if +not using +.BR -H ) +minus base-2 logarithm gap penalty. It is computed with dynamic programming. +.TP +.B -S +Perform all-vs-all mapping. In this mode, if the query sequence name is +lexicographically larger than the target sequence name, the hits between them +will be suppressed; if the query sequence name is the same as the target name, +diagonal minimizer hits will also be suppressed. +.TP +.BI -p \ FLOAT +Minimal secondary-to-primary score ratio to output secondary mappings [2]. +Between two chains overlapping over half of the shorter chain (controlled by +.BR --mask-level ), +the chain with a lower score is secondary to the chain with a higher score. +If the ratio of the scores is below +.IR FLOAT , +the secondary chain will not be outputted or extended with DP alignment later. +The default value suppresses all secondary chains. +.TP +.BI -D \ FLOAT +Discard a chain if the fraction of matching bases over the length of +query/target sequences in the chain is +.RI < FLOAT +[0]. +.TP +.BI -x \ STR +Preset []. This option applies multiple options at the same time. It should be +applied before other options because options applied later will overwrite the +values set by +.BR -x . +Available +.I STR +are: +.RS +.TP 8 +.B ava10k +PacBio/Oxford Nanopore all-vs-all overlap mapping (-Hk19 -Sw5 -p0 -m100 -D.05) +.TP +.B map10k +PacBio/Oxford Nanopore read to reference mapping (-Hk19) +.TP +.B asm1m +Long assembly to reference mapping (-k19 -w19) +.RE + +.SS Alignment options + +.TP 10 +.BI -A \ INT +Matching score [1] +.TP +.BI -B \ INT +Mismatching penalty [2] +.TP +.BI -O \ INT +Gap open penalty [2] +.TP +.BI -E \ INT +Gap extension penalty [1]. A gap of length +.I l +costs +.RI {-O}+{-E}* l . 
+.TP +.BI -z \ INT +Break an alignment if the running score drops too quickly along the diagonal of +the DP matrix (diagonal X-drop, or Z-drop) [200]. Increasing the value improves +the contiguity of the alignment at the cost of poor alignment in the middle +(e.g. caused by a long inversion). +.TP +.BI -s \ INT +Minimal peak DP alignment score to output [40]. The peak score is computed from +the final CIGAR. It is the score of the max scoring segment in the alignment +and may be different from the total alignment score. + +.SS Input/output options + +.TP 10 +.B -b +Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF +by default. +.TP +.B -c +Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag. +.TP +.BI -t \ INT +Number of threads [3]. Minimap2 uses at most three threads when collecting +minimizers on target sequences, and uses up to +.IR INT +1 +threads when mapping (the extra thread is for I/O, which is frequently idle and +takes little CPU time). +.TP +.B -V +Print version number to stdout .SH OUTPUT FORMAT +.PP +Minimap2 outputs mapping positions in the Pairwise mApping Format (PAF) by +default. PAF is a TAB-delimited text format with each line consisting of at +least 12 fields as described in the following table: .TS center box; @@ -139,6 +259,29 @@ _ 12 int Mapping quality (0-255 with 255 for missing) .TE +.PP +When alignment is available, column 11 gives the total number of sequence +matches, mismatches and gaps in the alignment; column 10 divided by column 11 +gives the BLAST-like alignment identity. When alignment is unavailable, +these two columns are approximate. PAF may optionally have additional fields in +the SAM-like typed key-value format. Minimap2 may output the following tags: + +.TS +center box; +cb | cb | cb +r | c | l . 
+Tag Type Description +_ +cm i Number of minimizers on the chain +s1 i Chaining score +s2 i Chaining score of the best secondary chain +NM i Total number of mismatches and gaps in the alignment +AS i DP alignment score +ms i DP score of the max scoring segment in the alignment +nn i Number of ambiguous bases in the alignment +cg Z CIGAR string (only in PAF) +.TE + .SH SEE ALSO .PP miniasm(1), minimap(1), bwa(1).