Merge branch 'master' into splice

This commit is contained in:
Heng Li 2017-08-17 23:39:27 +08:00
commit 64c1389e1a
5 changed files with 131 additions and 58 deletions

View File

@ -6,6 +6,8 @@
#include "kalloc.h" #include "kalloc.h"
#include "mmpriv.h" #include "mmpriv.h"
static char *mm_rg_line, mm_rg_id[256];
static inline void str_enlarge(kstring_t *s, int l) static inline void str_enlarge(kstring_t *s, int l)
{ {
if (s->l + l + 1 > s->m) { if (s->l + l + 1 > s->m) {
@ -56,6 +58,60 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
s->s[s->l] = 0; s->s[s->l] = 0;
} }
static inline char *mm_escape(char *s)
{
char *p, *q;
for (p = q = s; *p; ++p) {
if (*p == '\\') {
++p;
if (*p == 't') *q++ = '\t';
else if (*p == 'n') *q++ = '\n';
else if (*p == 'r') *q++ = '\r';
else if (*p == '\\') *q++ = '\\';
} else *q++ = *p;
}
*q = '\0';
return s;
}
void mm_set_rg(const char *s)
{
char *p, *q, *r, *rg_line = 0;
memset(mm_rg_id, 0, 256);
if (mm_rg_line) {
free(mm_rg_line);
mm_rg_line = 0;
}
if (s == 0) return;
if (strstr(s, "@RG") != s) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
goto err_set_rg;
}
if (strstr(s, "\t") != NULL) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n", __func__);
goto err_set_rg;
}
rg_line = strdup(s);
mm_escape(rg_line);
if ((p = strstr(rg_line, "\tID:")) == 0) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__);
goto err_set_rg;
}
p += 4;
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
if (q - p + 1 > 256) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
goto err_set_rg;
}
for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
*r++ = *q;
mm_rg_line = rg_line;
return;
err_set_rg:
free(rg_line);
}
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
{ {
extern unsigned char seq_nt4_table[256]; extern unsigned char seq_nt4_table[256];
@ -162,6 +218,14 @@ static char comp_tab[] = {
'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
}; };
void sam_write_sam_header(const mm_idx_t *idx)
{
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
if (mm_rg_line) puts(mm_rg_line);
}
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
{ {
if (rev) { if (rev) {
@ -218,6 +282,8 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
else mm_sprintf_lite(s, "*"); else mm_sprintf_lite(s, "*");
} }
write_tags(s, r); write_tags(s, r);
if (mm_rg_line && mm_rg_id[0])
mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist
int i, n_sa = 0; // n_sa: number of SA fields int i, n_sa = 0; // n_sa: number of SA fields
for (i = 0; i < n_regs; ++i) for (i = 0; i < n_regs; ++i)

104
main.c
View File

@ -33,6 +33,7 @@ static struct option long_options[] = {
{ "print-aln-seq", no_argument, 0, 0 }, { "print-aln-seq", no_argument, 0, 0 },
{ "splice", no_argument, 0, 0 }, { "splice", no_argument, 0, 0 },
{ "cost-non-gt-ag", required_argument, 0, 0 }, { "cost-non-gt-ag", required_argument, 0, 0 },
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' }, { "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' }, { "version", no_argument, 0, 'V' },
{ "min-count", required_argument, 0, 'n' }, { "min-count", required_argument, 0, 'n' },
@ -62,13 +63,13 @@ int main(int argc, char *argv[])
uint64_t batch_size = 4000000000ULL; uint64_t batch_size = 4000000000ULL;
mm_bseq_file_t *fp = 0; mm_bseq_file_t *fp = 0;
char *fnw = 0, *s; char *fnw = 0, *s;
FILE *fpr = 0, *fpw = 0; FILE *fpr = 0, *fpw = 0, *fp_help = stderr;
liftrlimit(); liftrlimit();
mm_realtime0 = realtime(); mm_realtime0 = realtime();
mm_mapopt_init(&opt); mm_mapopt_init(&opt);
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:", long_options, &long_idx)) >= 0) { while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:h", long_options, &long_idx)) >= 0) {
if (c == 'w') w = atoi(optarg), idx_par_set = 1; if (c == 'w') w = atoi(optarg), idx_par_set = 1;
else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
else if (c == 'H') is_hpc = 1, idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1;
@ -96,6 +97,8 @@ int main(int argc, char *argv[])
else if (c == 's') opt.min_dp_max = atoi(optarg); else if (c == 's') opt.min_dp_max = atoi(optarg);
else if (c == 'I') batch_size = mm_parse_num(optarg); else if (c == 'I') batch_size = mm_parse_num(optarg);
else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg); else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg);
else if (c == 'R') mm_set_rg(optarg); // WARNING: this modifies global variables in format.c
else if (c == 'h') fp_help = stdout;
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits
else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname
else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc
@ -166,54 +169,55 @@ int main(int argc, char *argv[])
if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0) if ((opt.flag & MM_F_SPLICE) && max_intron_len > 0)
opt.max_gap_ref = opt.bw = max_intron_len; opt.max_gap_ref = opt.bw = max_intron_len;
if (argc == optind) { if (argc == optind || fp_help == stdout) {
fprintf(stderr, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n"); fprintf(fp_help, "Usage: minimap2 [options] <target.fa>|<target.idx> [query.fa] [...]\n");
fprintf(stderr, "Options:\n"); fprintf(fp_help, "Options:\n");
fprintf(stderr, " Indexing:\n"); fprintf(fp_help, " Indexing:\n");
fprintf(stderr, " -H use homopolymer-compressed k-mer\n"); fprintf(fp_help, " -H use homopolymer-compressed k-mer\n");
fprintf(stderr, " -k INT k-mer size (no larger than 28) [%d]\n", k); fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", k);
fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n"); fprintf(fp_help, " -w INT minizer window size [{-k}*2/3]\n");
fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n"); fprintf(fp_help, " -I NUM split index for every ~NUM input bases [4G]\n");
fprintf(stderr, " -d FILE dump index to FILE []\n"); fprintf(fp_help, " -d FILE dump index to FILE []\n");
fprintf(stderr, " Mapping:\n"); fprintf(fp_help, " Mapping:\n");
fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac); fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
fprintf(stderr, " -g INT stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap); fprintf(fp_help, " -g INT stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
fprintf(stderr, " -r INT bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw); fprintf(fp_help, " -r INT bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
fprintf(stderr, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt); fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
fprintf(stderr, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score); fprintf(fp_help, " -m INT minimal chaining score (matching bases minus log gap penalty) [%d]\n", opt.min_chain_score);
// fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy // fprintf(fp_help, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // TODO: this option is never used; might be buggy
fprintf(stderr, " -X skip self and dual mappings (for the all-vs-all mode)\n"); fprintf(fp_help, " -X skip self and dual mappings (for the all-vs-all mode)\n");
fprintf(stderr, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio); fprintf(fp_help, " -p FLOAT min secondary-to-primary score ratio [%g]\n", opt.pri_ratio);
fprintf(stderr, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n); fprintf(fp_help, " -N INT retain at most INT secondary alignments [%d]\n", opt.best_n);
fprintf(stderr, " -G NUM max intron length (only effective following -x splice) [200k]\n"); fprintf(fp_help, " -G NUM max intron length (only effective following -x splice) [200k]\n");
fprintf(stderr, " Alignment:\n"); fprintf(fp_help, " Alignment:\n");
fprintf(stderr, " -A INT matching score [%d]\n", opt.a); fprintf(fp_help, " -A INT matching score [%d]\n", opt.a);
fprintf(stderr, " -B INT mismatch penalty [%d]\n", opt.b); fprintf(fp_help, " -B INT mismatch penalty [%d]\n", opt.b);
fprintf(stderr, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2); fprintf(fp_help, " -O INT[,INT] gap open penalty [%d,%d]\n", opt.q, opt.q2);
fprintf(stderr, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2); fprintf(fp_help, " -E INT[,INT] gap extension penalty; a k-long gap costs min{O1+k*E1,O2+k*E2} [%d,%d]\n", opt.e, opt.e2);
fprintf(stderr, " -z INT Z-drop score [%d]\n", opt.zdrop); fprintf(fp_help, " -z INT Z-drop score [%d]\n", opt.zdrop);
fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(fp_help, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
fprintf(stderr, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); fprintf(fp_help, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
fprintf(stderr, " Input/Output:\n"); fprintf(fp_help, " Input/Output:\n");
fprintf(stderr, " -Q ignore base quality in the input\n"); fprintf(fp_help, " -a output in the SAM format (PAF by default)\n");
fprintf(stderr, " -a output in the SAM format (PAF by default)\n"); fprintf(fp_help, " -Q don't output base quality in SAM\n");
fprintf(stderr, " -c output CIGAR in PAF\n"); fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
fprintf(stderr, " -S output the cs tag in PAF\n"); fprintf(fp_help, " -c output CIGAR in PAF\n");
fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); fprintf(fp_help, " -S output the cs tag in PAF (cs encodes both query and ref sequences)\n");
fprintf(stderr, " -K NUM minibatch size [200M]\n"); fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads);
// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); fprintf(fp_help, " -K NUM minibatch size [200M]\n");
fprintf(stderr, " -V show version number\n"); // fprintf(fp_help, " -v INT verbose level [%d]\n", mm_verbose);
fprintf(stderr, " Preset:\n"); fprintf(fp_help, " --version show version number\n");
fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); fprintf(fp_help, " Preset:\n");
fprintf(stderr, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n"); fprintf(fp_help, " -x STR preset (recommended to be applied before other options) []\n");
fprintf(stderr, " map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n"); fprintf(fp_help, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n");
fprintf(stderr, " asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n"); fprintf(fp_help, " map-ont: -k15 (slightly more sensitive than 'map10k' for ONT vs reference)\n");
fprintf(stderr, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n"); fprintf(fp_help, " asm5: -k19 -w19 -A1 -B19 -O39,81 -E3,1 -s200 -z200 (asm to ref mapping; break at 5%% div.)\n");
fprintf(stderr, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n"); fprintf(fp_help, " asm10: -k19 -w19 -A1 -B9 -O16,41 -E2,1 -s200 -z200 (asm to ref mapping; break at 10%% div.)\n");
fprintf(stderr, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n"); fprintf(fp_help, " ava-pb: -Hk19 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (PacBio read overlap)\n");
fprintf(stderr, " splice: long-read spliced alignment (see minimap2.1 for details)\n"); fprintf(fp_help, " ava-ont: -k15 -w5 -Xp0 -m100 -g10000 -K500m --max-chain-skip 25 (ONT read overlap)\n");
fprintf(stderr, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n"); fprintf(fp_help, " splice: long-read spliced alignment (see minimap2.1 for details)\n");
return 1; fprintf(fp_help, "\nSee `man ./minimap2.1' for detailed description of command-line options.\n");
return fp_help == stdout? 0 : 1;
} }
is_idx = mm_idx_is_idx(argv[optind]); is_idx = mm_idx_is_idx(argv[optind]);

6
map.c
View File

@ -388,11 +388,7 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
if (pl.fp == 0) return -1; if (pl.fp == 0) return -1;
pl.opt = opt, pl.mi = idx; pl.opt = opt, pl.mi = idx;
pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size; pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size;
if (opt->flag & MM_F_OUT_SAM) { if (opt->flag & MM_F_OUT_SAM) sam_write_sam_header(idx);
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
}
kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
free(pl.str.s); free(pl.str.s);
mm_bseq_close(pl.fp); mm_bseq_close(pl.fp);

View File

@ -219,13 +219,18 @@ no attempt to match GT-AG [n]
Cost of non-canonical splicing sites [0]. Cost of non-canonical splicing sites [0].
.SS Input/output options .SS Input/output options
.TP 10 .TP 10
.B -Q
Ignore base quality in the input file.
.TP
.B -a .B -a
Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF
by default. by default.
.TP .TP
.B -Q
Ignore base quality in the input file.
.TP
.BI -R \ STR
SAM read group line in a format like
.B @RG\\\\tID:foo\\\\tSM:bar
[].
.TP
.B -c .B -c
Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag. Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
.TP .TP

View File

@ -40,6 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
void radix_sort_64(uint64_t *beg, uint64_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_set_rg(const char *s);
void sam_write_sam_header(const mm_idx_t *idx);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres);
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);