r309: improved SAM header output
This commit is contained in:
parent
240f6caaff
commit
0fe1a224ab
6
align.c
6
align.c
|
|
@ -61,7 +61,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron)
|
||||
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
|
||||
{
|
||||
uint32_t k, l, toff = 0, qoff = 0;
|
||||
int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0;
|
||||
|
|
@ -374,7 +374,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
|
|||
assert(re1 - rs1 <= re0 - rs0);
|
||||
if (r->p) {
|
||||
mm_idx_getseq(mi, rid, rs1, re1, tseq);
|
||||
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0);
|
||||
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
|
||||
if (rev && r->p->trans_strand)
|
||||
r->p->trans_strand ^= 3; // flip to the read strand
|
||||
}
|
||||
|
|
@ -417,7 +417,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
|
|||
if (ez->n_cigar == 0) goto end_align1_inv; // should never be here
|
||||
mm_append_cigar(r_inv, ez->n_cigar, ez->cigar);
|
||||
r_inv->p->dp_score = ez->max;
|
||||
mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0);
|
||||
mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e);
|
||||
r_inv->id = -1;
|
||||
r_inv->parent = MM_PARENT_UNSET;
|
||||
r_inv->inv = 1;
|
||||
|
|
|
|||
4
bseq.c
4
bseq.c
|
|
@ -8,7 +8,6 @@
|
|||
KSEQ_INIT(gzFile, gzread)
|
||||
|
||||
struct mm_bseq_file_s {
|
||||
int is_eof;
|
||||
gzFile fp;
|
||||
kseq_t *ks;
|
||||
};
|
||||
|
|
@ -53,12 +52,11 @@ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int
|
|||
size += seqs[n++].l_seq;
|
||||
if (size >= chunk_size) break;
|
||||
}
|
||||
if (size < chunk_size) fp->is_eof = 1;
|
||||
*n_ = n;
|
||||
return seqs;
|
||||
}
|
||||
|
||||
int mm_bseq_eof(mm_bseq_file_t *fp)
|
||||
{
|
||||
return fp->is_eof;
|
||||
return ks_eof(fp->ks->f);
|
||||
}
|
||||
|
|
|
|||
46
format.c
46
format.c
|
|
@ -6,7 +6,7 @@
|
|||
#include "kalloc.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
static char *mm_rg_line, mm_rg_id[256];
|
||||
static char mm_rg_id[256];
|
||||
|
||||
static inline void str_enlarge(kstring_t *s, int l)
|
||||
{
|
||||
|
|
@ -58,15 +58,13 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
|
|||
s->s[s->l] = 0;
|
||||
}
|
||||
|
||||
static inline char *mm_escape(char *s)
|
||||
static char *mm_escape(char *s)
|
||||
{
|
||||
char *p, *q;
|
||||
for (p = q = s; *p; ++p) {
|
||||
if (*p == '\\') {
|
||||
++p;
|
||||
if (*p == 't') *q++ = '\t';
|
||||
else if (*p == 'n') *q++ = '\n';
|
||||
else if (*p == 'r') *q++ = '\r';
|
||||
else if (*p == '\\') *q++ = '\\';
|
||||
} else *q++ = *p;
|
||||
}
|
||||
|
|
@ -74,44 +72,56 @@ static inline char *mm_escape(char *s)
|
|||
return s;
|
||||
}
|
||||
|
||||
void mm_set_rg(const char *s)
|
||||
static void sam_write_rg_line(kstring_t *str, const char *s)
|
||||
{
|
||||
char *p, *q, *r, *rg_line = 0;
|
||||
memset(mm_rg_id, 0, 256);
|
||||
if (mm_rg_line) {
|
||||
free(mm_rg_line);
|
||||
mm_rg_line = 0;
|
||||
}
|
||||
if (s == 0) return;
|
||||
if (strstr(s, "@RG") != s) {
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line is not started with @RG\n");
|
||||
goto err_set_rg;
|
||||
}
|
||||
if (strstr(s, "\t") != NULL) {
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n", __func__);
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n");
|
||||
goto err_set_rg;
|
||||
}
|
||||
rg_line = strdup(s);
|
||||
mm_escape(rg_line);
|
||||
if ((p = strstr(rg_line, "\tID:")) == 0) {
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__);
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] no ID within the read group line\n");
|
||||
goto err_set_rg;
|
||||
}
|
||||
p += 4;
|
||||
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
||||
if (q - p + 1 > 256) {
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
|
||||
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] @RG:ID is longer than 255 characters\n");
|
||||
goto err_set_rg;
|
||||
}
|
||||
for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
||||
*r++ = *q;
|
||||
mm_rg_line = rg_line;
|
||||
return;
|
||||
mm_sprintf_lite(str, "%s\n", rg_line);
|
||||
|
||||
err_set_rg:
|
||||
free(rg_line);
|
||||
}
|
||||
|
||||
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[])
|
||||
{
|
||||
kstring_t str = {0,0,0};
|
||||
sam_write_rg_line(&str, rg);
|
||||
mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2");
|
||||
if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver);
|
||||
if (argc > 1) {
|
||||
int i;
|
||||
mm_sprintf_lite(&str, "\tCL:minimap2");
|
||||
for (i = 1; i < argc; ++i)
|
||||
mm_sprintf_lite(&str, " %s", argv[i]);
|
||||
}
|
||||
mm_sprintf_lite(&str, "\n");
|
||||
fputs(str.s, stdout);
|
||||
free(str.s);
|
||||
}
|
||||
|
||||
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
|
||||
{
|
||||
extern unsigned char seq_nt4_table[256];
|
||||
|
|
@ -214,12 +224,11 @@ static char comp_tab[] = {
|
|||
'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
|
||||
};
|
||||
|
||||
void sam_write_sam_header(const mm_idx_t *idx)
|
||||
void mm_write_sam_SQ(const mm_idx_t *idx)
|
||||
{
|
||||
uint32_t i;
|
||||
for (i = 0; i < idx->n_seq; ++i)
|
||||
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
|
||||
if (mm_rg_line) puts(mm_rg_line);
|
||||
}
|
||||
|
||||
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
|
||||
|
|
@ -274,8 +283,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
else mm_sprintf_lite(s, "*");
|
||||
}
|
||||
write_tags(s, r);
|
||||
if (mm_rg_line && mm_rg_id[0])
|
||||
mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
|
||||
if (mm_rg_id[0]) mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
|
||||
if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist
|
||||
int i, n_sa = 0; // n_sa: number of SA fields
|
||||
for (i = 0; i < n_regs; ++i)
|
||||
|
|
|
|||
2
index.c
2
index.c
|
|
@ -285,12 +285,12 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name)
|
||||
{
|
||||
pipeline_t pl;
|
||||
if (fp == 0 || mm_bseq_eof(fp)) return 0;
|
||||
memset(&pl, 0, sizeof(pipeline_t));
|
||||
pl.mini_batch_size = mini_batch_size < batch_size? mini_batch_size : batch_size;
|
||||
pl.keep_name = keep_name;
|
||||
pl.batch_size = batch_size;
|
||||
pl.fp = fp;
|
||||
if (pl.fp == 0) return 0;
|
||||
pl.mi = mm_idx_init(w, k, b, is_hpc);
|
||||
|
||||
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);
|
||||
|
|
|
|||
26
main.c
26
main.c
|
|
@ -8,7 +8,7 @@
|
|||
#include "minimap.h"
|
||||
#include "mmpriv.h"
|
||||
|
||||
#define MM_VERSION "2.0-r301-dirty"
|
||||
#define MM_VERSION "2.0-r309-dirty"
|
||||
|
||||
void liftrlimit()
|
||||
{
|
||||
|
|
@ -33,6 +33,7 @@ static struct option long_options[] = {
|
|||
{ "print-aln-seq", no_argument, 0, 0 },
|
||||
{ "splice", no_argument, 0, 0 },
|
||||
{ "cost-non-gt-ag", required_argument, 0, 0 },
|
||||
{ "no-sam-sq", no_argument, 0, 0 },
|
||||
{ "help", no_argument, 0, 'h' },
|
||||
{ "max-intron-len", required_argument, 0, 'G' },
|
||||
{ "version", no_argument, 0, 'V' },
|
||||
|
|
@ -58,11 +59,11 @@ static inline int64_t mm_parse_num(const char *str)
|
|||
int main(int argc, char *argv[])
|
||||
{
|
||||
mm_mapopt_t opt;
|
||||
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0;
|
||||
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0, n_idx_part = 0;
|
||||
int minibatch_size = 200000000;
|
||||
uint64_t batch_size = 4000000000ULL;
|
||||
mm_bseq_file_t *fp = 0;
|
||||
char *fnw = 0, *s;
|
||||
char *fnw = 0, *rg = 0, *s;
|
||||
FILE *fpr = 0, *fpw = 0, *fp_help = stderr;
|
||||
|
||||
liftrlimit();
|
||||
|
|
@ -97,7 +98,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 's') opt.min_dp_max = atoi(optarg);
|
||||
else if (c == 'I') batch_size = mm_parse_num(optarg);
|
||||
else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg);
|
||||
else if (c == 'R') mm_set_rg(optarg); // WARNING: this modifies global variables in format.c
|
||||
else if (c == 'R') rg = optarg;
|
||||
else if (c == 'h') fp_help = stdout;
|
||||
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits
|
||||
else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname
|
||||
|
|
@ -110,6 +111,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
|
||||
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
|
||||
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
|
||||
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq
|
||||
else if (c == 'V') {
|
||||
puts(MM_VERSION);
|
||||
return 0;
|
||||
|
|
@ -222,27 +224,31 @@ int main(int argc, char *argv[])
|
|||
|
||||
is_idx = mm_idx_is_idx(argv[optind]);
|
||||
if (is_idx < 0) {
|
||||
fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]);
|
||||
fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]);
|
||||
return 1;
|
||||
}
|
||||
if (!is_idx && fnw == 0 && argc - optind < 2) {
|
||||
fprintf(stderr, "[E::%s] missing input: please specify a query file or option -d\n", __func__);
|
||||
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
|
||||
return 1;
|
||||
}
|
||||
if (is_idx) fpr = fopen(argv[optind], "rb");
|
||||
else fp = mm_bseq_open(argv[optind]);
|
||||
if (fnw) fpw = fopen(fnw, "wb");
|
||||
if (opt.flag & MM_F_OUT_SAM)
|
||||
mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
|
||||
for (;;) {
|
||||
mm_idx_t *mi = 0;
|
||||
mm_idx_t *mi;
|
||||
if (fpr) {
|
||||
mi = mm_idx_load(fpr);
|
||||
if (idx_par_set && mm_verbose >= 2 && (mi->k != k || mi->w != w || mi->is_hpc != is_hpc))
|
||||
fprintf(stderr, "[W::%s::%.3f*%.2f] Indexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\n",
|
||||
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
|
||||
} else if (!mm_bseq_eof(fp)) {
|
||||
fprintf(stderr, "[WARNING] \033[1;31mIndexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\033[0m\n");
|
||||
} else {
|
||||
mi = mm_idx_gen(fp, w, k, bucket_bits, is_hpc, minibatch_size, n_threads, batch_size, keep_name);
|
||||
}
|
||||
if (mi == 0) break;
|
||||
++n_idx_part;
|
||||
if (mm_verbose >= 2 && n_idx_part > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ))
|
||||
fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n");
|
||||
if (mm_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
|
||||
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
|
||||
|
|
|
|||
3
map.c
3
map.c
|
|
@ -385,7 +385,8 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
|
|||
if (pl.fp == 0) return -1;
|
||||
pl.opt = opt, pl.mi = idx;
|
||||
pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size;
|
||||
if (opt->flag & MM_F_OUT_SAM) sam_write_sam_header(idx);
|
||||
if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ))
|
||||
mm_write_sam_SQ(idx);
|
||||
kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
|
||||
free(pl.str.s);
|
||||
mm_bseq_close(pl.fp);
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@
|
|||
#define MM_F_SPLICE_FOR 0x100
|
||||
#define MM_F_SPLICE_REV 0x200
|
||||
#define MM_F_SPLICE_BOTH 0x400
|
||||
#define MM_F_NO_SAM_SQ 0x800
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
|
|
|
|||
11
minimap2.1
11
minimap2.1
|
|
@ -1,4 +1,4 @@
|
|||
.TH minimap2 1 "13 August 2017" "minimap2-2.0-r296-dirty" "Bioinformatics tools"
|
||||
.TH minimap2 1 "25 August 2017" "minimap2-2.0-r309-dirty" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
minimap2 - mapping and alignment between collections of DNA sequences
|
||||
|
|
@ -255,8 +255,13 @@ and
|
|||
use
|
||||
.BR -K500m .
|
||||
.TP
|
||||
.B -V
|
||||
.B --version
|
||||
Print version number to stdout
|
||||
.TP
|
||||
.B --no-sam-hdr
|
||||
Don't output SAM header lines. Use this option if the index consists of
|
||||
multiple parts; otherwise the SAM output is malformated due to internal header
|
||||
lines.
|
||||
.SS Preset options
|
||||
.TP 10
|
||||
.BI -x \ STR
|
||||
|
|
@ -320,7 +325,7 @@ is that this preset is not using HPC minimizers.
|
|||
Long-read spliced alignment
|
||||
.RB ( -k15
|
||||
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag
|
||||
.BR 4 ).
|
||||
.BR 5 ).
|
||||
In the splice mode, 1) long deletions are taken as introns and represented as
|
||||
the
|
||||
.RB ` N '
|
||||
|
|
|
|||
4
mmpriv.h
4
mmpriv.h
|
|
@ -40,8 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
|
|||
void radix_sort_64(uint64_t *beg, uint64_t *end);
|
||||
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
||||
|
||||
void mm_set_rg(const char *s);
|
||||
void sam_write_sam_header(const mm_idx_t *idx);
|
||||
void mm_write_sam_SQ(const mm_idx_t *idx);
|
||||
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]);
|
||||
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
|
||||
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
|
||||
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);
|
||||
|
|
|
|||
Loading…
Reference in New Issue