r309: improved SAM header output

This commit is contained in:
Heng Li 2017-08-25 10:35:58 +08:00
parent 240f6caaff
commit 0fe1a224ab
9 changed files with 61 additions and 42 deletions

View File

@ -61,7 +61,7 @@ static int mm_check_zdrop(const uint8_t *qseq, const uint8_t *tseq, uint32_t n_c
return 0;
}
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e, int q_intron)
static void mm_update_extra(mm_extra_t *p, const uint8_t *qseq, const uint8_t *tseq, const int8_t *mat, int8_t q, int8_t e)
{
uint32_t k, l, toff = 0, qoff = 0;
int32_t s = 0, max = 0, n_gtag = 0, n_ctac = 0;
@ -374,7 +374,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
assert(re1 - rs1 <= re0 - rs0);
if (r->p) {
mm_idx_getseq(mi, rid, rs1, re1, tseq);
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0);
mm_update_extra(r->p, &qseq0[r->rev][qs1], tseq, mat, opt->q, opt->e);
if (rev && r->p->trans_strand)
r->p->trans_strand ^= 3; // flip to the read strand
}
@ -417,7 +417,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
if (ez->n_cigar == 0) goto end_align1_inv; // should never be here
mm_append_cigar(r_inv, ez->n_cigar, ez->cigar);
r_inv->p->dp_score = ez->max;
mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e, (opt->flag&MM_F_SPLICE)? opt->q2 : 0);
mm_update_extra(r_inv->p, qseq + q_off, tseq + t_off, mat, opt->q, opt->e);
r_inv->id = -1;
r_inv->parent = MM_PARENT_UNSET;
r_inv->inv = 1;

4
bseq.c
View File

@ -8,7 +8,6 @@
KSEQ_INIT(gzFile, gzread)
struct mm_bseq_file_s {
int is_eof;
gzFile fp;
kseq_t *ks;
};
@ -53,12 +52,11 @@ mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int
size += seqs[n++].l_seq;
if (size >= chunk_size) break;
}
if (size < chunk_size) fp->is_eof = 1;
*n_ = n;
return seqs;
}
int mm_bseq_eof(mm_bseq_file_t *fp)
{
return fp->is_eof;
return ks_eof(fp->ks->f);
}

View File

@ -6,7 +6,7 @@
#include "kalloc.h"
#include "mmpriv.h"
static char *mm_rg_line, mm_rg_id[256];
static char mm_rg_id[256];
static inline void str_enlarge(kstring_t *s, int l)
{
@ -58,15 +58,13 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
s->s[s->l] = 0;
}
static inline char *mm_escape(char *s)
static char *mm_escape(char *s)
{
char *p, *q;
for (p = q = s; *p; ++p) {
if (*p == '\\') {
++p;
if (*p == 't') *q++ = '\t';
else if (*p == 'n') *q++ = '\n';
else if (*p == 'r') *q++ = '\r';
else if (*p == '\\') *q++ = '\\';
} else *q++ = *p;
}
@ -74,44 +72,56 @@ static inline char *mm_escape(char *s)
return s;
}
void mm_set_rg(const char *s)
static void sam_write_rg_line(kstring_t *str, const char *s)
{
char *p, *q, *r, *rg_line = 0;
memset(mm_rg_id, 0, 256);
if (mm_rg_line) {
free(mm_rg_line);
mm_rg_line = 0;
}
if (s == 0) return;
if (strstr(s, "@RG") != s) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line is not started with @RG\n");
goto err_set_rg;
}
if (strstr(s, "\t") != NULL) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n", __func__);
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n");
goto err_set_rg;
}
rg_line = strdup(s);
mm_escape(rg_line);
if ((p = strstr(rg_line, "\tID:")) == 0) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__);
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] no ID within the read group line\n");
goto err_set_rg;
}
p += 4;
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
if (q - p + 1 > 256) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
if (mm_verbose >= 1) fprintf(stderr, "[ERROR] @RG:ID is longer than 255 characters\n");
goto err_set_rg;
}
for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
*r++ = *q;
mm_rg_line = rg_line;
return;
mm_sprintf_lite(str, "%s\n", rg_line);
err_set_rg:
free(rg_line);
}
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[])
{
kstring_t str = {0,0,0};
sam_write_rg_line(&str, rg);
mm_sprintf_lite(&str, "@PG\tID:minimap2\tPN:minimap2");
if (ver) mm_sprintf_lite(&str, "\tVN:%s", ver);
if (argc > 1) {
int i;
mm_sprintf_lite(&str, "\tCL:minimap2");
for (i = 1; i < argc; ++i)
mm_sprintf_lite(&str, " %s", argv[i]);
}
mm_sprintf_lite(&str, "\n");
fputs(str.s, stdout);
free(str.s);
}
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
{
extern unsigned char seq_nt4_table[256];
@ -214,12 +224,11 @@ static char comp_tab[] = {
'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
};
void sam_write_sam_header(const mm_idx_t *idx)
void mm_write_sam_SQ(const mm_idx_t *idx)
{
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
if (mm_rg_line) puts(mm_rg_line);
}
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
@ -274,8 +283,7 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
else mm_sprintf_lite(s, "*");
}
write_tags(s, r);
if (mm_rg_line && mm_rg_id[0])
mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
if (mm_rg_id[0]) mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist
int i, n_sa = 0; // n_sa: number of SA fields
for (i = 0; i < n_regs; ++i)

View File

@ -285,12 +285,12 @@ static void *worker_pipeline(void *shared, int step, void *in)
mm_idx_t *mm_idx_gen(mm_bseq_file_t *fp, int w, int k, int b, int is_hpc, int mini_batch_size, int n_threads, uint64_t batch_size, int keep_name)
{
pipeline_t pl;
if (fp == 0 || mm_bseq_eof(fp)) return 0;
memset(&pl, 0, sizeof(pipeline_t));
pl.mini_batch_size = mini_batch_size < batch_size? mini_batch_size : batch_size;
pl.keep_name = keep_name;
pl.batch_size = batch_size;
pl.fp = fp;
if (pl.fp == 0) return 0;
pl.mi = mm_idx_init(w, k, b, is_hpc);
kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3);

26
main.c
View File

@ -8,7 +8,7 @@
#include "minimap.h"
#include "mmpriv.h"
#define MM_VERSION "2.0-r301-dirty"
#define MM_VERSION "2.0-r309-dirty"
void liftrlimit()
{
@ -33,6 +33,7 @@ static struct option long_options[] = {
{ "print-aln-seq", no_argument, 0, 0 },
{ "splice", no_argument, 0, 0 },
{ "cost-non-gt-ag", required_argument, 0, 0 },
{ "no-sam-sq", no_argument, 0, 0 },
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
@ -58,11 +59,11 @@ static inline int64_t mm_parse_num(const char *str)
int main(int argc, char *argv[])
{
mm_mapopt_t opt;
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0;
int i, c, k = 15, w = -1, bucket_bits = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0, long_idx, idx_par_set = 0, max_intron_len = 0, n_idx_part = 0;
int minibatch_size = 200000000;
uint64_t batch_size = 4000000000ULL;
mm_bseq_file_t *fp = 0;
char *fnw = 0, *s;
char *fnw = 0, *rg = 0, *s;
FILE *fpr = 0, *fpw = 0, *fp_help = stderr;
liftrlimit();
@ -97,7 +98,7 @@ int main(int argc, char *argv[])
else if (c == 's') opt.min_dp_max = atoi(optarg);
else if (c == 'I') batch_size = mm_parse_num(optarg);
else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg);
else if (c == 'R') mm_set_rg(optarg); // WARNING: this modifies global variables in format.c
else if (c == 'R') rg = optarg;
else if (c == 'h') fp_help = stdout;
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits
else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname
@ -110,6 +111,7 @@ int main(int argc, char *argv[])
else if (c == 0 && long_idx == 9) mm_dbg_flag |= MM_DBG_PRINT_QNAME | MM_DBG_PRINT_ALN_SEQ; // --print-aln-seq
else if (c == 0 && long_idx ==10) opt.flag |= MM_F_SPLICE; // --splice
else if (c == 0 && long_idx ==11) opt.noncan = atoi(optarg); // --cost-non-gt-ag
else if (c == 0 && long_idx ==12) opt.flag |= MM_F_NO_SAM_SQ; // --no-sam-sq
else if (c == 'V') {
puts(MM_VERSION);
return 0;
@ -222,27 +224,31 @@ int main(int argc, char *argv[])
is_idx = mm_idx_is_idx(argv[optind]);
if (is_idx < 0) {
fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]);
fprintf(stderr, "[ERROR] failed to open file '%s'\n", argv[optind]);
return 1;
}
if (!is_idx && fnw == 0 && argc - optind < 2) {
fprintf(stderr, "[E::%s] missing input: please specify a query file or option -d\n", __func__);
fprintf(stderr, "[ERROR] missing input: please specify a query file to map or option -d to keep the index\n");
return 1;
}
if (is_idx) fpr = fopen(argv[optind], "rb");
else fp = mm_bseq_open(argv[optind]);
if (fnw) fpw = fopen(fnw, "wb");
if (opt.flag & MM_F_OUT_SAM)
mm_write_sam_hdr_no_SQ(rg, MM_VERSION, argc, argv);
for (;;) {
mm_idx_t *mi = 0;
mm_idx_t *mi;
if (fpr) {
mi = mm_idx_load(fpr);
if (idx_par_set && mm_verbose >= 2 && (mi->k != k || mi->w != w || mi->is_hpc != is_hpc))
fprintf(stderr, "[W::%s::%.3f*%.2f] Indexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\n",
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0));
} else if (!mm_bseq_eof(fp)) {
fprintf(stderr, "[WARNING] \033[1;31mIndexing parameters on the command line (-k/-w/-H) overridden by parameters in the prebuilt index.\033[0m\n");
} else {
mi = mm_idx_gen(fp, w, k, bucket_bits, is_hpc, minibatch_size, n_threads, batch_size, keep_name);
}
if (mi == 0) break;
++n_idx_part;
if (mm_verbose >= 2 && n_idx_part > 1 && (opt.flag&MM_F_OUT_SAM) && !(opt.flag&MM_F_NO_SAM_SQ))
fprintf(stderr, "[WARNING] \033[1;31mSAM output is malformated due to internal @SQ lines. Please add option --no-sam-sq or filter afterwards.\033[0m\n");
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n",
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);

3
map.c
View File

@ -385,7 +385,8 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
if (pl.fp == 0) return -1;
pl.opt = opt, pl.mi = idx;
pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size;
if (opt->flag & MM_F_OUT_SAM) sam_write_sam_header(idx);
if ((opt->flag & MM_F_OUT_SAM) && !(opt->flag & MM_F_NO_SAM_SQ))
mm_write_sam_SQ(idx);
kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
free(pl.str.s);
mm_bseq_close(pl.fp);

View File

@ -18,6 +18,7 @@
#define MM_F_SPLICE_FOR 0x100
#define MM_F_SPLICE_REV 0x200
#define MM_F_SPLICE_BOTH 0x400
#define MM_F_NO_SAM_SQ 0x800
#define MM_IDX_MAGIC "MMI\2"

View File

@ -1,4 +1,4 @@
.TH minimap2 1 "13 August 2017" "minimap2-2.0-r296-dirty" "Bioinformatics tools"
.TH minimap2 1 "25 August 2017" "minimap2-2.0-r309-dirty" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@ -255,8 +255,13 @@ and
use
.BR -K500m .
.TP
.B -V
.B --version
Print version number to stdout
.TP
.B --no-sam-hdr
Don't output SAM header lines. Use this option if the index consists of
multiple parts; otherwise the SAM output is malformated due to internal header
lines.
.SS Preset options
.TP 10
.BI -x \ STR
@ -320,7 +325,7 @@ is that this preset is not using HPC minimizers.
Long-read spliced alignment
.RB ( -k15
.B -w5 --splice -g2000 -G200k -A1 -B2 -O2,32 -E1,0 -z200 -ub --cost-non-gt-ag
.BR 4 ).
.BR 5 ).
In the splice mode, 1) long deletions are taken as introns and represented as
the
.RB ` N '

View File

@ -40,8 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_set_rg(const char *s);
void sam_write_sam_header(const mm_idx_t *idx);
void mm_write_sam_SQ(const mm_idx_t *idx);
void mm_write_sam_hdr_no_SQ(const char *rg, const char *ver, int argc, char *argv[]);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs);
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);