support inserting RG lines

This commit is contained in:
Heng Li 2017-08-17 23:34:09 +08:00
parent b5f5929bf9
commit bbb37d95f2
5 changed files with 83 additions and 12 deletions

View File

@ -6,6 +6,8 @@
#include "kalloc.h" #include "kalloc.h"
#include "mmpriv.h" #include "mmpriv.h"
static char *mm_rg_line, mm_rg_id[256];
static inline void str_enlarge(kstring_t *s, int l) static inline void str_enlarge(kstring_t *s, int l)
{ {
if (s->l + l + 1 > s->m) { if (s->l + l + 1 > s->m) {
@ -56,6 +58,60 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
s->s[s->l] = 0; s->s[s->l] = 0;
} }
static inline char *mm_escape(char *s)
{
char *p, *q;
for (p = q = s; *p; ++p) {
if (*p == '\\') {
++p;
if (*p == 't') *q++ = '\t';
else if (*p == 'n') *q++ = '\n';
else if (*p == 'r') *q++ = '\r';
else if (*p == '\\') *q++ = '\\';
} else *q++ = *p;
}
*q = '\0';
return s;
}
void mm_set_rg(const char *s)
{
char *p, *q, *r, *rg_line = 0;
memset(mm_rg_id, 0, 256);
if (mm_rg_line) {
free(mm_rg_line);
mm_rg_line = 0;
}
if (s == 0) return;
if (strstr(s, "@RG") != s) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
goto err_set_rg;
}
if (strstr(s, "\t") != NULL) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] the read group line contained literal <tab> characters -- replace with escaped tabs: \\t\n", __func__);
goto err_set_rg;
}
rg_line = strdup(s);
mm_escape(rg_line);
if ((p = strstr(rg_line, "\tID:")) == 0) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] no ID within the read group line\n", __func__);
goto err_set_rg;
}
p += 4;
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
if (q - p + 1 > 256) {
if (mm_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
goto err_set_rg;
}
for (q = p, r = mm_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
*r++ = *q;
mm_rg_line = rg_line;
return;
err_set_rg:
free(rg_line);
}
static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r) static void write_cs(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r)
{ {
extern unsigned char seq_nt4_table[256]; extern unsigned char seq_nt4_table[256];
@ -158,6 +214,14 @@ static char comp_tab[] = {
'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
}; };
void sam_write_sam_header(const mm_idx_t *idx)
{
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
if (mm_rg_line) puts(mm_rg_line);
}
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
{ {
if (rev) { if (rev) {
@ -214,6 +278,8 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
else mm_sprintf_lite(s, "*"); else mm_sprintf_lite(s, "*");
} }
write_tags(s, r); write_tags(s, r);
if (mm_rg_line && mm_rg_id[0])
mm_sprintf_lite(s, "\tRG:Z:%s", mm_rg_id);
if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist if (r->parent == r->id && r->p && n_regs > 1 && regs && r >= regs && r - regs < n_regs) { // supplementary aln may exist
int i, n_sa = 0; // n_sa: number of SA fields int i, n_sa = 0; // n_sa: number of SA fields
for (i = 0; i < n_regs; ++i) for (i = 0; i < n_regs; ++i)

10
main.c
View File

@ -68,7 +68,7 @@ int main(int argc, char *argv[])
mm_realtime0 = realtime(); mm_realtime0 = realtime();
mm_mapopt_init(&opt); mm_mapopt_init(&opt);
while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:", long_options, &long_idx)) >= 0) { while ((c = getopt_long(argc, argv, "aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:", long_options, &long_idx)) >= 0) {
if (c == 'w') w = atoi(optarg), idx_par_set = 1; if (c == 'w') w = atoi(optarg), idx_par_set = 1;
else if (c == 'k') k = atoi(optarg), idx_par_set = 1; else if (c == 'k') k = atoi(optarg), idx_par_set = 1;
else if (c == 'H') is_hpc = 1, idx_par_set = 1; else if (c == 'H') is_hpc = 1, idx_par_set = 1;
@ -96,6 +96,7 @@ int main(int argc, char *argv[])
else if (c == 's') opt.min_dp_max = atoi(optarg); else if (c == 's') opt.min_dp_max = atoi(optarg);
else if (c == 'I') batch_size = mm_parse_num(optarg); else if (c == 'I') batch_size = mm_parse_num(optarg);
else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg); else if (c == 'K') minibatch_size = (int)mm_parse_num(optarg);
else if (c == 'R') mm_set_rg(optarg); // WARNING: this modifies global variables in format.c
else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits else if (c == 0 && long_idx == 0) bucket_bits = atoi(optarg); // --bucket-bits
else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname else if (c == 0 && long_idx == 2) keep_name = 0; // --int-rname
else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc else if (c == 0 && long_idx == 3) mm_dbg_flag |= MM_DBG_NO_KALLOC; // --no-kalloc
@ -194,14 +195,15 @@ int main(int argc, char *argv[])
fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max); fprintf(stderr, " -s INT minimal peak DP alignment score [%d]\n", opt.min_dp_max);
fprintf(stderr, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n"); fprintf(stderr, " -u CHAR how to find GT-AG. f:transcript strand, b:both strands, n:don't match GT-AG [n]\n");
fprintf(stderr, " Input/Output:\n"); fprintf(stderr, " Input/Output:\n");
fprintf(stderr, " -Q ignore base quality in the input\n");
fprintf(stderr, " -a output in the SAM format (PAF by default)\n"); fprintf(stderr, " -a output in the SAM format (PAF by default)\n");
fprintf(stderr, " -Q don't output base quality in SAM\n");
fprintf(stderr, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
fprintf(stderr, " -c output CIGAR in PAF\n"); fprintf(stderr, " -c output CIGAR in PAF\n");
fprintf(stderr, " -S output the cs tag in PAF\n"); fprintf(stderr, " -S output the cs tag in PAF (cs encodes both query and ref sequences)\n");
fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); fprintf(stderr, " -t INT number of threads [%d]\n", n_threads);
fprintf(stderr, " -K NUM minibatch size [200M]\n"); fprintf(stderr, " -K NUM minibatch size [200M]\n");
// fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); // fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose);
fprintf(stderr, " -V show version number\n"); fprintf(stderr, " --version show version number\n");
fprintf(stderr, " Preset:\n"); fprintf(stderr, " Preset:\n");
fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n");
fprintf(stderr, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n"); fprintf(stderr, " map10k/map-pb: -Hk19 (PacBio/ONT vs reference mapping)\n");

6
map.c
View File

@ -386,11 +386,7 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
if (pl.fp == 0) return -1; if (pl.fp == 0) return -1;
pl.opt = opt, pl.mi = idx; pl.opt = opt, pl.mi = idx;
pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size; pl.n_threads = n_threads, pl.mini_batch_size = mini_batch_size;
if (opt->flag & MM_F_OUT_SAM) { if (opt->flag & MM_F_OUT_SAM) sam_write_sam_header(idx);
uint32_t i;
for (i = 0; i < idx->n_seq; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", idx->seq[i].name, idx->seq[i].len);
}
kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3);
free(pl.str.s); free(pl.str.s);
mm_bseq_close(pl.fp); mm_bseq_close(pl.fp);

View File

@ -219,13 +219,18 @@ no attempt to match GT-AG [n]
Cost of non-canonical splicing sites [0]. Cost of non-canonical splicing sites [0].
.SS Input/output options .SS Input/output options
.TP 10 .TP 10
.B -Q
Ignore base quality in the input file.
.TP
.B -a .B -a
Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF Generate CIGAR and output alignments in the SAM format. Minimap2 outputs in PAF
by default. by default.
.TP .TP
.B -Q
Ignore base quality in the input file.
.TP
.BI -R \ STR
SAM read group line in a format like
.B @RG\\\\tID:foo\\\\tSM:bar
[].
.TP
.B -c .B -c
Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag. Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
.TP .TP

View File

@ -40,6 +40,8 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
void radix_sort_64(uint64_t *beg, uint64_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
void mm_set_rg(const char *s);
void sam_write_sam_header(const mm_idx_t *idx);
void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres); void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, void *km, int opt_flag, int intron_thres);
void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres); void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int n_regs, const mm_reg1_t *regs, int intron_thres);
int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km); int mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int64_t n, mm128_t *a, uint64_t **_u, void *km);