diff --git a/bseq.c b/bseq.c index 6bf55f5..ec7dff3 100644 --- a/bseq.c +++ b/bseq.c @@ -62,7 +62,7 @@ static inline char *kstrdup(const kstring_t *s) return t; } -static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual) +static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual, int with_comment) { int i; s->name = kstrdup(&ks->name); @@ -71,10 +71,11 @@ static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual) if (s->seq[i] == 'u' || s->seq[i] == 'U') --s->seq[i]; s->qual = with_qual && ks->qual.l? kstrdup(&ks->qual) : 0; + s->comment = with_comment && ks->comment.l? kstrdup(&ks->comment) : 0; s->l_seq = ks->seq.l; } -mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int frag_mode, int *n_) +mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int chunk_size, int with_qual, int with_comment, int frag_mode, int *n_) { int64_t size = 0; kvec_t(mm_bseq1_t) a = {0,0,0}; @@ -91,12 +92,12 @@ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int assert(ks->seq.l <= INT32_MAX); if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256); kv_pushp(mm_bseq1_t, 0, a, &s); - kseq2bseq(ks, s, with_qual); + kseq2bseq(ks, s, with_qual, with_comment); size += s->l_seq; if (size >= chunk_size) { if (frag_mode && a.a[a.n-1].l_seq < CHECK_PAIR_THRES) { while (kseq_read(ks) >= 0) { - kseq2bseq(ks, &fp->s, with_qual); + kseq2bseq(ks, &fp->s, with_qual, with_comment); if (mm_qname_same(fp->s.name, a.a[a.n-1].name)) { kv_push(mm_bseq1_t, 0, a, fp->s); memset(&fp->s, 0, sizeof(mm_bseq1_t)); @@ -110,12 +111,17 @@ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int return a.a; } +mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int frag_mode, int *n_) +{ + return mm_bseq_read3(fp, chunk_size, with_qual, 0, frag_mode, n_); +} + mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int *n_) { return mm_bseq_read2(fp, chunk_size, with_qual, 0, n_); } -mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int *n_) +mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int with_comment, int *n_) { int i; int64_t size = 0; @@ -136,7 +142,7 @@ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int for (i = 0; i < n_fp; ++i) { mm_bseq1_t *s; kv_pushp(mm_bseq1_t, 0, a, &s); - kseq2bseq(fp[i]->ks, s, with_qual); + kseq2bseq(fp[i]->ks, s, with_qual, with_comment); size += s->l_seq; } if (size >= chunk_size) break; @@ -145,6 +151,11 @@ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int return a.a; } +mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int *n_) +{ + return mm_bseq_read_frag2(n_fp, fp, chunk_size, with_qual, 0, n_); +} + int mm_bseq_eof(mm_bseq_file_t *fp) { return (ks_eof(fp->ks->f) && fp->s.seq == 0); diff --git a/bseq.h b/bseq.h index 09ae3da..2f25d3a 100644 --- a/bseq.h +++ b/bseq.h @@ -13,13 +13,15 @@ typedef struct mm_bseq_file_s mm_bseq_file_t; typedef struct { int l_seq, rid; - char *name, *seq, *qual; + char *name, *seq, *qual, *comment; } mm_bseq1_t; mm_bseq_file_t *mm_bseq_open(const char *fn); void mm_bseq_close(mm_bseq_file_t *fp); +mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int chunk_size, int with_qual, int with_comment, int frag_mode, int *n_); mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int frag_mode, int *n_); mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int *n_); +mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int with_comment, int *n_); mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int *n_); int mm_bseq_eof(mm_bseq_file_t *fp); diff --git a/format.c b/format.c index 382dfd8..515a6b0 100644 --- a/format.c +++ b/format.c @@ -274,6 +274,8 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m } if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD))) write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD); + if ((opt_flag & MM_F_COPY_COMMENT) && t->comment) + mm_sprintf_lite(s, "\t%s", t->comment); } static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp) @@ -475,6 +477,9 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag); } + if ((opt_flag & MM_F_COPY_COMMENT) && t->comment) + mm_sprintf_lite(s, "\t%s", t->comment); + s->s[s->l] = 0; // we always have room for an extra byte (see str_enlarge) } diff --git a/main.c b/main.c index 44dae53..a23866a 100644 --- a/main.c +++ b/main.c @@ -10,7 +10,7 @@ #include "getopt.h" #endif -#define MM_VERSION "2.9-r751-dirty" +#define MM_VERSION "2.9-r752-dirty" #ifdef __linux__ #include @@ -94,7 +94,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:y"; mm_mapopt_t opt; mm_idxopt_t ipt; int i, c, n_threads = 3, long_idx; @@ -140,6 +140,7 @@ int main(int argc, char *argv[]) else if (c == 'Q') opt.flag |= MM_F_NO_QUAL; else if (c == 'Y') opt.flag |= MM_F_SOFTCLIP; else if (c == 'L') opt.flag |= MM_F_LONG_CIGAR; + else if (c == 'y') opt.flag |= MM_F_COPY_COMMENT; else if (c == 'T') opt.sdust_thres = atoi(optarg); else if (c == 'n') opt.min_cnt = atoi(optarg); else if (c == 'm') opt.min_chain_score = atoi(optarg); @@ -272,6 +273,7 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n"); fprintf(fp_help, " -c output CIGAR in PAF\n"); fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n"); + fprintf(fp_help, " --MD output the MD tag\n"); fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n"); fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads); fprintf(fp_help, " -K NUM minibatch size for mapping [500M]\n"); diff --git a/map.c b/map.c index 3146678..ebe4c61 100644 --- a/map.c +++ b/map.c @@ -442,11 +442,12 @@ static void *worker_pipeline(void *shared, int step, void *in) pipeline_t *p = (pipeline_t*)shared; if (step == 0) { // step 0: read sequences int with_qual = (!!(p->opt->flag & MM_F_OUT_SAM) && !(p->opt->flag & MM_F_NO_QUAL)); + int with_comment = !!(p->opt->flag & MM_F_COPY_COMMENT); int frag_mode = (p->n_fp > 1 || !!(p->opt->flag & MM_F_FRAG_MODE)); step_t *s; s = (step_t*)calloc(1, sizeof(step_t)); - if (p->n_fp > 1) s->seq = mm_bseq_read_frag(p->n_fp, p->fp, p->mini_batch_size, with_qual, &s->n_seq); - else s->seq = mm_bseq_read2(p->fp[0], p->mini_batch_size, with_qual, frag_mode, &s->n_seq); + if (p->n_fp > 1) s->seq = mm_bseq_read_frag2(p->n_fp, p->fp, p->mini_batch_size, with_qual, with_comment, &s->n_seq); + else s->seq = mm_bseq_read3(p->fp[0], p->mini_batch_size, with_qual, with_comment, frag_mode, &s->n_seq); if (s->seq) { s->p = p; for (i = 0; i < s->n_seq; ++i) diff --git a/minimap.h b/minimap.h index 399c5c4..7d416a1 100644 --- a/minimap.h +++ b/minimap.h @@ -30,6 +30,7 @@ #define MM_F_HEAP_SORT 0x400000 #define MM_F_ALL_CHAINS 0x800000 #define MM_F_OUT_MD 0x1000000 +#define MM_F_COPY_COMMENT 0x2000000 #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 diff --git a/minimap2.1 b/minimap2.1 index c37301e..0305b41 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -370,6 +370,9 @@ SAM read group line in a format like .B @RG\\\\tID:foo\\\\tSM:bar []. .TP +.B -y +Copy input FASTA/Q comments to output. +.TP .B -c Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag. .TP diff --git a/test/MT-orang.fa b/test/MT-orang.fa index ceb40ed..2919478 100644 --- a/test/MT-orang.fa +++ b/test/MT-orang.fa @@ -1,4 +1,4 @@ ->MT_orang +>MT_orang co:Z:comment GTTTATGTAGCTTATTCTATCCAAAGCAATGCACTGAAAATGTCTCGACGGGCCCACACG CCCCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTGAGGTTACACAT GCAAGCATCCCCGCCCCAGTGAGTCGCCCTCCAAGTCACTCTGACTAAGAGGAGCAAGCA