r752: option to copy comments to output (#136)
This commit is contained in:
parent
8766d286df
commit
08bd2123b6
23
bseq.c
23
bseq.c
|
|
@ -62,7 +62,7 @@ static inline char *kstrdup(const kstring_t *s)
|
|||
return t;
|
||||
}
|
||||
|
||||
static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual)
|
||||
static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual, int with_comment)
|
||||
{
|
||||
int i;
|
||||
s->name = kstrdup(&ks->name);
|
||||
|
|
@ -71,10 +71,11 @@ static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual)
|
|||
if (s->seq[i] == 'u' || s->seq[i] == 'U')
|
||||
--s->seq[i];
|
||||
s->qual = with_qual && ks->qual.l? kstrdup(&ks->qual) : 0;
|
||||
s->comment = with_comment && ks->comment.l? kstrdup(&ks->comment) : 0;
|
||||
s->l_seq = ks->seq.l;
|
||||
}
|
||||
|
||||
mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int frag_mode, int *n_)
|
||||
mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int chunk_size, int with_qual, int with_comment, int frag_mode, int *n_)
|
||||
{
|
||||
int64_t size = 0;
|
||||
kvec_t(mm_bseq1_t) a = {0,0,0};
|
||||
|
|
@ -91,12 +92,12 @@ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int
|
|||
assert(ks->seq.l <= INT32_MAX);
|
||||
if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256);
|
||||
kv_pushp(mm_bseq1_t, 0, a, &s);
|
||||
kseq2bseq(ks, s, with_qual);
|
||||
kseq2bseq(ks, s, with_qual, with_comment);
|
||||
size += s->l_seq;
|
||||
if (size >= chunk_size) {
|
||||
if (frag_mode && a.a[a.n-1].l_seq < CHECK_PAIR_THRES) {
|
||||
while (kseq_read(ks) >= 0) {
|
||||
kseq2bseq(ks, &fp->s, with_qual);
|
||||
kseq2bseq(ks, &fp->s, with_qual, with_comment);
|
||||
if (mm_qname_same(fp->s.name, a.a[a.n-1].name)) {
|
||||
kv_push(mm_bseq1_t, 0, a, fp->s);
|
||||
memset(&fp->s, 0, sizeof(mm_bseq1_t));
|
||||
|
|
@ -110,12 +111,17 @@ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int
|
|||
return a.a;
|
||||
}
|
||||
|
||||
mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int frag_mode, int *n_)
|
||||
{
|
||||
return mm_bseq_read3(fp, chunk_size, with_qual, 0, frag_mode, n_);
|
||||
}
|
||||
|
||||
mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int *n_)
|
||||
{
|
||||
return mm_bseq_read2(fp, chunk_size, with_qual, 0, n_);
|
||||
}
|
||||
|
||||
mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int *n_)
|
||||
mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int with_comment, int *n_)
|
||||
{
|
||||
int i;
|
||||
int64_t size = 0;
|
||||
|
|
@ -136,7 +142,7 @@ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int
|
|||
for (i = 0; i < n_fp; ++i) {
|
||||
mm_bseq1_t *s;
|
||||
kv_pushp(mm_bseq1_t, 0, a, &s);
|
||||
kseq2bseq(fp[i]->ks, s, with_qual);
|
||||
kseq2bseq(fp[i]->ks, s, with_qual, with_comment);
|
||||
size += s->l_seq;
|
||||
}
|
||||
if (size >= chunk_size) break;
|
||||
|
|
@ -145,6 +151,11 @@ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int
|
|||
return a.a;
|
||||
}
|
||||
|
||||
mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int *n_)
|
||||
{
|
||||
return mm_bseq_read_frag2(n_fp, fp, chunk_size, with_qual, 0, n_);
|
||||
}
|
||||
|
||||
int mm_bseq_eof(mm_bseq_file_t *fp)
|
||||
{
|
||||
return (ks_eof(fp->ks->f) && fp->s.seq == 0);
|
||||
|
|
|
|||
4
bseq.h
4
bseq.h
|
|
@ -13,13 +13,15 @@ typedef struct mm_bseq_file_s mm_bseq_file_t;
|
|||
|
||||
typedef struct {
|
||||
int l_seq, rid;
|
||||
char *name, *seq, *qual;
|
||||
char *name, *seq, *qual, *comment;
|
||||
} mm_bseq1_t;
|
||||
|
||||
mm_bseq_file_t *mm_bseq_open(const char *fn);
|
||||
void mm_bseq_close(mm_bseq_file_t *fp);
|
||||
mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int chunk_size, int with_qual, int with_comment, int frag_mode, int *n_);
|
||||
mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int frag_mode, int *n_);
|
||||
mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int *n_);
|
||||
mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int with_comment, int *n_);
|
||||
mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int *n_);
|
||||
int mm_bseq_eof(mm_bseq_file_t *fp);
|
||||
|
||||
|
|
|
|||
5
format.c
5
format.c
|
|
@ -274,6 +274,8 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
|
|||
}
|
||||
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
|
||||
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD);
|
||||
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
|
||||
mm_sprintf_lite(s, "\t%s", t->comment);
|
||||
}
|
||||
|
||||
static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
|
||||
|
|
@ -475,6 +477,9 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
|
|||
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
|
||||
}
|
||||
|
||||
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
|
||||
mm_sprintf_lite(s, "\t%s", t->comment);
|
||||
|
||||
s->s[s->l] = 0; // we always have room for an extra byte (see str_enlarge)
|
||||
}
|
||||
|
||||
|
|
|
|||
6
main.c
6
main.c
|
|
@ -10,7 +10,7 @@
|
|||
#include "getopt.h"
|
||||
#endif
|
||||
|
||||
#define MM_VERSION "2.9-r751-dirty"
|
||||
#define MM_VERSION "2.9-r752-dirty"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -94,7 +94,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha
|
|||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:";
|
||||
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:y";
|
||||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
int i, c, n_threads = 3, long_idx;
|
||||
|
|
@ -140,6 +140,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 'Q') opt.flag |= MM_F_NO_QUAL;
|
||||
else if (c == 'Y') opt.flag |= MM_F_SOFTCLIP;
|
||||
else if (c == 'L') opt.flag |= MM_F_LONG_CIGAR;
|
||||
else if (c == 'y') opt.flag |= MM_F_COPY_COMMENT;
|
||||
else if (c == 'T') opt.sdust_thres = atoi(optarg);
|
||||
else if (c == 'n') opt.min_cnt = atoi(optarg);
|
||||
else if (c == 'm') opt.min_chain_score = atoi(optarg);
|
||||
|
|
@ -272,6 +273,7 @@ int main(int argc, char *argv[])
|
|||
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
|
||||
fprintf(fp_help, " -c output CIGAR in PAF\n");
|
||||
fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
|
||||
fprintf(fp_help, " --MD output the MD tag\n");
|
||||
fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
|
||||
fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads);
|
||||
fprintf(fp_help, " -K NUM minibatch size for mapping [500M]\n");
|
||||
|
|
|
|||
5
map.c
5
map.c
|
|
@ -442,11 +442,12 @@ static void *worker_pipeline(void *shared, int step, void *in)
|
|||
pipeline_t *p = (pipeline_t*)shared;
|
||||
if (step == 0) { // step 0: read sequences
|
||||
int with_qual = (!!(p->opt->flag & MM_F_OUT_SAM) && !(p->opt->flag & MM_F_NO_QUAL));
|
||||
int with_comment = !!(p->opt->flag & MM_F_COPY_COMMENT);
|
||||
int frag_mode = (p->n_fp > 1 || !!(p->opt->flag & MM_F_FRAG_MODE));
|
||||
step_t *s;
|
||||
s = (step_t*)calloc(1, sizeof(step_t));
|
||||
if (p->n_fp > 1) s->seq = mm_bseq_read_frag(p->n_fp, p->fp, p->mini_batch_size, with_qual, &s->n_seq);
|
||||
else s->seq = mm_bseq_read2(p->fp[0], p->mini_batch_size, with_qual, frag_mode, &s->n_seq);
|
||||
if (p->n_fp > 1) s->seq = mm_bseq_read_frag2(p->n_fp, p->fp, p->mini_batch_size, with_qual, with_comment, &s->n_seq);
|
||||
else s->seq = mm_bseq_read3(p->fp[0], p->mini_batch_size, with_qual, with_comment, frag_mode, &s->n_seq);
|
||||
if (s->seq) {
|
||||
s->p = p;
|
||||
for (i = 0; i < s->n_seq; ++i)
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@
|
|||
#define MM_F_HEAP_SORT 0x400000
|
||||
#define MM_F_ALL_CHAINS 0x800000
|
||||
#define MM_F_OUT_MD 0x1000000
|
||||
#define MM_F_COPY_COMMENT 0x2000000
|
||||
|
||||
#define MM_I_HPC 0x1
|
||||
#define MM_I_NO_SEQ 0x2
|
||||
|
|
|
|||
|
|
@ -370,6 +370,9 @@ SAM read group line in a format like
|
|||
.B @RG\\\\tID:foo\\\\tSM:bar
|
||||
[].
|
||||
.TP
|
||||
.B -y
|
||||
Copy input FASTA/Q comments to output.
|
||||
.TP
|
||||
.B -c
|
||||
Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
|
||||
.TP
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
>MT_orang
|
||||
>MT_orang co:Z:comment
|
||||
GTTTATGTAGCTTATTCTATCCAAAGCAATGCACTGAAAATGTCTCGACGGGCCCACACG
|
||||
CCCCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTGAGGTTACACAT
|
||||
GCAAGCATCCCCGCCCCAGTGAGTCGCCCTCCAAGTCACTCTGACTAAGAGGAGCAAGCA
|
||||
|
|
|
|||
Loading…
Reference in New Issue