r520: added option -L to write long cigar to CG

This commit is contained in:
Heng Li 2017-10-17 17:32:44 -04:00
parent ffd953029f
commit 4683da2455
3 changed files with 51 additions and 13 deletions

View File

@ -43,6 +43,13 @@ static void mm_sprintf_lite(kstring_t *s, const char *fmt, ...)
if (c < 0) buf[l++] = '-';
str_enlarge(s, l);
for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
} else if (*p == 'u') {
int i, l = 0;
uint32_t x;
x = va_arg(ap, uint32_t);
do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
str_enlarge(s, l);
for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
} else if (*p == 's') {
char *r = va_arg(ap, char*);
str_copy(s, r, r + strlen(r));
@ -251,9 +258,35 @@ static inline const mm_reg1_t *get_sam_pri(int n_regs, const mm_reg1_t *regs)
return NULL;
}
static void write_sam_cigar(kstring_t *s, int sam_flag, int in_tag, int qlen, const mm_reg1_t *r)
{
if (r->p == 0) {
mm_sprintf_lite(s, "*");
} else {
uint32_t k, clip_len[2];
clip_len[0] = r->rev? qlen - r->qe : r->qs;
clip_len[1] = r->rev? r->qs : qlen - r->qe;
if (in_tag) {
int clip_char = (sam_flag&0x800)? 5 : 4;
mm_sprintf_lite(s, "\tCG:B:I");
if (clip_len[0]) mm_sprintf_lite(s, ",%u", clip_len[0]<<4|clip_char);
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, ",%u", r->p->cigar[k]);
if (clip_len[1]) mm_sprintf_lite(s, ",%u", clip_len[1]<<4|clip_char);
} else {
int clip_char = (sam_flag&0x800)? 'H' : 'S';
if (clip_len[0]) mm_sprintf_lite(s, "%d%c", clip_len[0], clip_char);
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
if (clip_len[1]) mm_sprintf_lite(s, "%d%c", clip_len[1], clip_char);
}
}
}
void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int opt_flag)
{
int flag, n_regs = n_regss[seg_idx];
const int max_bam_cigar_op = 65535;
int flag, n_regs = n_regss[seg_idx], cigar_in_tag = 0;
int this_rid = -1, this_pos = -1, this_rev = 0;
const mm_reg1_t *regs = regss[seg_idx], *r_prev = NULL, *r_next;
const mm_reg1_t *r = n_regs > 0 && reg_idx < n_regs && reg_idx >= 0? &regs[reg_idx] : NULL;
@ -306,15 +339,15 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
int mapq = !r->iden_flt? r->mapq : r->mapq < 3? r->mapq : 3;
this_rid = r->rid, this_pos = r->rs, this_rev = r->rev;
mm_sprintf_lite(s, "\t%s\t%d\t%d\t", mi->seq[r->rid].name, r->rs+1, mapq);
if (r->p) { // actually this should always be true for SAM output
uint32_t k, clip_len = r->rev? t->l_seq - r->qe : r->qs;
int clip_char = (flag&0x800)? 'H' : 'S';
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
for (k = 0; k < r->p->n_cigar; ++k)
mm_sprintf_lite(s, "%d%c", r->p->cigar[k]>>4, "MIDN"[r->p->cigar[k]&0xf]);
clip_len = r->rev? r->qs : t->l_seq - r->qe;
if (clip_len) mm_sprintf_lite(s, "%d%c", clip_len, clip_char);
} else mm_sprintf_lite(s, "*");
if ((opt_flag & MM_F_LONG_CIGAR) && r->p && r->p->n_cigar > max_bam_cigar_op - 2) {
int n_cigar = r->p->n_cigar;
if (r->qs != 0) ++n_cigar;
if (r->qe != t->l_seq) ++n_cigar;
if (n_cigar > max_bam_cigar_op)
cigar_in_tag = 1;
}
if (cigar_in_tag) mm_sprintf_lite(s, "%dS", t->l_seq);
else write_sam_cigar(s, flag, 0, t->l_seq, r);
}
// write mate positions
@ -394,6 +427,8 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
}
if (r->p && (opt_flag & MM_F_OUT_CS))
write_cs(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG));
if (cigar_in_tag)
write_sam_cigar(s, flag, 1, t->l_seq, r);
}
s->s[s->l] = 0; // we always have room for an extra byte (see str_enlarge)

8
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"
#define MM_VERSION "2.2-r519-dirty"
#define MM_VERSION "2.2-r520-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -65,7 +65,7 @@ static inline int64_t mm_parse_num(const char *str)
int main(int argc, char *argv[])
{
const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:";
const char *opt_str = "2aSw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:i:L";
mm_mapopt_t opt;
mm_idxopt_t ipt;
int i, c, n_threads = 3, long_idx, max_gap_ref = 0;
@ -108,6 +108,7 @@ int main(int argc, char *argv[])
else if (c == 'X') opt.flag |= MM_F_AVA | MM_F_NO_SELF;
else if (c == 'a') opt.flag |= MM_F_OUT_SAM | MM_F_CIGAR;
else if (c == 'Q') opt.flag |= MM_F_NO_QUAL;
else if (c == 'L') opt.flag |= MM_F_LONG_CIGAR;
else if (c == 'T') opt.sdust_thres = atoi(optarg);
else if (c == 'n') opt.min_cnt = atoi(optarg);
else if (c == 'm') opt.min_chain_score = atoi(optarg);
@ -206,7 +207,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " Mapping:\n");
fprintf(fp_help, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%g]\n", opt.mid_occ_frac);
fprintf(fp_help, " -g NUM stop chain enlongation if there are no minimizers in INT-bp [%d]\n", opt.max_gap);
fprintf(fp_help, " -G NUM max reference skip length [-xsplice:200k]\n");
fprintf(fp_help, " -G NUM max reference skip/intron length [-xsplice:200k]\n");
fprintf(fp_help, " -F NUM max fragment length in the fragment mode [-xsr:800]\n");
fprintf(fp_help, " -r NUM bandwidth used in chaining and DP-based alignment [%d]\n", opt.bw);
fprintf(fp_help, " -n INT minimal number of minimizers on a chain [%d]\n", opt.min_cnt);
@ -227,6 +228,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " Input/Output:\n");
fprintf(fp_help, " -a output in the SAM format (PAF by default)\n");
fprintf(fp_help, " -Q don't output base quality in SAM\n");
fprintf(fp_help, " -L write CIGAR with >65535 ops in the CG tag (compatible with future tools)\n");
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
fprintf(fp_help, " -c output CIGAR in PAF\n");
fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");

View File

@ -21,6 +21,7 @@
#define MM_F_FRAG_MODE 0x2000
#define MM_F_NO_PRINT_2ND 0x4000
#define MM_F_2_IO_THREADS 0x8000
#define MM_F_LONG_CIGAR 0x10000
#define MM_IDX_MAGIC "MMI\2"