From 9735d7a31ae6f4dce5073e2769e0cbe3c74f06c2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 22 May 2013 19:45:16 -0400 Subject: [PATCH] conform to the latest (unpublished) SAM spec for chimeric alignments --- Makefile | 1 + bwa.c | 6 +----- bwamem.c | 55 ++++++++++++++++++++++++++++++------------------------- bwamem.h | 1 - fastmap.c | 2 -- 5 files changed, 32 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 85bb185..69b2ccd 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ CC= gcc +#CC= clang --analyze CFLAGS= -g -Wall -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar diff --git a/bwa.c b/bwa.c index 35d3e95..f8949f7 100644 --- a/bwa.c +++ b/bwa.c @@ -185,12 +185,8 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, } else abort(); // should not be here } free(cigar); - if (*qb == *qe || *rb == *re) { // TODO: this may happen in theory, but should be very very rare... - fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Sorry.\n", __func__); - exit(1); - } } - return 0; + return (*qb == *qe || *rb == *re)? -2 : 0; } /********************* diff --git a/bwamem.c b/bwamem.c index 5ee833b..2fdfd46 100644 --- a/bwamem.c +++ b/bwamem.c @@ -671,7 +671,9 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m kputw(p->mapq, str); kputc('\t', str); // MAPQ if (p->n_cigar) { // aligned for (i = 0; i < p->n_cigar; ++i) { - kputw(p->cigar[i]>>4, str); kputc("MIDSH"[p->cigar[i]&0xf], str); + int c = p->cigar[i]&0xf; + if (c == 3 || c == 4) c = which? 4 : 3; // use hard clipping for supplementary alignments + kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str); } } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true) } else kputsn("*\t0\t0\t*", 7, str); // without coordinte @@ -698,8 +700,8 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m } else if (!p->is_rev) { // the forward strand int i, qb = 0, qe = s->l_seq; if (p->n_cigar) { - if ((p->cigar[0]&0xf) == 4) qb += p->cigar[0]>>4; - if ((p->cigar[p->n_cigar-1]&0xf) == 4) qe -= p->cigar[p->n_cigar-1]>>4; + if (which && ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3)) qb += p->cigar[0]>>4; + if (which && ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3)) qe -= p->cigar[p->n_cigar-1]>>4; } ks_resize(str, str->l + (qe - qb) + 1); for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; @@ -730,23 +732,25 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } - for (i = 0; i < n; ++i) - if (i != which && !(list[i].flag&0x20000)) break; // 0x20000: shadowed multi hit - if (i < n) { // there are other primary hits; output them - kputsn("\tXP:Z:", 6, str); - for (i = 0; i < n; ++i) { - const mem_aln_t *r = &list[i]; - int k; - if (i == which || (list[i].flag&0x20000)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit - kputs(bns->anns[r->rid].name, str); kputc(',', str); - kputc("+-"[r->is_rev], str); - kputl(r->pos+1, str); kputc(',', str); - for (k = 0; k < r->n_cigar; ++k) { - kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); + if (!(p->flag & 0x20000)) { // not multi-hit + for (i = 0; i < n; ++i) + if (i != which && !(list[i].flag&0x20000)) break; // 0x20000: shadowed multi hit + if (i < n) { // there are other primary hits; output them + kputsn("\tSP:Z:", 6, str); + for (i = 0; i < n; ++i) { + const mem_aln_t *r = &list[i]; + int k; + if (i == which || (list[i].flag&0x20000)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit + kputs(bns->anns[r->rid].name, str); kputc(',', str); + kputl(r->pos+1, str); kputc(',', str); + kputc("+-"[r->is_rev], str); kputc(',', str); + for (k = 0; k < r->n_cigar; ++k) { + kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); + } + kputc(',', str); kputw(r->mapq, str); + kputc(',', str); kputw(r->NM, str); + kputc(';', str); } - kputc(',', str); kputw(r->mapq, str); - kputc(',', str); kputw(r->NM, str); - kputc(';', str); } } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } @@ -791,7 +795,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score - if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; + if (k && p->secondary < 0) // if supplementary + q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; } if (aa.n == 0) { // no alignments good enough; then write an unaligned record @@ -866,9 +871,9 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x20000; - if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { // unfixable cross-reference alignment - a.rid = -1; a.pos = -1; a.flag |= 0x4; - return a; + if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { + fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Abort. Sorry.\n", __func__); + exit(1); } w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); w2 = w2 < opt->w? w2 : opt->w; @@ -890,10 +895,10 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); if (clip5) { memmove(a.cigar+1, a.cigar, a.n_cigar * 4); - a.cigar[0] = clip5<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3); + a.cigar[0] = clip5<<4 | 3; ++a.n_cigar; } - if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3); + if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | 3; } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; diff --git a/bwamem.h b/bwamem.h index 76be8e3..be1862a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -11,7 +11,6 @@ struct __smem_i; typedef struct __smem_i smem_i; -#define MEM_F_HARDCLIP 0x1 #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 diff --git a/fastmap.c b/fastmap.c index 592df02..b00ec00 100644 --- a/fastmap.c +++ b/fastmap.c @@ -39,7 +39,6 @@ int main_mem(int argc, char *argv[]) else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; - else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; @@ -82,7 +81,6 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); - fprintf(stderr, " -H hard clipping\n"); fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n");