From 07921659cf28db08961545351ba188287c40c2b4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 09:38:12 -0500 Subject: [PATCH 01/17] move mem_fill_scmat() to bwa.{h,c} --- bwa.c | 11 +++++++++++ bwa.h | 1 + bwamem.c | 13 +------------ fastmap.c | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/bwa.c b/bwa.c index beea6d1..76b54ae 100644 --- a/bwa.c +++ b/bwa.c @@ -69,6 +69,17 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) * CIGAR related * *****************/ +void bwa_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} + // Generate CIGAR when the alignment end points are known uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) { diff --git a/bwa.h b/bwa.h index 81d40e0..9d5b2aa 100644 --- a/bwa.h +++ b/bwa.h @@ -30,6 +30,7 @@ extern "C" { bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + void bwa_fill_scmat(int a, int b, int8_t mat[25]); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); diff --git a/bwamem.c b/bwamem.c index a6897ba..cd3a1f5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -58,21 +58,10 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->max_matesw = 100; - mem_fill_scmat(o->a, o->b, o->mat); + bwa_fill_scmat(o->a, o->b, o->mat); return o; } -void mem_fill_scmat(int a, int b, int8_t mat[25]) -{ - int i, j, k; - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; -} - /*************************** * SMEM iterator interface * ***************************/ diff --git a/fastmap.c b/fastmap.c index 40c3a02..d4e5626 100644 --- a/fastmap.c +++ b/fastmap.c @@ -84,7 +84,7 @@ int main_mem(int argc, char *argv[]) return 1; } - mem_fill_scmat(opt->a, opt->b, opt->mat); + bwa_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak bwa_print_sam_hdr(idx->bns, rg_line); From 086c9d0e7dbba0519c598b819876c2b230780f32 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 09:54:49 -0500 Subject: [PATCH 02/17] bwa-sw: use bwa_gen_cigar() for cigar generation --- bwtsw2_aux.c | 56 +++++++++++----------------------------------------- 1 file changed, 11 insertions(+), 45 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index bc12d20..4455c5f 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -169,33 +169,22 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, } /* generate CIGAR array(s) in b->cigar[] */ -static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) +static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name) { - uint8_t *target; - int i, matrix[25]; - AlnParam par; - path_t *path; + int i; + int8_t mat[25]; - par.matrix = matrix; - __gen_ap(par, opt); - i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length - target = calloc(i, 1); - path = calloc(i + lq, sizeof(path_t)); - // generate CIGAR + bwa_fill_scmat(opt->a, opt->b, mat); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; bsw2aux_t *q = b->aux + i; uint8_t *query; - bwtint_t k; - int path_len, beg, end; + int beg, end, score; if (p->l) continue; beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 1 : 0] + beg; - for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here - target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; - aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); - q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); + q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm); #if 0 if (name && score != p->G) { // debugging only int j, glen = 0; @@ -206,7 +195,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); } #endif - if (beg != 0 || end < lq) { // write soft clipping + if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); if (beg != 0) { memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); @@ -219,7 +208,6 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 } } } - free(target); free(path); } /* this is for the debugging purpose only */ @@ -407,27 +395,6 @@ static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *c return n_cigar; } -static int compute_nm(bsw2hit_t *p, int n_cigar, const uint32_t *cigar, const uint8_t *pac, const uint8_t *seq) -{ - int k, x, n_mm = 0, i, n_gap = 0; - bwtint_t y; - x = 0; y = p->k; - for (k = 0; k < n_cigar; ++k) { - int op = cigar[k]&0xf; - int len = cigar[k]>>4; - if (op == 0) { // match - for (i = 0; i < len; ++i) { - int ref = pac[(y+i)>>2] >> (~(y+i)&3)*2 & 0x3; - if (seq[x + i] != ref) ++n_mm; - } - x += len; y += len; - } else if (op == 1) x += len, n_gap += len; - else if (op == 2) y += len, n_gap += len; - else if (op == 4) x += len; - } - return n_mm + n_gap; -} - static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) { int i; @@ -439,7 +406,7 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 } b->aux = calloc(b->n, sizeof(bsw2aux_t)); // generate CIGAR - gen_cigar(opt, qlen, seq, pac, b, name); + gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name); // fix CIGAR, generate mapQ, and write chromosomal position for (i = 0; i < b->n; ++i) { bsw2hit_t *p = &b->hits[i]; @@ -451,8 +418,6 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 int subo; // fix out-of-boundary CIGAR q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar); - // compute the NM tag - q->nm = compute_nm(p, q->n_cigar, q->cigar, pac, seq[p->is_rev]); // compute mapQ subo = p->G2 > opt->t? p->G2 : opt->t; if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; @@ -527,9 +492,10 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks bsw2aux_t *q = b->aux + i; int j, beg, end, type = 0; // print mandatory fields before SEQ + if (q->cigar == 0) q->flag |= 0x4; ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0)); ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); - if (p->l == 0) { // not a repetitive hit + if (p->l == 0 && q->cigar) { // not a repetitive hit ksprintf(&str, "\t%d\t", q->pqual); for (k = 0; k < q->n_cigar; ++k) ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); @@ -538,7 +504,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? "*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); // get the sequence begin and end beg = 0; end = ks->l; - if (opt->hard_clip) { + if (opt->hard_clip && q->cigar) { if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4; if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4; } From e6c262594fcb6c48aa4f1ced3f49827cfdb5543f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 10:12:38 -0500 Subject: [PATCH 03/17] bwa-sw: ditch stdaln --- bwtsw2_aux.c | 38 +++++++++++++++++--------------------- ksw.c | 2 +- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 4455c5f..6527495 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -11,9 +11,9 @@ #include "bwt_lite.h" #include "utils.h" #include "bwtsw2.h" -#include "stdaln.h" #include "kstring.h" #include "bwa.h" +#include "ksw.h" #include "kseq.h" KSEQ_DECLARE(gzFile) @@ -94,13 +94,12 @@ bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { - int i, matrix[25]; + int i; bwtint_t k; uint8_t *target = 0, *query; - AlnParam par; + int8_t mat[25]; - par.matrix = matrix; - __gen_ap(par, opt); + bwa_fill_scmat(opt->a, opt->b, mat); query = calloc(lq, 1); // sort according to the descending order of query end ks_introsort(hit, b->n, b->hits); @@ -111,8 +110,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int score, j; - path_t path; + int score, j, qle, tle; p->n_seeds = 1; if (p->l || p->k == 0) continue; for (j = score = 0; j < i; ++j) { @@ -127,12 +125,12 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem); + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0); if (score > p->G) { // extensible p->G = score; - p->len += path.i; - p->beg -= path.j; - p->k -= path.i; + p->k -= tle; + p->len += tle; + p->beg -= qle; } } free(query); free(target); @@ -140,29 +138,27 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { - int i, matrix[25]; + int i; bwtint_t k; uint8_t *target; - AlnParam par; - - par.matrix = matrix; - __gen_ap(par, opt); + int8_t mat[25]; + + bwa_fill_scmat(opt->a, opt->b, mat); target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int j, score; - path_t path; + int j, score, qle, tle; if (p->l) continue; for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem); + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0); // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; - p->len = path.i; - p->end = path.j + p->beg; + p->len = tle; + p->end = p->beg + qle; } } free(target); diff --git a/ksw.c b/ksw.c index 5666a8f..e331390 100644 --- a/ksw.c +++ b/ksw.c @@ -426,7 +426,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max_ie = gscore > h1? max_ie : i; gscore = gscore > h1? gscore : h1; } - if (m == 0 || max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop) break; // drop to zero, or below Z-dropoff + if (m == 0 || (zdrop > 0 && max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop)) break; // drop to zero, or below Z-dropoff if (m > max) { max = m, max_i = i, max_j = mj; max_off = max_off > abs(mj - i)? max_off : abs(mj - i); From bb37e14d02bea1eb13dfa78e4b25065429efec41 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 10:38:47 -0500 Subject: [PATCH 04/17] replace aln_global in bwase.c --- bwase.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/bwase.c b/bwase.c index 2dd783b..9e2696b 100644 --- a/bwase.c +++ b/bwase.c @@ -11,6 +11,7 @@ #include "utils.h" #include "kstring.h" #include "bwa.h" +#include "ksw.h" int g_log_n[256]; @@ -164,12 +165,13 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l int ext, int *n_cigar, int is_end_correct) { bwa_cigar_t *cigar = 0; + uint32_t *cigar32 = 0; ubyte_t *ref_seq; - int l = 0, path_len, ref_len; - AlnParam ap = aln_param_bwa; - path_t *path; + int l = 0, ref_len; int64_t k, __pos = *_pos; + int8_t mat[25]; + bwa_fill_scmat(1, 3, mat); ref_len = len + abs(ext); if (ext > 0) { ref_seq = (ubyte_t*)calloc(ref_len, 1); @@ -181,10 +183,11 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; } - path = (path_t*)calloc(l+len, sizeof(path_t)); - aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); - cigar = bwa_aln_path2cigar(path, path_len, n_cigar); + ksw_global(len, seq, l, ref_seq, 5, mat, 5, 1, 50, n_cigar, &cigar32); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand for (l = k = 0; k < *n_cigar; ++k) { @@ -206,7 +209,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0]))); *_pos = (bwtint_t)__pos; - free(ref_seq); free(path); + free(ref_seq); return cigar; } From 98f896675094c3bb12203717f29b45757e5fd056 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 12:00:24 -0500 Subject: [PATCH 05/17] r329: ditch stdaln.{c,h}; no changes to bwa-mem stdaln.{c,h} was written ten years ago. Its local and SW extension code are actually buggy (though that rarely happens and usually does not affect the results too much). ksw.{c,h} is more concise, potentially faster, less buggy, and richer in features. --- Makefile | 6 +- bwape.c | 41 ++- bwase.c | 6 +- bwtaln.c | 15 - bwtaln.h | 12 +- main.c | 2 +- stdaln.c | 1070 ------------------------------------------------------ stdaln.h | 162 --------- 8 files changed, 33 insertions(+), 1281 deletions(-) delete mode 100644 stdaln.c delete mode 100644 stdaln.h diff --git a/Makefile b/Makefile index 36f951f..96c3047 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o -AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ +AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o @@ -48,8 +48,8 @@ fastmap.o:bwt.h bwamem.h bwtaln.o:bwt.h bwtaln.h kseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h -bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h -bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h +bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h +bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h bwtsw2_main.o:bwtsw2.h clean: diff --git a/bwape.c b/bwape.c index 0b2b8d6..9fd12b1 100644 --- a/bwape.c +++ b/bwape.c @@ -8,9 +8,9 @@ #include "kvec.h" #include "bntseq.h" #include "utils.h" -#include "stdaln.h" #include "bwase.h" #include "bwa.h" +#include "ksw.h" typedef struct { int n; @@ -397,16 +397,17 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw #define SW_MIN_MAPQ 17 // cnt = n_mm<<16 | n_gapo<<8 | n_gape -bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, - int *n_cigar, uint32_t *_cnt) +bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt) { + kswr_t r; + uint32_t *cigar32 = 0; bwa_cigar_t *cigar = 0; ubyte_t *ref_seq; bwtint_t k, x, y, l; - int path_len, ret, subo; - AlnParam ap = aln_param_bwa; - path_t *path, *p; + int xtra; + int8_t mat[25]; + bwa_fill_scmat(1, 3, mat); // check whether there are too many N's if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; for (k = 0, x = 0; k < len; ++k) @@ -417,15 +418,19 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u ref_seq = (ubyte_t*)calloc(reglen, 1); for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - path = (path_t*)calloc(l+len, sizeof(path_t)); // do alignment - ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo); - if (ret < 0 || subo == ret) { // no hit or tandem hits - free(path); free(cigar); free(ref_seq); *n_cigar = 0; + xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0); + r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); + ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); + + if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score) { // poor hit or tandem hits + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } - cigar = bwa_aln_path2cigar(path, path_len, n_cigar); // check whether the alignment is good enough for (k = 0, x = y = 0; k < *n_cigar; ++k) { @@ -435,17 +440,14 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u else y += __cigar_len(c); } if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough - free(path); free(cigar); free(ref_seq); + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } { // update cigar and coordinate; - int start, end; - p = path + path_len - 1; - *beg += (p->i? p->i : 1) - 1; - start = (p->j? p->j : 1) - 1; - end = path->j; + int start = r.qb, end = r.qe + 1; + *beg += r.tb; cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); if (start) { memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); @@ -462,8 +464,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u { // set *cnt int n_mm, n_gapo, n_gape; n_mm = n_gapo = n_gape = 0; - p = path + path_len - 1; - x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0; + x = r.tb; y = r.qb; for (k = 0; k < *n_cigar; ++k) { bwa_cigar_t c = cigar[k]; if (__cigar_op(c) == FROM_M) { @@ -479,7 +480,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape; } - free(ref_seq); free(path); + free(ref_seq); return cigar; } diff --git a/bwase.c b/bwase.c index 9e2696b..eebe22b 100644 --- a/bwase.c +++ b/bwase.c @@ -4,7 +4,6 @@ #include #include #include -#include "stdaln.h" #include "bwase.h" #include "bwtaln.h" #include "bntseq.h" @@ -205,8 +204,8 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end // change "I" at either end of the read to S. just in case. This should rarely happen... - if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1]))); - if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0]))); + if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(FROM_S, (__cigar_len(cigar[*n_cigar-1]))); + if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(FROM_S, (__cigar_len(cigar[0]))); *_pos = (bwtint_t)__pos; free(ref_seq); @@ -589,5 +588,6 @@ int bwa_sai2sam_se(int argc, char *argv[]) return 0; } bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); + free(prefix); return 0; } diff --git a/bwtaln.c b/bwtaln.c index 96d4026..d132157 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -312,18 +312,3 @@ int bwa_aln(int argc, char *argv[]) free(opt); free(prefix); return 0; } - -/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, -__cigar_op and __cigar_len while keeping stdaln stand alone */ -bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar) -{ - uint32_t *cigar32; - bwa_cigar_t *cigar; - int i; - cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar); - cigar = (bwa_cigar_t*)cigar32; - for (i = 0; i < *n_cigar; ++i) - cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) ); - return cigar; -} - diff --git a/bwtaln.h b/bwtaln.h index 412cc04..556f259 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -28,6 +28,11 @@ #define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) #endif +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 +#define FROM_S 3 + typedef struct { bwtint_t w; int bid; @@ -138,13 +143,6 @@ extern "C" { void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac); - - /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, - __cigar_op and __cigar_len while keeping stdaln stand alone */ -#include "stdaln.h" - - bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar); - #ifdef __cplusplus } #endif diff --git a/main.c b/main.c index 8b49c8b..00a21b9 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r324-beta" +#define PACKAGE_VERSION "0.7.0-r329-beta" #endif static int usage() diff --git a/stdaln.c b/stdaln.c deleted file mode 100644 index cd064cf..0000000 --- a/stdaln.c +++ /dev/null @@ -1,1070 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, 2009, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include -#include -#include -#include -#include "stdaln.h" - -/* char -> 17 (=16+1) nucleotides */ -unsigned char aln_nt16_table[256] = { - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, - 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, - 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, - 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 -}; -char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-"; - -/* char -> 5 (=4+1) nucleotides */ -unsigned char aln_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; -char *aln_nt4_rev_table = "AGCTN-"; - -/* char -> 22 (=20+1+1) amino acids */ -unsigned char aln_aa_table[256] = { - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, - 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, - 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, - 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21 -}; -char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-"; - /* 01234567890123456789012 */ - -/* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. */ -unsigned char aln_trans_table_eu[66] = { - 11,11, 2, 2, 1, 1,15,15, 16,16,16,16, 9,12, 9, 9, - 6, 6, 3, 3, 7, 7, 7, 7, 0, 0, 0, 0, 19,19,19,19, - 5, 5, 8, 8, 1, 1, 1, 1, 14,14,14,14, 10,10,10,10, - 20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22 -}; -char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX"; - /* 01234567890123456789012345678901234567890123456789012345678901234 */ -int aln_sm_blosum62[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, - -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, - -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, - -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, - 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, - -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, - -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, - 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, - -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, - -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, - -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, - -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, - -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, - -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, - -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, - -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, - -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, - 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, - -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 -}; - -int aln_sm_blosum45[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0, - -2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1, - -1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1, - -2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1, - -1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2, - -1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1, - -1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1, - 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1, - -2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1, - -1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1, - -1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1, - -1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1, - -1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1, - -2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1, - -1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0, - -2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2, - -2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1, - 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1, - -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1 -}; - -int aln_sm_nt[] = { -/* X A G R C M S V T W K D Y H B N */ - -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2, - -2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0, - -2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0, - -2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0, - -2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0, - -2, 1,-2,-1, 1, 1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0, - -2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0, - -2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, - -2,-2,-2,-2,-1,-2,-2,-2, 2, 1, 1, 0, 1, 0, 0, 0, - -2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0, - -2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0, - -2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0, - -2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -int aln_sm_read[] = { -/* X A G R C M S V T W K D Y H B N */ - -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, - -17, 2,-17, 1,-17, 1,-17, 0,-17, 1,-17, 0,-17, 0,-17, 0, - -17,-17, 2, 1,-17,-17, 1, 0,-17,-17, 1, 0,-17,-17, 0, 0, - -17, 1, 1, 1,-17,-17,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17,-17,-17,-17, 2, 1, 1, 0,-17,-17,-17,-17, 1, 0, 0, 0, - -17, 1,-17,-17, 1, 1,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17,-17, 1,-17, 1,-17, 1, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17, 0, 0, 0, 0, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, - -17,-17,-17,-17,-17,-17,-17,-17, 2, 1, 1, 0, 1, 0, 0, 0, - -17, 1,-17,-17,-17,-17,-17, 0, 1, 1,-17, 0,-17, 0, 0, 0, - -17,-17, 1,-17,-17,-17,-17, 0, 1,-17, 1, 0,-17, 0, 0, 0, - -17, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17,-17,-17,-17, 1,-17,-17, 0, 1,-17,-17, 0, 1, 0, 0, 0, - -17, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -int aln_sm_hs[] = { -/* A G C T N */ - 91, -31,-114,-123, -44, - -31, 100,-125,-114, -42, - -123,-125, 100, -31, -42, - -114,-114, -31, 91, -42, - -44, -42, -42, -42, -43 -}; - -int aln_sm_maq[] = { - 11, -19, -19, -19, -13, - -19, 11, -19, -19, -13, - -19, -19, 11, -19, -13, - -19, -19, -19, 11, -13, - -13, -13, -13, -13, -13 -}; - -int aln_sm_blast[] = { - 1, -3, -3, -3, -2, - -3, 1, -3, -3, -2, - -3, -3, 1, -3, -2, - -3, -3, -3, 1, -2, - -2, -2, -2, -2, -2 -}; - -/********************/ -/* START OF align.c */ -/********************/ - -AlnParam aln_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }; -AlnParam aln_param_bwa = { 26, 9, 5, aln_sm_maq, 5, 50 }; -AlnParam aln_param_nt2nt = { 8, 2, 2, aln_sm_nt, 16, 75 }; -AlnParam aln_param_rd2rd = { 1, 19, 19, aln_sm_read, 16, 75 }; -AlnParam aln_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; - -AlnAln *aln_init_AlnAln() -{ - AlnAln *aa; - aa = (AlnAln*)malloc(sizeof(AlnAln)); - aa->path = 0; - aa->out1 = aa->out2 = aa->outm = 0; - aa->path_len = 0; - return aa; -} -void aln_free_AlnAln(AlnAln *aa) -{ - free(aa->path); free(aa->cigar32); - free(aa->out1); free(aa->out2); free(aa->outm); - free(aa); -} - -/***************************/ -/* START OF common_align.c */ -/***************************/ - -#define LOCAL_OVERFLOW_THRESHOLD 32000 -#define LOCAL_OVERFLOW_REDUCE 16000 -#define NT_LOCAL_SCORE int -#define NT_LOCAL_SHIFT 16 -#define NT_LOCAL_MASK 0xffff - -#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; - -#define set_M(MM, cur, p, sc) \ -{ \ - if ((p)->M >= (p)->I) { \ - if ((p)->M >= (p)->D) { \ - (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } else { \ - if ((p)->I > (p)->D) { \ - (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } \ -} -#define set_I(II, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_ext; \ - } \ -} -#define set_end_I(II, cur, p) \ -{ \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_end; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_end; \ - } \ - } else set_I(II, cur, p); \ -} -#define set_D(DD, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_ext; \ - } \ -} -#define set_end_D(DD, cur, p) \ -{ \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_end; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_end; \ - } \ - } else set_D(DD, cur, p); \ -} - -typedef struct -{ - unsigned char Mt:3, It:2, Dt:2; -} dpcell_t; - -typedef struct -{ - int M, I, D; -} dpscore_t; - -/* build score profile for accelerating alignment, in theory */ -void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array) -{ - int *tmp, *tmp2, i, k; - for (i = 0; i != row; ++i) { - tmp = score_matrix + i * row; - tmp2 = s_array[i]; - for (k = 0; k != len; ++k) - tmp2[k] = tmp[seq[k]]; - } -} -/*************************** - * banded global alignment * - ***************************/ -int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len) -{ - register int i, j; - dpcell_t **dpcell, *q; - dpscore_t *curr, *last, *s; - path_t *p; - int b1, b2, tmp_end; - int *mat, end, max; - unsigned char type, ctype; - - int gap_open, gap_ext, gap_end, b; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - gap_end = ap->gap_end; - b = ap->band_width; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (len1 == 0 || len2 == 0) { - *path_len = 0; - return 0; - } - /* calculate b1 and b2 */ - if (len1 > len2) { - b1 = len1 - len2 + b; - b2 = b; - } else { - b1 = b; - b2 = len2 - len1 + b; - } - if (b1 > len1) b1 = len1; - if (b2 > len2) b2 = len2; - --seq1; --seq2; - - /* allocate memory */ - end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); - dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); - for (j = 0; j <= len2; ++j) - dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] -= j - b2; - curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - - /* set first row */ - SET_INF(*curr); curr->M = 0; - for (i = 1, s = curr + 1; i < b1; ++i, ++s) { - SET_INF(*s); - set_end_D(s->D, dpcell[0] + i, s - 1); - } - s = curr; curr = last; last = s; - - /* core dynamic programming, part 1 */ - tmp_end = (b2 < len2)? b2 : len2 - 1; - for (j = 1; j <= tmp_end; ++j) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - /* last row for part 1, use set_end_D() instead of set_D() */ - if (j == len2 && b2 != len2 - 1) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_end_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - ++j; - } - - /* core dynamic programming, part 2 */ - for (; j <= len2 - b2 + 1; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - end = j + b1 - 1; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - - /* core dynamic programming, part 3 */ - for (; j < len2; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - /* last row */ - if (j == len2) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - - /* backtrace */ - i = len1; j = len2; - q = dpcell[j] + i; - s = last + len1; - max = s->M; type = q->Mt; ctype = FROM_M; - if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } - if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } - - p = path; - p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ - ++p; - do { - switch (ctype) { - case FROM_M: --i; --j; break; - case FROM_I: --j; break; - case FROM_D: --i; break; - } - q = dpcell[j] + i; - ctype = type; - switch (type) { - case FROM_M: type = q->Mt; break; - case FROM_I: type = q->It; break; - case FROM_D: type = q->Dt; break; - } - p->ctype = ctype; p->i = i; p->j = j; - ++p; - } while (i || j); - *path_len = p - path - 1; - - /* free memory */ - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] += j - b2; - for (j = 0; j <= len2; ++j) - free(dpcell[j]); - free(dpcell); - free(curr); free(last); - - return max; -} -/************************************************* - * local alignment combined with banded strategy * - *************************************************/ -int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int _thres, int *_subo) -{ - register NT_LOCAL_SCORE *s; - register int i; - int q, r, qr, tmp_len, qr_shift; - int **s_array, *score_array; - int e, f; - int is_overflow, of_base; - NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h; - int j, start_i, start_j, end_i, end_j; - path_t *p; - int score_f, score_r, score_g; - int start, end, max_score; - int thres, *suba, *ss; - - int gap_open, gap_ext; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - thres = _thres > 0? _thres : -_thres; - - if (len1 == 0 || len2 == 0) return -1; - - /* allocate memory */ - suba = (int*)malloc(sizeof(int) * (len2 + 1)); - eh = (NT_LOCAL_SCORE*)malloc(sizeof(NT_LOCAL_SCORE) * (len1 + 1)); - s_array = (int**)malloc(sizeof(int*) * N_MATRIX_ROW); - for (i = 0; i != N_MATRIX_ROW; ++i) - s_array[i] = (int*)malloc(sizeof(int) * len1); - /* initialization */ - aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - q = gap_open; - r = gap_ext; - qr = q + r; - qr_shift = (qr+1) << NT_LOCAL_SHIFT; - tmp_len = len1 + 1; - start_i = start_j = end_i = end_j = 0; - for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i) - if (max_score < score_matrix[i]) max_score = score_matrix[i]; - /* convert the coordinate */ - --seq1; --seq2; - for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; - - /* forward dynamic programming */ - for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0; - score_f = 0; - is_overflow = of_base = 0; - suba[0] = 0; - for (j = 1, ss = suba + 1; j <= len2; ++j, ++ss) { - int subo = 0; - last_h = f = 0; - score_array = s_array[seq2[j]]; - if (is_overflow) { /* adjust eh[] array if overflow occurs. */ - /* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed. - * If it is too large, this block will be excuted frequently and therefore - * slow down the whole program. - * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the - * number of assignments because it sets some cells to zero when overflow - * happens. */ - int tmp, tmp2; - score_f -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = 1, s = eh; i <= tmp_len; ++i, ++s) { - tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << NT_LOCAL_SHIFT) | tmp2; - } - } - for (i = 1, s = eh; i != tmp_len; ++i, ++s) { - /* prepare for calculate current h */ - curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; - if (curr_h < 0) curr_h = 0; - if (last_h > 0) { /* initialize f */ - f = (f > last_h - q)? f - r : last_h - qr; - if (curr_h < f) curr_h = f; - } - if (*(s+1) >= qr_shift) { /* initialize e */ - curr_last_h = *(s+1) >> NT_LOCAL_SHIFT; - e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; - if (curr_h < e) curr_h = e; - *s = (last_h << NT_LOCAL_SHIFT) | e; - } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */ - last_h = curr_h; - if (subo < curr_h) subo = curr_h; - if (score_f < curr_h) { - score_f = curr_h; end_i = i; end_j = j; - if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - *s = last_h << NT_LOCAL_SHIFT; - *ss = subo + of_base; - } - score_f += of_base; - - if (score_f < thres) { /* no matching residue at all, 090218 */ - if (path_len) *path_len = 0; - goto end_func; - } - if (path == 0) goto end_func; /* skip path-filling */ - - /* reverse dynamic programming */ - for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0; - if (end_i == 0 || end_j == 0) goto end_func; /* no local match */ - score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]]; - is_overflow = of_base = 0; - start_i = end_i; start_j = end_j; - eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */ - start = end_i - 1; - end = end_i - 3; - if (end <= 0) end = 0; - - /* second pass DP can be done in a band, speed will thus be enhanced */ - for (j = end_j - 1; j != 0; --j) { - last_h = f = 0; - score_array = s_array[seq2[j]]; - if (is_overflow) { /* adjust eh[] array if overflow occurs. */ - int tmp, tmp2; - score_r -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = start, s = eh + start + 1; i >= end; --i, --s) { - tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << NT_LOCAL_SHIFT) | tmp2; - } - } - for (i = start, s = eh + start + 1; i != end; --i, --s) { - /* prepare for calculate current h */ - curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; - if (curr_h < 0) curr_h = 0; - if (last_h > 0) { /* initialize f */ - f = (f > last_h - q)? f - r : last_h - qr; - if (curr_h < f) curr_h = f; - } - curr_last_h = *(s-1) >> NT_LOCAL_SHIFT; - e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; - if (e < 0) e = 0; - if (curr_h < e) curr_h = e; - *s = (last_h << NT_LOCAL_SHIFT) | e; - last_h = curr_h; - if (score_r < curr_h) { - score_r = curr_h; start_i = i; start_j = j; - if (score_r + of_base - qr == score_f) { - j = 1; break; - } - if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - *s = last_h << NT_LOCAL_SHIFT; - /* recalculate start and end, the boundaries of the band */ - if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start; - if (start <= 0) start = 0; - end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1; - if (end <= 0) end = 0; - } - - if (_subo) { - int tmp2 = 0, tmp = (int)(start_j - .33 * (end_j - start_j) + .499); - for (j = 1; j <= tmp; ++j) - if (tmp2 < suba[j]) tmp2 = suba[j]; - tmp = (int)(end_j + .33 * (end_j - start_j) + .499); - for (j = tmp; j <= len2; ++j) - if (tmp2 < suba[j]) tmp2 = suba[j]; - *_subo = tmp2; - } - - if (path_len == 0) { - path[0].i = start_i; path[0].j = start_j; - path[1].i = end_i; path[1].j = end_j; - goto end_func; - } - - score_r += of_base; - score_r -= qr; - -#ifdef DEBUG - /* this seems not a bug */ - if (score_f != score_r) - fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r); -#endif - - if (_thres > 0) { /* call global alignment to fill the path */ - score_g = 0; - j = (end_i - start_i > end_j - start_j)? end_i - start_i : end_j - start_j; - ++j; /* j is the maximum band_width */ - for (i = ap->band_width;; i <<= 1) { - AlnParam ap_real = *ap; - ap_real.gap_end = -1; - ap_real.band_width = i; - score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j, - end_j - start_j + 1, &ap_real, path, path_len); - if (score_g == score_r || score_f == score_g) break; - if (i > j) break; - } - if (score_r > score_g && score_f > score_g) { - fprintf(stderr, "[aln_local_core] Potential bug: (%d,%d) > %d\n", score_f, score_r, score_g); - score_f = score_r = -1; - } else score_f = score_g; - - /* convert coordinate */ - for (p = path + *path_len - 1; p >= path; --p) { - p->i += start_i - 1; - p->j += start_j - 1; - } - } else { /* just store the start and end */ - *path_len = 2; - path[1].i = start_i; path[1].j = start_j; - path->i = end_i; path->j = end_j; - } - -end_func: - /* free */ - free(eh); free(suba); - for (i = 0; i != N_MATRIX_ROW; ++i) { - ++s_array[i]; - free(s_array[i]); - } - free(s_array); - return score_f; -} -AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, - int type, int thres, int len1, int len2) -{ - unsigned char *seq11, *seq22; - int score; - int i, j, l; - path_t *p; - char *out1, *out2, *outm; - AlnAln *aa; - - if (len1 < 0) len1 = strlen(seq1); - if (len2 < 0) len2 = strlen(seq2); - - aa = aln_init_AlnAln(); - seq11 = (unsigned char*)malloc(sizeof(unsigned char) * len1); - seq22 = (unsigned char*)malloc(sizeof(unsigned char) * len2); - aa->path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 1)); - - if (ap->row < 10) { /* 4-nucleotide alignment */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_nt4_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_nt4_table[(int)seq2[j]]; - } else if (ap->row < 20) { /* 16-nucleotide alignment */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_nt16_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_nt16_table[(int)seq2[j]]; - } else { /* amino acids */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_aa_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_aa_table[(int)seq2[j]]; - } - - if (type == ALN_TYPE_GLOBAL) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len); - else if (type == ALN_TYPE_LOCAL) score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, thres, &aa->subo); - else if (type == ALN_TYPE_EXTEND) score = aln_extend_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, 1, 0); - else { - free(seq11); free(seq22); free(aa->path); - aln_free_AlnAln(aa); - return 0; - } - aa->score = score; - - if (thres > 0) { - out1 = aa->out1 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - out2 = aa->out2 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - outm = aa->outm = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - - --seq1; --seq2; - --seq11; --seq22; - - p = aa->path + aa->path_len - 1; - - for (l = 0; p >= aa->path; --p, ++l) { - switch (p->ctype) { - case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j]; - outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' '; - break; - case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break; - case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break; - } - } - out1[l] = out2[l] = outm[l] = '\0'; - ++seq11; ++seq22; - } - - free(seq11); - free(seq22); - - p = aa->path + aa->path_len - 1; - aa->start1 = p->i? p->i : 1; - aa->end1 = aa->path->i; - aa->start2 = p->j? p->j : 1; - aa->end2 = aa->path->j; - aa->cigar32 = aln_path2cigar32(aa->path, aa->path_len, &aa->n_cigar); - - return aa; -} -AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int thres) -{ - return aln_stdaln_aux(seq1, seq2, ap, type, thres, -1, -1); -} - -/* for backward compatibility */ -uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) -{ - uint32_t *cigar32; - uint16_t *cigar; - int i; - cigar32 = aln_path2cigar32(path, path_len, n_cigar); - cigar = (uint16_t*)cigar32; - for (i = 0; i < *n_cigar; ++i) - cigar[i] = (cigar32[i]&0xf)<<14 | (cigar32[i]>>4&0x3fff); - return cigar; -} - -/* newly added functions (2009-07-21) */ - -int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int G0, uint8_t *_mem) -{ - int q, r, qr; - int32_t **s_array, *score_array; - int is_overflow, of_base; - uint32_t *eh; - int i, j, end_i, end_j; - int score, start, end; - int *score_matrix, N_MATRIX_ROW; - uint8_t *mem, *_p; - - /* initialize some align-related parameters. just for compatibility */ - q = ap->gap_open; - r = ap->gap_ext; - qr = q + r; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (len1 == 0 || len2 == 0) return -1; - - /* allocate memory */ - mem = _mem? _mem : calloc((len1 + 2) * (N_MATRIX_ROW + 1), 4); - _p = mem; - eh = (uint32_t*)_p, _p += 4 * (len1 + 2); - s_array = calloc(N_MATRIX_ROW, sizeof(void*)); - for (i = 0; i != N_MATRIX_ROW; ++i) - s_array[i] = (int32_t*)_p, _p += 4 * len1; - /* initialization */ - aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - start = 1; end = 2; - end_i = end_j = 0; - score = 0; - is_overflow = of_base = 0; - /* convert the coordinate */ - --seq1; --seq2; - for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; - - /* dynamic programming */ - memset(eh, 0, 4 * (len1 + 2)); - eh[1] = (uint32_t)G0<<16; - for (j = 1; j <= len2; ++j) { - int _start, _end; - int h1 = 0, f = 0; - score_array = s_array[seq2[j]]; - /* set start and end */ - _start = j - ap->band_width; - if (_start < 1) _start = 1; - if (_start > start) start = _start; - _end = j + ap->band_width; - if (_end > len1 + 1) _end = len1 + 1; - if (_end < end) end = _end; - if (start == end) break; - /* adjust eh[] array if overflow occurs. */ - if (is_overflow) { - int tmp, tmp2; - score -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = start; i <= end; ++i) { - uint32_t *s = &eh[i]; - tmp = *s >> 16; tmp2 = *s & 0xffff; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << 16) | tmp2; - } - } - _start = _end = 0; - /* the inner loop */ - for (i = start; i < end; ++i) { - /* At the beginning of each cycle: - eh[i] -> h[j-1,i-1]<<16 | e[j,i] - f -> f[j,i] - h1 -> h[j,i-1] - */ - uint32_t *s = &eh[i]; - int h = (int)(*s >> 16); - int e = *s & 0xffff; /* this is e[j,i] */ - *s = (uint32_t)h1 << 16; /* eh[i] now stores h[j,i-1]<<16 */ - h += h? score_array[i] : 0; /* this is left_core() specific */ - /* calculate h[j,i]; don't need to test 0, as {e,f}>=0 */ - h = h > e? h : e; - h = h > f? h : f; /* h now is h[j,i] */ - h1 = h; - if (h > 0) { - if (_start == 0) _start = i; - _end = i; - if (score < h) { - score = h; end_i = i; end_j = j; - if (score > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - /* calculate e[j+1,i] and f[j,i+1] */ - h -= qr; - h = h > 0? h : 0; - e -= r; - e = e > h? e : h; - f -= r; - f = f > h? f : h; - *s |= e; - } - eh[end] = h1 << 16; - /* recalculate start and end, the boundaries of the band */ - if (_end <= 0) break; /* no cell in this row has a positive score */ - start = _start; - end = _end + 3; - } - - score += of_base - 1; - if (score <= 0) { - if (path_len) *path_len = 0; - goto end_left_func; - } - - if (path == 0) goto end_left_func; - - if (path_len == 0) { - path[0].i = end_i; path[0].j = end_j; - goto end_left_func; - } - - { /* call global alignment to fill the path */ - int score_g = 0; - j = (end_i - 1 > end_j - 1)? end_i - 1 : end_j - 1; - ++j; /* j is the maximum band_width */ - for (i = ap->band_width;; i <<= 1) { - AlnParam ap_real = *ap; - ap_real.gap_end = -1; - ap_real.band_width = i; - score_g = aln_global_core(seq1 + 1, end_i, seq2 + 1, end_j, &ap_real, path, path_len); - if (score == score_g) break; - if (i > j) break; - } - if (score > score_g) - fprintf(stderr, "[aln_left_core] no suitable bandwidth: %d < %d\n", score_g, score); - score = score_g; - } - -end_left_func: - /* free */ - free(s_array); - if (!_mem) free(mem); - return score; -} - -uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar) -{ - int i, n; - uint32_t *cigar; - unsigned char last_type; - - if (path_len == 0 || path == 0) { - *n_cigar = 0; - return 0; - } - - last_type = path->ctype; - for (i = n = 1; i < path_len; ++i) { - if (last_type != path[i].ctype) ++n; - last_type = path[i].ctype; - } - *n_cigar = n; - cigar = (uint32_t*)malloc(*n_cigar * 4); - - cigar[0] = 1u << 4 | path[path_len-1].ctype; - last_type = path[path_len-1].ctype; - for (i = path_len - 2, n = 0; i >= 0; --i) { - if (path[i].ctype == last_type) cigar[n] += 1u << 4; - else { - cigar[++n] = 1u << 4 | path[i].ctype; - last_type = path[i].ctype; - } - } - - return cigar; -} - -#ifdef STDALN_MAIN -int main() -{ - AlnAln *aln_local, *aln_global, *aln_left; - int i; - - aln_local = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCgCTCGGCTAGCTGT", &aln_param_blast, 0, 1); - aln_global = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 1, 1); -// aln_left = aln_stdaln( "GATGCACTGCATACGGCTCGCCTAGATCA", "GATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 2, 1); - aln_left = aln_stdaln("CACCTTCGACTCACGTCTCATTCTCGGAGTCGAGTGGACGGTCCCTCATACACGAACAGGTTC", - "CACCTTCGACTTTCACCTCTCATTCTCGGACTCGAGTGGACGGTCCCTCATCCAAGAACAGGGTCTGTGAAA", &aln_param_blast, 2, 1); - - printf(">%d,%d\t%d,%d\n", aln_local->start1, aln_local->end1, aln_local->start2, aln_local->end2); - printf("%s\n%s\n%s\n", aln_local->out1, aln_local->outm, aln_local->out2); - - printf(">%d,%d\t%d,%d\t", aln_global->start1, aln_global->end1, aln_global->start2, aln_global->end2); - for (i = 0; i != aln_global->n_cigar; ++i) - printf("%d%c", aln_global->cigar32[i]>>4, "MID"[aln_global->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aln_global->out1, aln_global->outm, aln_global->out2); - - printf(">%d\t%d,%d\t%d,%d\t", aln_left->score, aln_left->start1, aln_left->end1, aln_left->start2, aln_left->end2); - for (i = 0; i != aln_left->n_cigar; ++i) - printf("%d%c", aln_left->cigar32[i]>>4, "MID"[aln_left->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aln_left->out1, aln_left->outm, aln_left->out2); - - aln_free_AlnAln(aln_local); - aln_free_AlnAln(aln_global); - aln_free_AlnAln(aln_left); - return 0; -} -#endif diff --git a/stdaln.h b/stdaln.h deleted file mode 100644 index f0048b3..0000000 --- a/stdaln.h +++ /dev/null @@ -1,162 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - 2009-07-23, 0.10.0 - - - Use 32-bit to store CIGAR - - - Report suboptimal aligments - - - Implemented half-fixed-half-open DP - - 2009-04-26, 0.9.10 - - - Allow to set a threshold for local alignment - - 2009-02-18, 0.9.9 - - - Fixed a bug when no residue matches - - 2008-08-04, 0.9.8 - - - Fixed the wrong declaration of aln_stdaln_aux() - - - Avoid 0 coordinate for global alignment - - 2008-08-01, 0.9.7 - - - Change gap_end penalty to 5 in aln_param_bwa - - - Add function to convert path_t to the CIGAR format - - 2008-08-01, 0.9.6 - - - The first gap now costs (gap_open+gap_ext), instead of - gap_open. Scoring systems are modified accordingly. - - - Gap end is now correctly handled. Previously it is not correct. - - - Change license to MIT. - - */ - -#ifndef LH3_STDALN_H_ -#define LH3_STDALN_H_ - - -#define STDALN_VERSION 0.11.0 - -#include - -#define FROM_M 0 -#define FROM_I 1 -#define FROM_D 2 -#define FROM_S 3 - -#define ALN_TYPE_LOCAL 0 -#define ALN_TYPE_GLOBAL 1 -#define ALN_TYPE_EXTEND 2 - -/* This is the smallest integer. It might be CPU-dependent in very RARE cases. */ -#define MINOR_INF -1073741823 - -typedef struct -{ - int gap_open; - int gap_ext; - int gap_end; - - int *matrix; - int row; - int band_width; -} AlnParam; - -typedef struct -{ - int i, j; - unsigned char ctype; -} path_t; - -typedef struct -{ - path_t *path; /* for advanced users... :-) */ - int path_len; /* for advanced users... :-) */ - int start1, end1; /* start and end of the first sequence, coordinations are 1-based */ - int start2, end2; /* start and end of the second sequence, coordinations are 1-based */ - int score, subo; /* score */ - - char *out1, *out2; /* print them, and then you will know */ - char *outm; - - int n_cigar; - uint32_t *cigar32; -} AlnAln; - -#ifdef __cplusplus -extern "C" { -#endif - - AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, - int type, int do_align, int len1, int len2); - AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int do_align); - void aln_free_AlnAln(AlnAln *aa); - - int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len); - int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int _thres, int *_subo); - int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int G0, uint8_t *_mem); - uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar); - uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar); - -#ifdef __cplusplus -} -#endif - -/******************** - * global variables * - ********************/ - -extern AlnParam aln_param_bwa; /* = { 37, 9, 0, aln_sm_maq, 5, 50 }; */ -extern AlnParam aln_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ -extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */ -extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; */ -extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; */ - -/* common nucleotide score matrix for 16 bases */ -extern int aln_sm_nt[], aln_sm_bwa[]; - -/* BLOSUM62 and BLOSUM45 */ -extern int aln_sm_blosum62[], aln_sm_blosum45[]; - -/* common read for 16 bases. note that read alignment is quite different from common nucleotide alignment */ -extern int aln_sm_read[]; - -/* human-mouse score matrix for 4 bases */ -extern int aln_sm_hs[]; - -#endif From 25366c72206df5066f268845c03c1437f8e55dfd Mon Sep 17 00:00:00 2001 From: Jon Sorenson Date: Tue, 5 Mar 2013 20:48:16 +0000 Subject: [PATCH 06/17] Fixing problem with linking to libm on some Ubuntu systems (I see this on machine running 11.04, kernel 3.0.0-14-virtual). Changing order of -lm on the command line seems to do the trick and should be tolerated in other environments. --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 96c3047..bd4ee22 100644 --- a/Makefile +++ b/Makefile @@ -23,10 +23,10 @@ SUBDIRS= . all:$(PROG) bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) bwamem-lite:libbwa.a example.o - $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS) libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) From 6476343a8310ea4886ac2e35702818955bde1c81 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 19:56:37 -0500 Subject: [PATCH 07/17] r331: rewrote CIGAR generation for bwa-short When backtracking, bwa-short does not keep the detailed alignment or the exact start and end positions. To find the boundary and the CIGAR, the old code does a global alignment with a small end-gap penalty. It then deals with a lot of special cases to derive the right position and CIGAR, which are actually not always right. It is a mess. As the new ksw.{c,h} does not support a different end-gap penalty, the old strategy does not work. But we get something better. The new code finds the boundaries with ksw_extend(). It is cleaner and gives more accurate CIGAR in most cases. --- bwase.c | 89 ++++++++++++++++++++++++++++----------------------------- main.c | 2 +- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/bwase.c b/bwase.c index eebe22b..83e4baa 100644 --- a/bwase.c +++ b/bwase.c @@ -156,59 +156,58 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se bwt_destroy(bwt); } -/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on - * forward strand. This happens when p->pos is calculated by - * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct - * coordinate. This happens only for color-converted alignment. */ -bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, - int ext, int *n_cigar, int is_end_correct) +#define SW_BW 50 + +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, bwtint_t *_pos, int *n_cigar, int is_rev) { bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; - ubyte_t *ref_seq; - int l = 0, ref_len; - int64_t k, __pos = *_pos; + ubyte_t *rseq; + int tle, qle, gtle, gscore, lscore; + int64_t k, rb, re, rlen; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); - ref_len = len + abs(ext); - if (ext > 0) { - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - } else { - int64_t x = __pos + (is_end_correct? len : ref_len); - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + if (!is_rev) { // forward strand, the end position is correct + re = *_pos + len; + if (re > l_pac) re = l_pac; + rb = re - (len + SW_BW); + if (rb < 0) rb = 0; + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences + seq_reverse(rlen, rseq, 0); + lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + if (gscore > 0) tle = gtle, qle = len; + rb = re - tle; rlen = tle; + seq_reverse(len, seq, 0); + seq_reverse(rlen, rseq, 0); + ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); + if (qle < len) { // write soft clip + cigar = realloc(cigar, (*n_cigar + 1) * 4); + memmove(cigar + 1, cigar, *n_cigar * 4); + cigar[0] = (len - qle)<<4 | FROM_S; + ++(*n_cigar); + } + } else { // reverse strand, the start position is correct + rb = *_pos; re = rb + len + SW_BW; + if (re > l_pac) re = l_pac; + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + if (gscore > 0) tle = gtle, qle = len; + re = rb + tle; rlen = tle; + ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension + if (qle < len) { + cigar = realloc(cigar, (*n_cigar + 1) * 4); + cigar[*n_cigar - 1] = (len - qle)<<4 | FROM_S; + ++(*n_cigar); + } } + *_pos = rb; - ksw_global(len, seq, l, ref_seq, 5, mat, 5, 1, 50, n_cigar, &cigar32); cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); - - if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand - for (l = k = 0; k < *n_cigar; ++k) { - if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]); - else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]); - } - __pos += l; - } - - if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end - __pos += __cigar_len(cigar[0]); - for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1]; - --(*n_cigar); - } - if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end - - // change "I" at either end of the read to S. just in case. This should rarely happen... - if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(FROM_S, (__cigar_len(cigar[*n_cigar-1]))); - if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(FROM_S, (__cigar_len(cigar[0]))); - - *_pos = (bwtint_t)__pos; - free(ref_seq); + free(rseq); return cigar; } @@ -316,13 +315,11 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t bwt_multi1_t *q = s->multi + j; int n_cigar; if (q->gap == 0) continue; - q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand? 1 : -1) * q->gap, &n_cigar, 1); + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, &n_cigar, q->strand); q->n_cigar = n_cigar; } if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, &s->n_cigar, s->strand); } // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); diff --git a/main.c b/main.c index 00a21b9..0954e01 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r329-beta" +#define PACKAGE_VERSION "0.7.0-r331-beta" #endif static int usage() From 5fbd4546829d552eec52b1e45ea9ea86b7d5419c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 22:49:38 -0500 Subject: [PATCH 08/17] r332: added output threshold Otherwise there are far too many short hits --- bwamem.c | 4 +++- bwamem.h | 1 + fastmap.c | 4 +++- main.c | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index cd3a1f5..7a6d175 100644 --- a/bwamem.c +++ b/bwamem.c @@ -44,6 +44,7 @@ mem_opt_t *mem_opt_init() o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->T = 30; o->zdrop = 100; o->pen_unpaired = 9; o->pen_clip = 5; @@ -742,11 +743,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b int k; kstring_t str; str.l = str.m = 0; str.s = 0; - if (a->n > 0) { + if (a->n > 0 && a->a[0].score >= opt->T) { int mapq0 = -1; for (k = 0; k < a->n; ++k) { bwahit_t h; mem_alnreg_t *p = &a->a[k]; + if (p->score < opt->T) continue; if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; mem_alnreg2hit(p, &h); diff --git a/bwamem.h b/bwamem.h index 96a3308..14f04d5 100644 --- a/bwamem.h +++ b/bwamem.h @@ -24,6 +24,7 @@ typedef struct { int w; // band width int zdrop; // Z-dropoff + int T; // output score threshold; only affecting output int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor diff --git a/fastmap.c b/fastmap.c index d4e5626..e6c2b1e 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); else if (c == 'E') opt->r = atoi(optarg); + else if (c == 'T') opt->T = atoi(optarg); else if (c == 'L') opt->pen_clip = atoi(optarg); else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; @@ -74,6 +75,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -H hard clipping\n"); diff --git a/main.c b/main.c index 0954e01..c342fd4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r331-beta" +#define PACKAGE_VERSION "0.7.0-r332-beta" #endif static int usage() From 773b86331b2a207337b519fb6eb82d5e70d6bda3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Mar 2013 19:23:45 -0500 Subject: [PATCH 09/17] De-overlap paired-end reads --- pemerge.c | 267 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 pemerge.c diff --git a/pemerge.c b/pemerge.c new file mode 100644 index 0000000..90d53a3 --- /dev/null +++ b/pemerge.c @@ -0,0 +1,267 @@ +#include +#include +#include +#include +#include +#include "ksw.h" +#include "kseq.h" + +#ifdef _PEM_MAIN +KSEQ_INIT(gzFile, gzread) + +unsigned char nst_nt4_table[128] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +void bwa_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} +#else +#include "bwa.h" +KSEQ_DECLARE(gzFile) +#endif + +#define MAX_SCORE_RATIO 0.9f +#define MAX_ERR 8 + +static const char *err_msg[MAX_ERR+1] = { + "successful merges", + "low-scoring pairs", + "pairs where the best SW alignment is not an overlap (long left end)", + "pairs where the best SW alignment is not an overlap (long right end)", + "pairs with large 2nd best SW score", + "pairs with gapped overlap", + "pairs where the end-to-end ungapped alignment score is not high enough", + "pairs potentially with tandem overlaps", + "pairs with high sum of errors" +}; + +typedef struct { + kstring_t n, s, q; +} mseq_t; + +typedef struct { + int a, b, q, r, w; + int q_def, q_thres; + int T; + int8_t mat[25]; +} pem_opt_t; + +pem_opt_t *pem_opt_init() +{ + pem_opt_t *opt; + opt = calloc(1, sizeof(pem_opt_t)); + opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20; + opt->T = opt->a * 10; + opt->q_def = 20; + opt->q_thres = 70; + bwa_fill_scmat(opt->a, opt->b, opt->mat); + return opt; +} + +int bwa_pemerge(const pem_opt_t *opt, mseq_t x[2], uint8_t **seq_, uint8_t **qual_) +{ + uint8_t *s[2], *q[2], *seq, *qual; + int i, xtra, l, l_seq, sum_q; + kswr_t r; + + *seq_ = *qual_ = 0; + s[0] = malloc(x[0].s.l); q[0] = malloc(x[0].s.l); + s[1] = malloc(x[1].s.l); q[1] = malloc(x[1].s.l); + for (i = 0; i < x[0].s.l; ++i) { + int c = x[0].s.s[i]; + s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c]; + q[0][i] = x[0].q.l? x[0].q.s[i] - 33 : opt->q_def; + } + for (i = 0; i < x[1].s.l; ++i) { + int c = x[1].s.s[x[1].s.l - 1 - i]; + c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c]; + s[1][i] = c < 4? 3 - c : 4; + q[1][i] = x[1].q.l? x[1].q.s[x[1].q.l - 1 - i] - 33 : opt->q_def; + } + + xtra = KSW_XSTART | KSW_XSUBO; + r = ksw_align(x[1].s.l, s[1], x[0].s.l, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); + ++r.qe; ++r.te; // change to the half-close-half-open coordinates + + if (r.score < opt->T) return -1; // poor alignment + if (r.tb < r.qb) return -2; // no enough space for the left end + if (x[0].s.l - r.te > x[1].s.l - r.qe) return -3; // no enough space for the right end + if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) return -4; // the second best score is too large + if (r.qe - r.qb != r.te - r.tb) return -5; // we do not allow gaps + + { // test tandem match; O(n^2) + int max_m, max_m2, min_l, max_l, max_l2; + max_m = max_m2 = 0; max_l = max_l2 = 0; + min_l = x[0].s.l < x[1].s.l? x[0].s.l : x[1].s.l; + for (l = 0; l < min_l; ++l) { + int m = 0, o = x[0].s.l - l; + int a = opt->a, b = -opt->b; + for (i = 0; i < l; ++i) + m += (s[0][o + i] == s[1][i])? a : b; + if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; + else if (m > max_m2) max_m2 = m, max_l2 = l; + } + if (max_m < opt->T) return -6; + if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) + return -7; + //printf("*** %d,%d; %d,%d\n", max_m, max_m2, max_l, max_l2); + } + + l = x[0].s.l - (r.tb - r.qb); // length to merge + l_seq = x[0].s.l + x[1].s.l - l; + seq = malloc(l_seq + 1); + qual = malloc(l_seq + 1); + memcpy(seq, s[0], x[0].s.l); memcpy(seq + x[0].s.l, &s[1][l], x[1].s.l - l); + memcpy(qual, q[0], x[0].s.l); memcpy(qual + x[0].s.l, &q[1][l], x[1].s.l - l); + for (i = 0, sum_q = 0; i < l; ++i) { + int k = x[0].s.l - l + i; + if (s[0][k] == 4) { // ambiguous + seq[k] = s[1][i]; + qual[k] = q[1][i]; + } else if (s[1][i] == 4) { // do nothing + } else if (s[0][k] == s[1][i]) { + qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i]; + } else { // s[0][k] != s[1][i] and neither is N + int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i]; + sum_q += qq >= 3? qq<<1 : 1; + seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i]; + qual[k] = abs((int)q[0][k] - (int)q[1][i]); + } + } + if (sum_q>>1 > opt->q_thres) { // too many mismatches + free(seq); free(qual); + return -8; + } + + for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33; + seq[l_seq] = qual[l_seq] = 0; + *seq_ = seq, *qual_ = qual; + return l_seq; +} + +static inline void kstrcpy(kstring_t *dst, const kstring_t *src) +{ + dst->l = 0; + if (src->l == 0) return; + if (dst->m < src->l + 1) { + dst->m = src->l + 2; + kroundup32(dst->m); + dst->s = realloc(dst->s, dst->m); + } + dst->l = src->l; + memcpy(dst->s, src->s, src->l + 1); +} + +static inline void kseq2mseq(mseq_t *ms, const kseq_t *ks) +{ + kstrcpy(&ms->n, &ks->name); + kstrcpy(&ms->s, &ks->seq); + kstrcpy(&ms->q, &ks->qual); +} + +static inline void print_seq(const char *n, const char *s, const char *q) +{ + putchar(q? '@' : '>'); + puts(n); puts(s); + if (q) { + puts("+"); puts(q); + } +} + +#ifdef _PEM_MAIN +int main(int argc, char *argv[]) +#else +int main_pemerge(int argc, char *argv[]) +#endif +{ + int c, flag = 0, i; + int64_t cnt[MAX_ERR+1]; + gzFile fp, fp2 = 0; + kseq_t *ks, *ks2 = 0; + mseq_t r[2]; + pem_opt_t *opt; + + opt = pem_opt_init(); + while ((c = getopt(argc, argv, "muQ:")) >= 0) { + if (c == 'm') flag |= 1; + else if (c == 'u') flag |= 2; + else if (c == 'Q') opt->q_thres = atoi(optarg); + } + if (flag == 0) flag = 3; + + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa pemerge [-mu] [read2.fq]\n\n"); + fprintf(stderr, "Options: -m output merged reads only\n"); + fprintf(stderr, " -u output unmerged reads only\n"); + fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres); + fprintf(stderr, "\n"); + free(opt); + return 1; + } + + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + ks = kseq_init(fp); + if (optind + 1 < argc) { + fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r"); + ks2 = kseq_init(fp2); + } + + memset(r, 0, sizeof(mseq_t)<<1); + memset(cnt, 0, 8 * (MAX_ERR+1)); + while (kseq_read(ks) >= 0) { + uint8_t *seq, *qual; + int l_seq; + kseq2mseq(&r[0], ks); + if (ks2) { + if (kseq_read(ks2) < 0) break; + kseq2mseq(&r[1], ks2); + } else { + if (kseq_read(ks) < 0) break; + kseq2mseq(&r[1], ks); + } + l_seq = bwa_pemerge(opt, r, &seq, &qual); + if (l_seq > 0) { + ++cnt[0]; + if (flag & 1) { + if (r[0].n.l > 2 && (r[0].n.s[r[0].n.l-1] == '1' || r[0].n.s[r[0].n.l-1] == '2') && r[0].n.s[r[0].n.l-2] == '/') + r[0].n.s[r[0].n.l-2] = 0, r[0].n.l -= 2; + print_seq(r[0].n.s, (char*)seq, (char*)qual); + } + } else { + ++cnt[-l_seq]; + if (flag & 2) { + printf("*** %d\n", l_seq); + print_seq(r[0].n.s, r[0].s.s, r[0].q.l? r[0].q.s : 0); + print_seq(r[1].n.s, r[1].s.s, r[1].q.l? r[1].q.s : 0); + } + } + } + + fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); + for (i = 1; i <= MAX_ERR; ++i) + fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]); + kseq_destroy(ks); + gzclose(fp); + if (ks2) { + kseq_destroy(ks2); + gzclose(fp2); + } + free(opt); + return 0; +} From 042e1f4442af7ddea66bc669c37e7bec9d2ded40 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Mar 2013 21:55:02 -0500 Subject: [PATCH 10/17] r334: added pemerge to bwa --- Makefile | 2 +- bntseq.c | 1 - main.c | 27 +++++++++++++++++++++++---- main.h | 28 ---------------------------- pemerge.c | 32 -------------------------------- 5 files changed, 24 insertions(+), 66 deletions(-) delete mode 100644 main.h diff --git a/Makefile b/Makefile index bd4ee22..93e3266 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtindex.o bwape.o kopen.o \ + is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa bwamem-lite diff --git a/bntseq.c b/bntseq.c index 540e966..29b3b12 100644 --- a/bntseq.c +++ b/bntseq.c @@ -31,7 +31,6 @@ #include #include #include "bntseq.h" -#include "main.h" #include "utils.h" #include "kseq.h" diff --git a/main.c b/main.c index c342fd4..26f22ae 100644 --- a/main.c +++ b/main.c @@ -1,12 +1,29 @@ #include #include -#include "main.h" #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r332-beta" +#define PACKAGE_VERSION "0.7.0-r334-beta" #endif +int bwa_fa2pac(int argc, char *argv[]); +int bwa_pac2bwt(int argc, char *argv[]); +int bwa_bwtupdate(int argc, char *argv[]); +int bwa_bwt2sa(int argc, char *argv[]); +int bwa_index(int argc, char *argv[]); +int bwt_bwtgen_main(int argc, char *argv[]); + +int bwa_aln(int argc, char *argv[]); +int bwa_sai2sam_se(int argc, char *argv[]); +int bwa_sai2sam_pe(int argc, char *argv[]); + +int bwa_bwtsw2(int argc, char *argv[]); + +int main_fastmap(int argc, char *argv[]); +int main_mem(int argc, char *argv[]); + +int main_pemerge(int argc, char *argv[]); + static int usage() { fprintf(stderr, "\n"); @@ -15,12 +32,13 @@ static int usage() fprintf(stderr, "Contact: Heng Li \n\n"); fprintf(stderr, "Usage: bwa [options]\n\n"); fprintf(stderr, "Command: index index sequences in the FASTA format\n"); + fprintf(stderr, " mem BWA-MEM algorithm\n"); + fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, " pemerge merge overlapping paired ends\n"); fprintf(stderr, " aln gapped/ungapped alignment\n"); fprintf(stderr, " samse generate alignment (single ended)\n"); fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); - fprintf(stderr, " fastmap identify super-maximal exact matches\n"); - fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); @@ -56,6 +74,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); + else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/main.h b/main.h deleted file mode 100644 index 3e70362..0000000 --- a/main.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef BWA_MAIN_H -#define BWA_MAIN_H - -#ifdef __cplusplus -extern "C" { -#endif - - int bwa_fa2pac(int argc, char *argv[]); - int bwa_pac2bwt(int argc, char *argv[]); - int bwa_bwtupdate(int argc, char *argv[]); - int bwa_bwt2sa(int argc, char *argv[]); - int bwa_index(int argc, char *argv[]); - int bwa_aln(int argc, char *argv[]); - int bwt_bwtgen_main(int argc, char *argv[]); - - int bwa_sai2sam_se(int argc, char *argv[]); - int bwa_sai2sam_pe(int argc, char *argv[]); - - int bwa_bwtsw2(int argc, char *argv[]); - - int main_fastmap(int argc, char *argv[]); - int main_mem(int argc, char *argv[]); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/pemerge.c b/pemerge.c index 90d53a3..6756187 100644 --- a/pemerge.c +++ b/pemerge.c @@ -5,35 +5,8 @@ #include #include "ksw.h" #include "kseq.h" - -#ifdef _PEM_MAIN -KSEQ_INIT(gzFile, gzread) - -unsigned char nst_nt4_table[128] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -void bwa_fill_scmat(int a, int b, int8_t mat[25]) -{ - int i, j, k; - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; -} -#else #include "bwa.h" KSEQ_DECLARE(gzFile) -#endif #define MAX_SCORE_RATIO 0.9f #define MAX_ERR 8 @@ -183,11 +156,7 @@ static inline void print_seq(const char *n, const char *s, const char *q) } } -#ifdef _PEM_MAIN -int main(int argc, char *argv[]) -#else int main_pemerge(int argc, char *argv[]) -#endif { int c, flag = 0, i; int64_t cnt[MAX_ERR+1]; @@ -246,7 +215,6 @@ int main_pemerge(int argc, char *argv[]) } else { ++cnt[-l_seq]; if (flag & 2) { - printf("*** %d\n", l_seq); print_seq(r[0].n.s, r[0].s.s, r[0].q.l? r[0].q.s : 0); print_seq(r[1].n.s, r[1].s.s, r[1].q.l? r[1].q.s : 0); } From 557d50c7e1d47ceff556847be52e9d9f51fa84a5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Mar 2013 21:57:13 -0500 Subject: [PATCH 11/17] r335: fixed a compiling error Caused by the last change --- bwtindex.c | 1 - main.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bwtindex.c b/bwtindex.c index 298153d..934b382 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -33,7 +33,6 @@ #include #include "bntseq.h" #include "bwt.h" -#include "main.h" #include "utils.h" #ifdef _DIVBWT diff --git a/main.c b/main.c index 26f22ae..2663902 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r334-beta" +#define PACKAGE_VERSION "0.7.0-r335-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 72817b664e0fdab8a2f4837dea6c7a0a5f58eae5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Mar 2013 23:38:07 -0500 Subject: [PATCH 12/17] r336: fine tuning pemerge --- bwase.c | 6 +++--- main.c | 2 +- pemerge.c | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bwase.c b/bwase.c index 83e4baa..fd06c7d 100644 --- a/bwase.c +++ b/bwase.c @@ -163,7 +163,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; ubyte_t *rseq; - int tle, qle, gtle, gscore, lscore; + int tle, qle, gtle, gscore; int64_t k, rb, re, rlen; int8_t mat[25]; @@ -176,7 +176,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences seq_reverse(rlen, rseq, 0); - lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; rb = re - tle; rlen = tle; seq_reverse(len, seq, 0); @@ -192,7 +192,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rb = *_pos; re = rb + len + SW_BW; if (re > l_pac) re = l_pac; rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; re = rb + tle; rlen = tle; ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension diff --git a/main.c b/main.c index 2663902..d9a87f1 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r335-beta" +#define PACKAGE_VERSION "0.7.0-r336-beta" #endif int bwa_fa2pac(int argc, char *argv[]); diff --git a/pemerge.c b/pemerge.c index 6756187..5f5e37e 100644 --- a/pemerge.c +++ b/pemerge.c @@ -18,7 +18,7 @@ static const char *err_msg[MAX_ERR+1] = { "pairs where the best SW alignment is not an overlap (long right end)", "pairs with large 2nd best SW score", "pairs with gapped overlap", - "pairs where the end-to-end ungapped alignment score is not high enough", + "pairs where the end-to-end alignment is inconsistent with SW", "pairs potentially with tandem overlaps", "pairs with high sum of errors" }; @@ -81,18 +81,18 @@ int bwa_pemerge(const pem_opt_t *opt, mseq_t x[2], uint8_t **seq_, uint8_t **qua int max_m, max_m2, min_l, max_l, max_l2; max_m = max_m2 = 0; max_l = max_l2 = 0; min_l = x[0].s.l < x[1].s.l? x[0].s.l : x[1].s.l; - for (l = 0; l < min_l; ++l) { + for (l = 1; l < min_l; ++l) { int m = 0, o = x[0].s.l - l; - int a = opt->a, b = -opt->b; - for (i = 0; i < l; ++i) - m += (s[0][o + i] == s[1][i])? a : b; + uint8_t *s0o = &s[0][o], *s1 = s[1]; + for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck! + m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i] if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; else if (m > max_m2) max_m2 = m, max_l2 = l; } - if (max_m < opt->T) return -6; + if (max_m < opt->T || max_l != x[0].s.l - (r.tb - r.qb)) return -6; if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) return -7; - //printf("*** %d,%d; %d,%d\n", max_m, max_m2, max_l, max_l2); + if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) return -7; } l = x[0].s.l - (r.tb - r.qb); // length to merge From 3e3236dfc400b1157726af61468914f5ebcbdc8a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 11:00:15 -0500 Subject: [PATCH 13/17] r337: mem - always read even number of reads In the old code, we may read odd number of reads from an interleaved fastq. --- bwa.c | 2 +- fastmap.c | 5 +++++ main.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bwa.c b/bwa.c index 76b54ae..991b23a 100644 --- a/bwa.c +++ b/bwa.c @@ -55,7 +55,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) kseq2bseq1(ks2, &seqs[n]); size += seqs[n++].l_seq; } - if (size >= chunk_size) break; + if (size >= chunk_size && (n&1) == 0) break; } if (size == 0) { // test if the 2nd file is finished if (ks2 && kseq_read(ks2) >= 0) diff --git a/fastmap.c b/fastmap.c index e6c2b1e..9204399 100644 --- a/fastmap.c +++ b/fastmap.c @@ -106,6 +106,11 @@ int main_mem(int argc, char *argv[]) } while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; + if ((opt->flag & MEM_F_PE) && (n&1) == 1) { + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__); + n = n>>1<<1; + } if (!copy_comment) for (i = 0; i < n; ++i) { free(seqs[i].comment); seqs[i].comment = 0; diff --git a/main.c b/main.c index d9a87f1..f7b3a03 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r336-beta" +#define PACKAGE_VERSION "0.7.0-r337-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 1cadfa15520ae611c8324ee8528a28dea1f33f8d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 11:14:52 -0500 Subject: [PATCH 14/17] r338: pemerge - fixed memory leaks; multithreading pemerge is actually quite slow. --- main.c | 2 +- pemerge.c | 193 +++++++++++++++++++++++++++++++----------------------- 2 files changed, 112 insertions(+), 83 deletions(-) diff --git a/main.c b/main.c index f7b3a03..31bba4b 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r337-beta" +#define PACKAGE_VERSION "0.7.0-r338-beta" #endif int bwa_fa2pac(int argc, char *argv[]); diff --git a/pemerge.c b/pemerge.c index 5f5e37e..696137d 100644 --- a/pemerge.c +++ b/pemerge.c @@ -3,8 +3,10 @@ #include #include #include +#include #include "ksw.h" #include "kseq.h" +#include "kstring.h" #include "bwa.h" KSEQ_DECLARE(gzFile) @@ -23,14 +25,13 @@ static const char *err_msg[MAX_ERR+1] = { "pairs with high sum of errors" }; -typedef struct { - kstring_t n, s, q; -} mseq_t; - typedef struct { int a, b, q, r, w; int q_def, q_thres; int T; + int chunk_size; + int n_threads; + int flag; // bit 1: print merged; 2: print unmerged int8_t mat[25]; } pem_opt_t; @@ -42,67 +43,70 @@ pem_opt_t *pem_opt_init() opt->T = opt->a * 10; opt->q_def = 20; opt->q_thres = 70; + opt->chunk_size = 10000000; + opt->n_threads = 1; + opt->flag = 3; bwa_fill_scmat(opt->a, opt->b, opt->mat); return opt; } -int bwa_pemerge(const pem_opt_t *opt, mseq_t x[2], uint8_t **seq_, uint8_t **qual_) +int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2]) { uint8_t *s[2], *q[2], *seq, *qual; - int i, xtra, l, l_seq, sum_q; + int i, xtra, l, l_seq, sum_q, ret = 0; kswr_t r; - *seq_ = *qual_ = 0; - s[0] = malloc(x[0].s.l); q[0] = malloc(x[0].s.l); - s[1] = malloc(x[1].s.l); q[1] = malloc(x[1].s.l); - for (i = 0; i < x[0].s.l; ++i) { - int c = x[0].s.s[i]; + s[0] = malloc(x[0].l_seq); q[0] = malloc(x[0].l_seq); + s[1] = malloc(x[1].l_seq); q[1] = malloc(x[1].l_seq); + for (i = 0; i < x[0].l_seq; ++i) { + int c = x[0].seq[i]; s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c]; - q[0][i] = x[0].q.l? x[0].q.s[i] - 33 : opt->q_def; + q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def; } - for (i = 0; i < x[1].s.l; ++i) { - int c = x[1].s.s[x[1].s.l - 1 - i]; + for (i = 0; i < x[1].l_seq; ++i) { + int c = x[1].seq[x[1].l_seq - 1 - i]; c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c]; s[1][i] = c < 4? 3 - c : 4; - q[1][i] = x[1].q.l? x[1].q.s[x[1].q.l - 1 - i] - 33 : opt->q_def; + q[1][i] = x[1].qual? x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def; } xtra = KSW_XSTART | KSW_XSUBO; - r = ksw_align(x[1].s.l, s[1], x[0].s.l, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); + r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); ++r.qe; ++r.te; // change to the half-close-half-open coordinates - if (r.score < opt->T) return -1; // poor alignment - if (r.tb < r.qb) return -2; // no enough space for the left end - if (x[0].s.l - r.te > x[1].s.l - r.qe) return -3; // no enough space for the right end - if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) return -4; // the second best score is too large - if (r.qe - r.qb != r.te - r.tb) return -5; // we do not allow gaps + if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment + if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end + if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end + if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large + if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps { // test tandem match; O(n^2) int max_m, max_m2, min_l, max_l, max_l2; max_m = max_m2 = 0; max_l = max_l2 = 0; - min_l = x[0].s.l < x[1].s.l? x[0].s.l : x[1].s.l; + min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq; for (l = 1; l < min_l; ++l) { - int m = 0, o = x[0].s.l - l; + int m = 0, o = x[0].l_seq - l; uint8_t *s0o = &s[0][o], *s1 = s[1]; for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck! m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i] if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; else if (m > max_m2) max_m2 = m, max_l2 = l; } - if (max_m < opt->T || max_l != x[0].s.l - (r.tb - r.qb)) return -6; - if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) - return -7; - if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) return -7; + if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; } + if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) { + ret = -7; goto pem_ret; + } + if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; } } - l = x[0].s.l - (r.tb - r.qb); // length to merge - l_seq = x[0].s.l + x[1].s.l - l; + l = x[0].l_seq - (r.tb - r.qb); // length to merge + l_seq = x[0].l_seq + x[1].l_seq - l; seq = malloc(l_seq + 1); qual = malloc(l_seq + 1); - memcpy(seq, s[0], x[0].s.l); memcpy(seq + x[0].s.l, &s[1][l], x[1].s.l - l); - memcpy(qual, q[0], x[0].s.l); memcpy(qual + x[0].s.l, &q[1][l], x[1].s.l - l); + memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l); + memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l); for (i = 0, sum_q = 0; i < l; ++i) { - int k = x[0].s.l - l + i; + int k = x[0].l_seq - l + i; if (s[0][k] == 4) { // ambiguous seq[k] = s[1][i]; qual[k] = q[1][i]; @@ -118,51 +122,99 @@ int bwa_pemerge(const pem_opt_t *opt, mseq_t x[2], uint8_t **seq_, uint8_t **qua } if (sum_q>>1 > opt->q_thres) { // too many mismatches free(seq); free(qual); - return -8; + ret = -8; goto pem_ret; } for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33; seq[l_seq] = qual[l_seq] = 0; - *seq_ = seq, *qual_ = qual; - return l_seq; + + free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment); + memset(&x[1], 0, sizeof(bseq1_t)); + free(x[0].seq); free(x[0].qual); + x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual; + +pem_ret: + free(s[0]); free(s[1]); free(q[0]); free(q[1]); + return ret; } -static inline void kstrcpy(kstring_t *dst, const kstring_t *src) +static inline void print_bseq(const bseq1_t *s, int rn) { - dst->l = 0; - if (src->l == 0) return; - if (dst->m < src->l + 1) { - dst->m = src->l + 2; - kroundup32(dst->m); - dst->s = realloc(dst->s, dst->m); + putchar(s->qual? '@' : '>'); + fputs(s->name, stdout); + if (rn == 1 || rn == 2) { + putchar('/'); putchar('0' + rn); putchar('\n'); + } else puts(" merged"); + puts(s->seq); + if (s->qual) { + puts("+"); puts(s->qual); } - dst->l = src->l; - memcpy(dst->s, src->s, src->l + 1); } -static inline void kseq2mseq(mseq_t *ms, const kseq_t *ks) +typedef struct { + int n, start; + bseq1_t *seqs; + int64_t cnt[MAX_ERR+1]; + const pem_opt_t *opt; +} worker_t; + +void *worker(void *data) { - kstrcpy(&ms->n, &ks->name); - kstrcpy(&ms->s, &ks->seq); - kstrcpy(&ms->q, &ks->qual); + worker_t *w = (worker_t*)data; + int i; + for (i = w->start; i < w->n>>1; i += w->opt->n_threads) + ++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])]; + return 0; } -static inline void print_seq(const char *n, const char *s, const char *q) +static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1]) { - putchar(q? '@' : '>'); - puts(n); puts(s); - if (q) { - puts("+"); puts(q); + int i, j, n = n_>>1<<1; + worker_t *w; + + w = calloc(opt->n_threads, sizeof(worker_t)); + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + p->start = i; p->n = n; + p->opt = opt; + p->seqs = seqs; + } + if (opt->n_threads == 1) { + worker(w); + } else { + pthread_t *tid; + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + free(tid); + } + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j]; + } + free(w); + for (i = 0; i < n>>1; ++i) { + if (seqs[i<<1|1].l_seq != 0) { + if (opt->flag&2) { + print_bseq(&seqs[i<<1|0], 1); + print_bseq(&seqs[i<<1|1], 2); + } + } else if (opt->flag&1) + print_bseq(&seqs[i<<1|0], 0); + } + for (i = 0; i < n; ++i) { + bseq1_t *s = &seqs[i]; + free(s->name); free(s->seq); free(s->qual); free(s->comment); } } int main_pemerge(int argc, char *argv[]) { - int c, flag = 0, i; + int c, flag = 0, i, n; int64_t cnt[MAX_ERR+1]; + bseq1_t *bseq; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; - mseq_t r[2]; pem_opt_t *opt; opt = pem_opt_init(); @@ -172,6 +224,7 @@ int main_pemerge(int argc, char *argv[]) else if (c == 'Q') opt->q_thres = atoi(optarg); } if (flag == 0) flag = 3; + opt->flag = flag; if (optind == argc) { fprintf(stderr, "\n"); @@ -191,34 +244,10 @@ int main_pemerge(int argc, char *argv[]) ks2 = kseq_init(fp2); } - memset(r, 0, sizeof(mseq_t)<<1); memset(cnt, 0, 8 * (MAX_ERR+1)); - while (kseq_read(ks) >= 0) { - uint8_t *seq, *qual; - int l_seq; - kseq2mseq(&r[0], ks); - if (ks2) { - if (kseq_read(ks2) < 0) break; - kseq2mseq(&r[1], ks2); - } else { - if (kseq_read(ks) < 0) break; - kseq2mseq(&r[1], ks); - } - l_seq = bwa_pemerge(opt, r, &seq, &qual); - if (l_seq > 0) { - ++cnt[0]; - if (flag & 1) { - if (r[0].n.l > 2 && (r[0].n.s[r[0].n.l-1] == '1' || r[0].n.s[r[0].n.l-1] == '2') && r[0].n.s[r[0].n.l-2] == '/') - r[0].n.s[r[0].n.l-2] = 0, r[0].n.l -= 2; - print_seq(r[0].n.s, (char*)seq, (char*)qual); - } - } else { - ++cnt[-l_seq]; - if (flag & 2) { - print_seq(r[0].n.s, r[0].s.s, r[0].q.l? r[0].q.s : 0); - print_seq(r[1].n.s, r[1].s.s, r[1].q.l? r[1].q.s : 0); - } - } + while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + process_seqs(opt, n, bseq, cnt); + free(bseq); } fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); From 503ca9ed2e9d82727c38ed587c368905da43cc66 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 11:22:19 -0500 Subject: [PATCH 15/17] r339: pemerge - expose some settings to CLI --- main.c | 2 +- pemerge.c | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/main.c b/main.c index 31bba4b..b06ce8f 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r338-beta" +#define PACKAGE_VERSION "0.7.0-r339-beta" #endif int bwa_fa2pac(int argc, char *argv[]); diff --git a/pemerge.c b/pemerge.c index 696137d..e37567a 100644 --- a/pemerge.c +++ b/pemerge.c @@ -210,7 +210,7 @@ static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cn int main_pemerge(int argc, char *argv[]) { - int c, flag = 0, i, n; + int c, flag = 0, i, n, min_ovlp = 10; int64_t cnt[MAX_ERR+1]; bseq1_t *bseq; gzFile fp, fp2 = 0; @@ -218,19 +218,24 @@ int main_pemerge(int argc, char *argv[]) pem_opt_t *opt; opt = pem_opt_init(); - while ((c = getopt(argc, argv, "muQ:")) >= 0) { + while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) { if (c == 'm') flag |= 1; else if (c == 'u') flag |= 2; else if (c == 'Q') opt->q_thres = atoi(optarg); + else if (c == 't') opt->n_threads = atoi(optarg); + else if (c == 'T') min_ovlp = atoi(optarg); } if (flag == 0) flag = 3; opt->flag = flag; + opt->T = opt->a * min_ovlp; if (optind == argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa pemerge [-mu] [read2.fq]\n\n"); fprintf(stderr, "Options: -m output merged reads only\n"); fprintf(stderr, " -u output unmerged reads only\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp); fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres); fprintf(stderr, "\n"); free(opt); From b0a76884e8db42f3004fd218a94d663289936f05 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 11:51:23 -0500 Subject: [PATCH 16/17] r340: feature freeze; updated the manpage I will stop adding new features to bwa and prepare for the next release. I will briefly evaluate the variant calling accuracy before the release. --- bwa.1 | 24 +++++++++++++++++++++++- fastmap.c | 2 +- main.c | 4 ++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/bwa.1 b/bwa.1 index 45b9921..b1bbb2a 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools" +.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -81,6 +81,8 @@ genome. .IR minSeedLen ] .RB [ -w .IR bandWidth ] +.RB [ -d +.IR zDropoff ] .RB [ -r .IR seedSplitRatio ] .RB [ -c @@ -163,6 +165,21 @@ Band width. Essentially, gaps longer than will not be found. Note that the maximum gap length is also affected by the scoring matrix and the hit length, not solely determined by this option. [100] .TP +.BI -d \ INT +Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between +the best and the current extension score is above +.RI | i - j |* A + INT , +where +.I i +and +.I j +are the current positions of the query and reference, respectively, and +.I A +is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it +doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not +only avoids unnecessary extension, but also reduces poor alignments inside a +long good alignment. [100] +.TP .BI -r \ FLOAT Trigger re-seeding for a MEM longer than .IR minSeedLen * FLOAT . @@ -215,6 +232,11 @@ and will be converted to a TAB in the output SAM. The read group ID will be attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. [null] .TP +.BI -T \ INT +Don't output alignment with score lower than +.IR INT . +This option only affects output. [30] +.TP .B -a Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments. diff --git a/fastmap.c b/fastmap.c index 9204399..eda06bb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -59,7 +59,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); - fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop); + fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); diff --git a/main.c b/main.c index b06ce8f..da986a7 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r339-beta" +#define PACKAGE_VERSION "0.7.0-r340-beta" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -34,7 +34,7 @@ static int usage() fprintf(stderr, "Command: index index sequences in the FASTA format\n"); fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); - fprintf(stderr, " pemerge merge overlapping paired ends\n"); + fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); fprintf(stderr, " aln gapped/ungapped alignment\n"); fprintf(stderr, " samse generate alignment (single ended)\n"); fprintf(stderr, " sampe generate alignment (paired ended)\n"); From b5b50ac8da72bfe06810bbe24055dfb1218f83e3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 21:35:57 -0500 Subject: [PATCH 17/17] r341: bugfix - wrong mate position when one end is mapped with a score less than -T. Caused by the -T option. --- bwamem_pair.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 9ff12b3..49e9ad2 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -303,7 +303,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co no_pairing: for (i = 0; i < 2; ++i) { - if (a[i].n) { + if (a[i].n && a[i].a[0].score >= opt->T) { mem_alnreg2hit(&a[i].a[0], &h[i]); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); } else h[i].rb = h[i].re = -1; diff --git a/main.c b/main.c index da986a7..a772e3b 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r340-beta" +#define PACKAGE_VERSION "0.7.0-r341-beta" #endif int bwa_fa2pac(int argc, char *argv[]);