From 3e4a178e084397ced83533680219c88549b1619f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Mar 2013 11:14:51 -0500 Subject: [PATCH 01/11] r314: cleanup bwamem API Don't modify input sequences; more documentations --- bwamem.c | 18 ++++++++++++++---- bwamem.h | 29 +++++++++++++++++++++-------- example.c | 2 +- main.c | 2 +- 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/bwamem.c b/bwamem.c index 52dc7fb..7efbb8d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -715,20 +715,29 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse return regs; } -mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) -{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se() +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) +{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence mem_alnreg_v ar; + char *seq; + seq = malloc(l_seq); + memcpy(seq, seq_, l_seq); // makes a copy of seq_ ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); mem_mark_primary_se(opt, ar.n, ar.a); + free(seq); return ar; } // This routine is only used for the API purpose -mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar) +mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; - int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; + int i, w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; int64_t pos, rb = ar->rb, re = ar->re; + uint8_t *query; + + query = malloc(l_query); + for (i = 0; i < l_query; ++i) // convert to the nt4 encoding + query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; memset(&a, 0, sizeof(mem_aln_t)); a.mapq = mem_approx_mapq_se(opt, ar); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); @@ -752,6 +761,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; + free(query); return a; } diff --git a/bwamem.h b/bwamem.h index c2f124c..a856fa5 100644 --- a/bwamem.h +++ b/bwamem.h @@ -64,11 +64,11 @@ typedef struct { // TODO: This is an intermediate struct only. Better get rid of } bwahit_t; typedef struct { // This struct is only used for the convenience of API. - int rid; - int pos; - uint32_t is_rev:1, mapq:8, NM:23; - int n_cigar; - uint32_t *cigar; + int rid; // reference sequence index in bntseq_t + int pos; // forward strand 5'-end mapping position + uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance + int n_cigar; // number of CIGAR operations + uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 } mem_aln_t; #ifdef __cplusplus @@ -116,13 +116,26 @@ extern "C" { * @param bns Information of the reference * @param pac 2-bit encoded reference * @param l_seq length of query sequence - * @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied + * @param seq query sequence * * @return list of aligned regions. */ - mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq); - mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar); + /** + * Generate CIGAR and forward-strand position from alignment region + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param l_seq length of query sequence + * @param seq query sequence + * @param ar one alignment region + * + * @return CIGAR, strand, mapping quality and forward-strand position + */ + mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); /** * Infer the insert size distribution from interleaved alignment regions diff --git a/example.c b/example.c index 6564cbd..b59eec2 100644 --- a/example.c +++ b/example.c @@ -32,7 +32,7 @@ int main(int argc, char *argv[]) for (i = 0; i < ar.n; ++i) { // traverse each hit mem_aln_t a; if (ar.a[i].secondary >= 0) continue; // skip secondary alignments - a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR + a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR // print alignment printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq); for (k = 0; k < a.n_cigar; ++k) // print CIGAR diff --git a/main.c b/main.c index ba60cf7..fee5bd6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r313" +#define PACKAGE_VERSION "0.7.0-r314" #endif static int usage() From 35fb7f9fdfb6bc1396e63f9727301a3a12e9b155 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Mar 2013 11:47:51 -0500 Subject: [PATCH 02/11] r315: move kopen.o out of libbwa.a --- Makefile | 4 ++-- bwamem.h | 3 +-- main.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index eab4198..36f951f 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,9 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o +LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtindex.o bwape.o \ + is.o bwtindex.o bwape.o kopen.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa bwamem-lite diff --git a/bwamem.h b/bwamem.h index a856fa5..3f4ce32 100644 --- a/bwamem.h +++ b/bwamem.h @@ -109,7 +109,7 @@ extern "C" { * Find the aligned regions for one query sequence * * Note that this routine does not generate CIGAR. CIGAR should be - * generated later by bwa_gen_cigar() defined in bwa.c. + * generated later by mem_reg2aln() below. * * @param opt alignment parameters * @param bwt FM-index of the reference sequence @@ -126,7 +126,6 @@ extern "C" { * Generate CIGAR and forward-strand position from alignment region * * @param opt alignment parameters - * @param bwt FM-index of the reference sequence * @param bns Information of the reference * @param pac 2-bit encoded reference * @param l_seq length of query sequence diff --git a/main.c b/main.c index fee5bd6..7499609 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r314" +#define PACKAGE_VERSION "0.7.0-r315" #endif static int usage() From d35f33b5135ce2ea0017e34e04967c34ed04a1bf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 10:22:18 -0500 Subject: [PATCH 03/11] r316: don't allocate zero-length memory It is not a bug, but Electric Fence does not like that. --- bntseq.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bntseq.c b/bntseq.c index 972837e..540e966 100644 --- a/bntseq.c +++ b/bntseq.c @@ -124,7 +124,7 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); l_pac = xx; xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); - bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); + bns->ambs = bns->n_holes? (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0; for (i = 0; i < bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; fscanf(fp, "%lld%d%s", &xx, &p->len, str); diff --git a/main.c b/main.c index 7499609..91a62cc 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r315" +#define PACKAGE_VERSION "0.7.0-r316" #endif static int usage() From 1a451df80082872890dd3a0904abb837be9c4f03 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 10:32:33 -0500 Subject: [PATCH 04/11] prepare to ditch stdaln.{h,c} --- bwtsw2_pair.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index cf29087..cad96e9 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -7,11 +7,7 @@ #include "bwtsw2.h" #include "kstring.h" #include "utils.h" -#ifndef _NO_SSE2 #include "ksw.h" -#else -#include "stdaln.h" -#endif #define MIN_RATIO 0.8 #define OUTLIER_BOUND 2.0 @@ -126,8 +122,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b for (i = 0; i < l_mseq; ++i) // on the forward strand seq[i] = nst_nt4_table[(int)mseq[i]]; } -#ifndef _NO_SSE2 - { // FIXME!!! The following block has not been tested since the update of the ksw library + { int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t; kswr_t aln; aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0); @@ -146,24 +141,6 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len); */ } -#else - { - AlnParam ap; - path_t path[2]; - int matrix[25]; - for (i = 0; i < 25; ++i) matrix[i] = g_mat[i]; - ap.gap_open = opt->q; ap.gap_ext = opt->r; ap.gap_end = opt->r; - ap.matrix = matrix; ap.row = 5; ap.band_width = 50; - a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2); - if (a->G < opt->t) a->G = 0; - if (a->G2 < opt->t) a->G2 = 0; - if (a->G2) a->flag |= BSW2_FLAG_TANDEM; - a->k = beg + path[0].i - 1; - a->len = path[1].i - path[0].i + 1; - a->beg = path[0].j - 1; - a->end = path[1].j; - } -#endif if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i; free(seq); } From 7e00dbcac524f09f6e49b14b118c2fb374ef5867 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 11:35:23 -0500 Subject: [PATCH 05/11] r317: bugfix - out-of-range extension This happens when target region crosses the forward-reverse boundary. This will almost never happen to short-read alignment. --- bwamem.c | 19 +++++++++++-------- main.c | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7efbb8d..10cfae8 100644 --- a/bwamem.c +++ b/bwamem.c @@ -176,7 +176,7 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; #define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) KBTREE_INIT(chn, mem_chain_t, chain_cmp) -static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p) +static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p) { int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; @@ -184,6 +184,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t rend = last->rbeg + last->len; if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing + if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand x = p->qbeg - last->qbeg; // always non-negtive y = p->rbeg - last->rbeg; if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain @@ -197,7 +198,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t return 0; // request to add a new chain } -static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) +static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); @@ -216,9 +217,10 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; + if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain - if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1; + if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1; } else to_add = 1; if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; @@ -249,7 +251,7 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) } } -mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int len, const uint8_t *seq) { mem_chain_v chain; smem_i *itr; @@ -260,7 +262,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin tree = kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); smem_set_query(itr, len, seq); - mem_insert_seed(opt, tree, itr); + mem_insert_seed(opt, l_pac, tree, itr); kv_resize(mem_chain_t, chain, kb_size(tree)); @@ -449,12 +451,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rmax[0] = rmax[0] > 0? rmax[0] : 0; rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1; if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side - if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac; + if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand else rmax[0] = l_pac; } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); - if (rlen != rmax[1] - rmax[0]) return; + assert(rlen == rmax[1] - rmax[0]); srt = malloc(c->n * 8); for (i = 0; i < c->n; ++i) @@ -505,6 +507,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int qle, tle, qe, re, gtle, gscore; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; + assert(re >= 0); a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore); // similar to the above if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; @@ -700,7 +703,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; - chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq); + chn = mem_chain(opt, bwt, bns->l_pac, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); diff --git a/main.c b/main.c index 91a62cc..ec69e81 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r316" +#define PACKAGE_VERSION "0.7.0-r317" #endif static int usage() From 40f121473621f8ef8afbc253ab22f2395a4e0827 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 11:52:11 -0500 Subject: [PATCH 06/11] change to debugging code only --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 10cfae8..848ade7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -513,7 +513,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; else a->qe = l_query, a->re = rmax[0] + re + gtle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + if (bwa_verbose >= 4) { printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { From 733410b50d4bcd772851c4d0930823c0227d1d67 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 14:43:49 -0500 Subject: [PATCH 07/11] r320: speed up very long sequence alignment 100-200bp read alignment should not be affected at all. --- bwamem.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- main.c | 2 +- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 848ade7..84c8424 100644 --- a/bwamem.c +++ b/bwamem.c @@ -421,6 +421,68 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT * Construct the alignment from a chain * ****************************************/ +/* mem_chain2aln() vs mem_chain2aln_short() + * + * mem_chain2aln() covers all the functionality of mem_chain2aln_short(). + * However, it may waste time on extracting the reference sequences given a + * very long query. mem_chain2aln_short() is faster for very short chains in a + * long query. It may fail when the matches are long or reach the end of the + * query. In this case, mem_chain2aln() will be called again. + * mem_chain2aln_short() is almost never used for short-read alignment. + */ + +#define MEM_SHORT_EXT 50 +#define MEM_SHORT_LEN 200 + +int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +{ + int i, qb, qe, xtra; + int64_t rb, re, rlen; + uint8_t *rseq = 0; + mem_alnreg_t a; + kswr_t x; + + if (c->n == 0) return -1; + qb = l_query; qe = 0; + rb = l_pac<<1; re = 0; + memset(&a, 0, sizeof(mem_alnreg_t)); + for (i = 0; i < c->n; ++i) { + const mem_seed_t *s = &c->seeds[i]; + qb = qb < s->qbeg? qb : s->qbeg; + qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len; + rb = rb < s->rbeg? rb : s->rbeg; + re = re > s->rbeg + s->len? re : s->rbeg + s->len; + a.seedcov += s->len; + } + qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT; + if (qb <= 10 || qe >= l_query - 10) return 1; // because ksw_align() does not support end-to-end alignment + rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT; + rb = rb > 0? rb : 0; + re = re < l_pac<<1? re : l_pac<<1; + if (rb < l_pac && l_pac < re) { + if (c->seeds[0].rbeg < l_pac) re = l_pac; + else rb = l_pac; + } + if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1; + if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; + if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; + + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + assert(rlen == re - rb); + xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); + x = ksw_align(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->q, opt->r, xtra, 0); + free(rseq); + if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; + + a.rb = rb + x.tb; a.re = rb + x.te + 1; + a.qb = qb + x.qb; a.qe = qb + x.qe + 1; + a.score = x.score; + a.csub = x.score2; + kv_push(mem_alnreg_t, *av, a); + if (bwa_verbose >= 4) printf("SHORT: [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re); + return 0; +} + static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); @@ -429,7 +491,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) } void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) -{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds +{ int i, k; int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; @@ -710,7 +772,9 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse kv_init(regs); for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; - mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); + int ret; + ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); + if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); free(chn.a[i].seeds); } free(chn.a); diff --git a/main.c b/main.c index ec69e81..be31bf0 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r317" +#define PACKAGE_VERSION "0.7.0-r320" #endif static int usage() From 59bc9341f6e71d7b59774e8a6e51b864fa3c9084 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 17:29:07 -0500 Subject: [PATCH 08/11] code backup; more changes coming later --- bwamem.c | 28 ++++++++++++++++++++-------- bwamem.h | 2 ++ ksw.c | 13 +++++++++---- ksw.h | 2 +- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/bwamem.c b/bwamem.c index 84c8424..42788c4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -43,7 +43,7 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 60; o->max_w = 500; o->pen_unpaired = 9; o->pen_clip = 5; o->min_seed_len = 19; @@ -492,7 +492,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { - int i, k; + int i, k, max_off[2], aw[2]; // aw: actual bandwidth used in extension int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; @@ -549,16 +549,22 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); + a->w = aw[0] = aw[1] = opt->w; if (s->qbeg) { // left extension uint8_t *rs, *qs; - int qle, tle, gtle, gscore; + int qle, tle, gtle, gscore, tmps = -1; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, >le, &gscore); + for (aw[0] = opt->w;; aw[0] <<= 1) { + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); + if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[0], max_off[0]); fflush(stdout); + if (a->score == tmps || aw[0]<<1 > opt->max_w || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; + tmps = a->score; + } // check whether we prefer to reach the end of the query if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end @@ -566,16 +572,21 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re, gtle, gscore; + int qle, tle, qe, re, gtle, gscore, tmps = -1; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; assert(re >= 0); - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore); + for (aw[1] = opt->w;; aw[1] <<= 1) { + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], a->score, &qle, &tle, >le, &gscore, &max_off[1]); + if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); + if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; + tmps = a->score; + } // similar to the above if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; else a->qe = l_query, a->re = rmax[0] + re + gtle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (bwa_verbose >= 4) { printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } + if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { @@ -583,6 +594,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough } + a->w = aw[0] > aw[1]? aw[0] : aw[1]; } free(srt); free(rseq); } @@ -750,7 +762,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p); if (k == 0) mapq0 = h.qual; else if (h.qual > mapq0) h.qual = mapq0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); s->sam = str.s; diff --git a/bwamem.h b/bwamem.h index 3f4ce32..4d632e7 100644 --- a/bwamem.h +++ b/bwamem.h @@ -22,6 +22,7 @@ typedef struct { int pen_unpaired; // phred-scaled penalty for unpaired reads int pen_clip; // clipping penalty. This score is not deducted from the DP score. int w; // band width + int max_w; // max band width int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length @@ -45,6 +46,7 @@ typedef struct { int sub; // 2nd best SW score int csub; // SW score of a tandem hit int sub_n; // approximate number of suboptimal hits + int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary } mem_alnreg_t; diff --git a/ksw.c b/ksw.c index b97fed5..3747cb0 100644 --- a/ksw.c +++ b/ksw.c @@ -359,11 +359,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore, max_off; if (h0 < 0) h0 = 0; // allocate memory qp = malloc(qlen * m); @@ -386,6 +386,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, w = w < max_gap? w : max_gap; // DP loop max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; + max_off = 0; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1, m = 0, mj = -1; @@ -410,7 +411,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, h = h > e? h : e; h = h > f? h : f; h1 = h; // save H(i,j) to h1 for the next column - mj = m > h? mj : j; + mj = m > h? mj : j; // record the position where max score is achieved m = m > h? m : h; // m is stored at eh[mj+1] h -= gapoe; h = h > 0? h : 0; @@ -426,7 +427,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, gscore = gscore > h1? gscore : h1; } if (m == 0) break; - if (m > max) max = m, max_i = i, max_j = mj; + if (m > max) { + max = m, max_i = i, max_j = mj; + max_off = max_off > abs(mj - i)? max_off : abs(mj - i); + } // update beg and end for the next round for (j = mj; j >= beg && eh[j].h; --j); beg = j + 1; @@ -439,6 +443,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, if (_tle) *_tle = max_i + 1; if (_gtle) *_gtle = max_ie + 1; if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; return max; } diff --git a/ksw.h b/ksw.h index d2975de..6d1f7cf 100644 --- a/ksw.h +++ b/ksw.h @@ -102,7 +102,7 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus } From d6096c3f997d52f40aeef1c352461dbd45fd163b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 18:41:57 -0500 Subject: [PATCH 09/11] bugfix: caused by the latest change --- bwamem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 42788c4..d751cfb 100644 --- a/bwamem.c +++ b/bwamem.c @@ -572,12 +572,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re, gtle, gscore, tmps = -1; + int qle, tle, qe, re, gtle, gscore, tmps = -1, sc0 = a->score; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; assert(re >= 0); for (aw[1] = opt->w;; aw[1] <<= 1) { - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], a->score, &qle, &tle, >le, &gscore, &max_off[1]); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], sc0, &qle, &tle, >le, &gscore, &max_off[1]); if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; tmps = a->score; From e0991d6a459daa27031df17405e0d62f0395a6f1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 00:34:33 -0500 Subject: [PATCH 10/11] r323: added Z-dropoff, a variant of blast's X-drop --- bwamem.c | 5 +++-- bwamem.h | 1 + fastmap.c | 6 +++++- ksw.c | 4 ++-- ksw.h | 2 +- main.c | 2 +- 6 files changed, 13 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index d751cfb..20d8bdb 100644 --- a/bwamem.c +++ b/bwamem.c @@ -44,6 +44,7 @@ mem_opt_t *mem_opt_init() o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 60; o->max_w = 500; + o->zdrop = 100; o->pen_unpaired = 9; o->pen_clip = 5; o->min_seed_len = 19; @@ -560,7 +561,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; for (aw[0] = opt->w;; aw[0] <<= 1) { - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[0], max_off[0]); fflush(stdout); if (a->score == tmps || aw[0]<<1 > opt->max_w || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; tmps = a->score; @@ -577,7 +578,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int re = s->rbeg + s->len - rmax[0]; assert(re >= 0); for (aw[1] = opt->w;; aw[1] <<= 1) { - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], sc0, &qle, &tle, >le, &gscore, &max_off[1]); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; tmps = a->score; diff --git a/bwamem.h b/bwamem.h index 4d632e7..7d6e821 100644 --- a/bwamem.h +++ b/bwamem.h @@ -23,6 +23,7 @@ typedef struct { int pen_clip; // clipping penalty. This score is not deducted from the DP score. int w; // band width int max_w; // max band width + int zdrop; // Z-dropoff int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length diff --git a/fastmap.c b/fastmap.c index 56cfb01..eb93994 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,9 +26,10 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:W:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); + else if (c == 'W') opt->max_w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); @@ -42,6 +43,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'c') opt->max_occ = atoi(optarg); + else if (c == 'd') opt->zdrop = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'C') copy_comment = 1; @@ -57,6 +59,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); + fprintf(stderr, " -W INT max band width [%d]\n", opt->max_w); + fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); diff --git a/ksw.c b/ksw.c index 3747cb0..5666a8f 100644 --- a/ksw.c +++ b/ksw.c @@ -359,7 +359,7 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile @@ -426,7 +426,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max_ie = gscore > h1? max_ie : i; gscore = gscore > h1? gscore : h1; } - if (m == 0) break; + if (m == 0 || max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop) break; // drop to zero, or below Z-dropoff if (m > max) { max = m, max_i = i, max_j = mj; max_off = max_off > abs(mj - i)? max_off : abs(mj - i); diff --git a/ksw.h b/ksw.h index 6d1f7cf..2dd6499 100644 --- a/ksw.h +++ b/ksw.h @@ -102,7 +102,7 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus } diff --git a/main.c b/main.c index be31bf0..dd1a481 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r320" +#define PACKAGE_VERSION "0.7.0-r323-beta" #endif static int usage() From efd9769b07114d2d8539e6115277c7e24ce2930f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 00:57:16 -0500 Subject: [PATCH 11/11] r324: a little code cleanup The changes after r317 aim to improve the performance and accuracy for very long query alignment. The short-read alignment should not be affected. The changes include: 1) Z-dropoff. This is a variant of blast's X-dropoff. I orginally thought this heuristic only improves speed, but now I realize it also reduces poor alignment with long good flanking alignments. The difference from blast's X-dropoff is that Z-dropoff allows big gaps, but X-dropoff does not. 2) Band width doubling. When band width is too small, we will get a poor alignment in the middle. Sometimes such alignments cannot be fully excluded with Z-dropoff. Band width doubling is an alternative heuristic. It is based on the observation that the existing of close-to-boundary high score possibly implies inadequate band width. When we see such a signal, we double the band width. --- bwamem.c | 26 +++++++++++++++----------- bwamem.h | 1 - fastmap.c | 4 +--- main.c | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index 20d8bdb..a6897ba 100644 --- a/bwamem.c +++ b/bwamem.c @@ -43,7 +43,7 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 60; o->max_w = 500; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->zdrop = 100; o->pen_unpaired = 9; o->pen_clip = 5; @@ -434,6 +434,7 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT #define MEM_SHORT_EXT 50 #define MEM_SHORT_LEN 200 +#define MAX_BAND_TRY 2 int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { @@ -551,20 +552,22 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); a->w = aw[0] = aw[1] = opt->w; + a->score = -1; if (s->qbeg) { // left extension uint8_t *rs, *qs; - int qle, tle, gtle, gscore, tmps = -1; + int qle, tle, gtle, gscore; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - for (aw[0] = opt->w;; aw[0] <<= 1) { + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[0] = opt->w << i; a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); - if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[0], max_off[0]); fflush(stdout); - if (a->score == tmps || aw[0]<<1 > opt->max_w || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; - tmps = a->score; + if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); + if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits @@ -573,15 +576,16 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re, gtle, gscore, tmps = -1, sc0 = a->score; + int qle, tle, qe, re, gtle, gscore, sc0 = a->score; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; assert(re >= 0); - for (aw[1] = opt->w;; aw[1] <<= 1) { + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[1] = opt->w << i; a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); - if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); - if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; - tmps = a->score; + if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); + if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; diff --git a/bwamem.h b/bwamem.h index 7d6e821..96a3308 100644 --- a/bwamem.h +++ b/bwamem.h @@ -22,7 +22,6 @@ typedef struct { int pen_unpaired; // phred-scaled penalty for unpaired reads int pen_clip; // clipping penalty. This score is not deducted from the DP score. int w; // band width - int max_w; // max band width int zdrop; // Z-dropoff int flag; // see MEM_F_* macros diff --git a/fastmap.c b/fastmap.c index eb93994..40c3a02 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,10 +26,9 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:W:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); - else if (c == 'W') opt->max_w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); @@ -59,7 +58,6 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); - fprintf(stderr, " -W INT max band width [%d]\n", opt->max_w); fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); diff --git a/main.c b/main.c index dd1a481..8b49c8b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r323-beta" +#define PACKAGE_VERSION "0.7.0-r324-beta" #endif static int usage()