diff --git a/Makefile b/Makefile index 8d6d388..5aa07c5 100644 --- a/Makefile +++ b/Makefile @@ -4,8 +4,8 @@ CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o -AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtindex.o bwape.o kopen.o \ +AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ + is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa bwamem-lite @@ -23,10 +23,10 @@ SUBDIRS= . all:$(PROG) bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) bwamem-lite:libbwa.a example.o - $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS) libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) @@ -41,33 +41,33 @@ depend: QSufSort.o: QSufSort.h bamlite.o: utils.h bamlite.h -bntseq.o: bntseq.h main.h utils.h kseq.h +bntseq.o: bntseq.h utils.h kseq.h bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kseq.h bwamem.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h bwamem.o: ksort.h kbtree.h bwamem_pair.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h kvec.h ksw.h -bwape.o: bwtaln.h bwt.h stdaln.h kvec.h bntseq.h utils.h bwase.h bwa.h -bwape.o: khash.h -bwase.o: stdaln.h bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h -bwaseqio.o: bwtaln.h bwt.h stdaln.h utils.h bamlite.h kseq.h +bwape.o: bwtaln.h bwt.h kvec.h bntseq.h utils.h bwase.h bwa.h ksw.h khash.h +bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h ksw.h +bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h kseq.h bwt.o: utils.h bwt.h kvec.h bwt_gen.o: QSufSort.h utils.h bwt_lite.o: bwt_lite.h utils.h -bwtaln.o: bwtaln.h bwt.h stdaln.h bwtgap.h utils.h bwa.h bntseq.h -bwtgap.o: bwtgap.h bwt.h bwtaln.h stdaln.h utils.h -bwtindex.o: bntseq.h bwt.h main.h utils.h -bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h stdaln.h kstring.h -bwtsw2_aux.o: bwa.h kseq.h ksort.h +bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h +bwtgap.o: bwtgap.h bwt.h bwtaln.h utils.h +bwtindex.o: bntseq.h bwt.h utils.h +bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h bwa.h +bwtsw2_aux.o: ksw.h kseq.h ksort.h bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h utils.h ksort.h bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h utils.h khash.h bwtsw2_core.o: ksort.h bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h ksw.h +example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h utils.h fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h utils.h kseq.h is.o: utils.h kopen.o: utils.h kstring.o: kstring.h utils.h ksw.o: ksw.h utils.h -main.o: main.h utils.h -stdaln.o: stdaln.h utils.h +main.o: utils.h +pemerge.o: ksw.h kseq.h utils.h kstring.h bwa.h bntseq.h bwt.h utils.o: utils.h ksort.h kseq.h diff --git a/bntseq.c b/bntseq.c index 4f11c83..01a5e3c 100644 --- a/bntseq.c +++ b/bntseq.c @@ -32,7 +32,6 @@ #include #include #include "bntseq.h" -#include "main.h" #include "utils.h" #include "kseq.h" diff --git a/bwa.1 b/bwa.1 index 45b9921..b1bbb2a 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools" +.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -81,6 +81,8 @@ genome. .IR minSeedLen ] .RB [ -w .IR bandWidth ] +.RB [ -d +.IR zDropoff ] .RB [ -r .IR seedSplitRatio ] .RB [ -c @@ -163,6 +165,21 @@ Band width. Essentially, gaps longer than will not be found. Note that the maximum gap length is also affected by the scoring matrix and the hit length, not solely determined by this option. [100] .TP +.BI -d \ INT +Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between +the best and the current extension score is above +.RI | i - j |* A + INT , +where +.I i +and +.I j +are the current positions of the query and reference, respectively, and +.I A +is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it +doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not +only avoids unnecessary extension, but also reduces poor alignments inside a +long good alignment. [100] +.TP .BI -r \ FLOAT Trigger re-seeding for a MEM longer than .IR minSeedLen * FLOAT . @@ -215,6 +232,11 @@ and will be converted to a TAB in the output SAM. The read group ID will be attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. [null] .TP +.BI -T \ INT +Don't output alignment with score lower than +.IR INT . +This option only affects output. [30] +.TP .B -a Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments. diff --git a/bwa.c b/bwa.c index b3c9191..722ca4d 100644 --- a/bwa.c +++ b/bwa.c @@ -55,7 +55,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) kseq2bseq1(ks2, &seqs[n]); size += seqs[n++].l_seq; } - if (size >= chunk_size) break; + if (size >= chunk_size && (n&1) == 0) break; } if (size == 0) { // test if the 2nd file is finished if (ks2 && kseq_read(ks2) >= 0) @@ -69,6 +69,17 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) * CIGAR related * *****************/ +void bwa_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} + // Generate CIGAR when the alignment end points are known uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) { diff --git a/bwa.h b/bwa.h index 81d40e0..9d5b2aa 100644 --- a/bwa.h +++ b/bwa.h @@ -30,6 +30,7 @@ extern "C" { bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + void bwa_fill_scmat(int a, int b, int8_t mat[25]); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); diff --git a/bwamem.c b/bwamem.c index 1a49186..78d77d0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -44,6 +44,7 @@ mem_opt_t *mem_opt_init() o = xcalloc(1, sizeof(mem_opt_t)); o->flag = 0; o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->T = 30; o->zdrop = 100; o->pen_unpaired = 9; o->pen_clip = 5; @@ -58,21 +59,10 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->max_matesw = 100; - mem_fill_scmat(o->a, o->b, o->mat); + bwa_fill_scmat(o->a, o->b, o->mat); return o; } -void mem_fill_scmat(int a, int b, int8_t mat[25]) -{ - int i, j, k; - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; -} - /*************************** * SMEM iterator interface * ***************************/ @@ -753,11 +743,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b int k; kstring_t str; str.l = str.m = 0; str.s = 0; - if (a->n > 0) { + if (a->n > 0 && a->a[0].score >= opt->T) { int mapq0 = -1; for (k = 0; k < a->n; ++k) { bwahit_t h; mem_alnreg_t *p = &a->a[k]; + if (p->score < opt->T) continue; if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; mem_alnreg2hit(p, &h); diff --git a/bwamem.h b/bwamem.h index 96a3308..14f04d5 100644 --- a/bwamem.h +++ b/bwamem.h @@ -24,6 +24,7 @@ typedef struct { int w; // band width int zdrop; // Z-dropoff + int T; // output score threshold; only affecting output int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor diff --git a/bwamem_pair.c b/bwamem_pair.c index 2fc26b6..837666b 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -303,7 +303,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co no_pairing: for (i = 0; i < 2; ++i) { - if (a[i].n) { + if (a[i].n && a[i].a[0].score >= opt->T) { mem_alnreg2hit(&a[i].a[0], &h[i]); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); } else h[i].rb = h[i].re = -1; diff --git a/bwape.c b/bwape.c index f0ecb7a..c413b34 100644 --- a/bwape.c +++ b/bwape.c @@ -8,9 +8,9 @@ #include "kvec.h" #include "bntseq.h" #include "utils.h" -#include "stdaln.h" #include "bwase.h" #include "bwa.h" +#include "ksw.h" typedef struct { int n; @@ -397,16 +397,17 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw #define SW_MIN_MAPQ 17 // cnt = n_mm<<16 | n_gapo<<8 | n_gape -bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, - int *n_cigar, uint32_t *_cnt) +bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt) { + kswr_t r; + uint32_t *cigar32 = 0; bwa_cigar_t *cigar = 0; ubyte_t *ref_seq; bwtint_t k, x, y, l; - int path_len, ret, subo; - AlnParam ap = aln_param_bwa; - path_t *path, *p; + int xtra; + int8_t mat[25]; + bwa_fill_scmat(1, 3, mat); // check whether there are too many N's if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; for (k = 0, x = 0; k < len; ++k) @@ -417,15 +418,19 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u ref_seq = (ubyte_t*)xcalloc(reglen, 1); for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - path = (path_t*)xcalloc(l+len, sizeof(path_t)); // do alignment - ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo); - if (ret < 0 || subo == ret) { // no hit or tandem hits - free(path); free(cigar); free(ref_seq); *n_cigar = 0; + xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0); + r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); + ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); + + if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score) { // poor hit or tandem hits + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } - cigar = bwa_aln_path2cigar(path, path_len, n_cigar); // check whether the alignment is good enough for (k = 0, x = y = 0; k < *n_cigar; ++k) { @@ -435,17 +440,14 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u else y += __cigar_len(c); } if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough - free(path); free(cigar); free(ref_seq); + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } { // update cigar and coordinate; - int start, end; - p = path + path_len - 1; - *beg += (p->i? p->i : 1) - 1; - start = (p->j? p->j : 1) - 1; - end = path->j; + int start = r.qb, end = r.qe + 1; + *beg += r.tb; cigar = (bwa_cigar_t*)xrealloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); if (start) { memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); @@ -462,8 +464,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u { // set *cnt int n_mm, n_gapo, n_gape; n_mm = n_gapo = n_gape = 0; - p = path + path_len - 1; - x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0; + x = r.tb; y = r.qb; for (k = 0; k < *n_cigar; ++k) { bwa_cigar_t c = cigar[k]; if (__cigar_op(c) == FROM_M) { @@ -479,7 +480,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape; } - free(ref_seq); free(path); + free(ref_seq); return cigar; } diff --git a/bwase.c b/bwase.c index e5204e9..718dd5d 100644 --- a/bwase.c +++ b/bwase.c @@ -4,13 +4,13 @@ #include #include #include -#include "stdaln.h" #include "bwase.h" #include "bwtaln.h" #include "bntseq.h" #include "utils.h" #include "kstring.h" #include "bwa.h" +#include "ksw.h" int g_log_n[256]; @@ -156,57 +156,58 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se bwt_destroy(bwt); } -/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on - * forward strand. This happens when p->pos is calculated by - * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct - * coordinate. This happens only for color-converted alignment. */ -bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, - int ext, int *n_cigar, int is_end_correct) +#define SW_BW 50 + +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, bwtint_t *_pos, int *n_cigar, int is_rev) { bwa_cigar_t *cigar = 0; - ubyte_t *ref_seq; - int l = 0, path_len, ref_len; - AlnParam ap = aln_param_bwa; - path_t *path; - int64_t k, __pos = *_pos; + uint32_t *cigar32 = 0; + ubyte_t *rseq; + int tle, qle, gtle, gscore; + int64_t k, rb, re, rlen; + int8_t mat[25]; - ref_len = len + abs(ext); - if (ext > 0) { - ref_seq = (ubyte_t*)xcalloc(ref_len, 1); - for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - } else { - int64_t x = __pos + (is_end_correct? len : ref_len); - ref_seq = (ubyte_t*)xcalloc(ref_len, 1); - for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - } - path = (path_t*)xcalloc(l+len, sizeof(path_t)); - - aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); - cigar = bwa_aln_path2cigar(path, path_len, n_cigar); - - if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand - for (l = k = 0; k < *n_cigar; ++k) { - if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]); - else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]); + bwa_fill_scmat(1, 3, mat); + if (!is_rev) { // forward strand, the end position is correct + re = *_pos + len; + if (re > l_pac) re = l_pac; + rb = re - (len + SW_BW); + if (rb < 0) rb = 0; + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences + seq_reverse(rlen, rseq, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + if (gscore > 0) tle = gtle, qle = len; + rb = re - tle; rlen = tle; + seq_reverse(len, seq, 0); + seq_reverse(rlen, rseq, 0); + ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); + if (qle < len) { // write soft clip + cigar = xrealloc(cigar, (*n_cigar + 1) * 4); + memmove(cigar + 1, cigar, *n_cigar * 4); + cigar[0] = (len - qle)<<4 | FROM_S; + ++(*n_cigar); + } + } else { // reverse strand, the start position is correct + rb = *_pos; re = rb + len + SW_BW; + if (re > l_pac) re = l_pac; + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + if (gscore > 0) tle = gtle, qle = len; + re = rb + tle; rlen = tle; + ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension + if (qle < len) { + cigar = xrealloc(cigar, (*n_cigar + 1) * 4); + cigar[*n_cigar - 1] = (len - qle)<<4 | FROM_S; + ++(*n_cigar); } - __pos += l; } + *_pos = rb; - if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end - __pos += __cigar_len(cigar[0]); - for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1]; - --(*n_cigar); - } - if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end - - // change "I" at either end of the read to S. just in case. This should rarely happen... - if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1]))); - if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0]))); - - *_pos = (bwtint_t)__pos; - free(ref_seq); free(path); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); + free(rseq); return cigar; } @@ -314,13 +315,11 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t bwt_multi1_t *q = s->multi + j; int n_cigar; if (q->gap == 0) continue; - q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand? 1 : -1) * q->gap, &n_cigar, 1); + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, &n_cigar, q->strand); q->n_cigar = n_cigar; } if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, &s->n_cigar, s->strand); } // generate MD tag str = (kstring_t*)xcalloc(1, sizeof(kstring_t)); @@ -606,5 +605,6 @@ int bwa_sai2sam_se(int argc, char *argv[]) return 0; } bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); + free(prefix); return 0; } diff --git a/bwtaln.c b/bwtaln.c index 2b6a643..b906e1e 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -312,18 +312,3 @@ int bwa_aln(int argc, char *argv[]) free(opt); free(prefix); return 0; } - -/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, -__cigar_op and __cigar_len while keeping stdaln stand alone */ -bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar) -{ - uint32_t *cigar32; - bwa_cigar_t *cigar; - int i; - cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar); - cigar = (bwa_cigar_t*)cigar32; - for (i = 0; i < *n_cigar; ++i) - cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) ); - return cigar; -} - diff --git a/bwtaln.h b/bwtaln.h index 412cc04..556f259 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -28,6 +28,11 @@ #define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) #endif +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 +#define FROM_S 3 + typedef struct { bwtint_t w; int bid; @@ -138,13 +143,6 @@ extern "C" { void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac); - - /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, - __cigar_op and __cigar_len while keeping stdaln stand alone */ -#include "stdaln.h" - - bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar); - #ifdef __cplusplus } #endif diff --git a/bwtindex.c b/bwtindex.c index 0d2a832..6c1731c 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -33,7 +33,6 @@ #include #include "bntseq.h" #include "bwt.h" -#include "main.h" #include "utils.h" #ifdef _DIVBWT diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 3b2e5eb..bcb0bcb 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -11,9 +11,9 @@ #include "bwt_lite.h" #include "utils.h" #include "bwtsw2.h" -#include "stdaln.h" #include "kstring.h" #include "bwa.h" +#include "ksw.h" #include "kseq.h" KSEQ_DECLARE(gzFile) @@ -94,13 +94,12 @@ bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { - int i, matrix[25]; + int i; bwtint_t k; uint8_t *target = 0, *query; - AlnParam par; + int8_t mat[25]; - par.matrix = matrix; - __gen_ap(par, opt); + bwa_fill_scmat(opt->a, opt->b, mat); query = xcalloc(lq, 1); // sort according to the descending order of query end ks_introsort(hit, b->n, b->hits); @@ -111,8 +110,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int score, j; - path_t path; + int score, j, qle, tle; p->n_seeds = 1; if (p->l || p->k == 0) continue; for (j = score = 0; j < i; ++j) { @@ -127,12 +125,12 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem); + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0); if (score > p->G) { // extensible p->G = score; - p->len += path.i; - p->beg -= path.j; - p->k -= path.i; + p->k -= tle; + p->len += tle; + p->beg -= qle; } } free(query); free(target); @@ -140,62 +138,49 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { - int i, matrix[25]; + int i; bwtint_t k; uint8_t *target; - AlnParam par; - - par.matrix = matrix; - __gen_ap(par, opt); + int8_t mat[25]; + + bwa_fill_scmat(opt->a, opt->b, mat); target = xcalloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int j, score; - path_t path; + int j, score, qle, tle; if (p->l) continue; for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem); + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0); // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; - p->len = path.i; - p->end = path.j + p->beg; + p->len = tle; + p->end = p->beg + qle; } } free(target); } /* generate CIGAR array(s) in b->cigar[] */ -static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) +static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name) { - uint8_t *target; - int i, matrix[25]; - AlnParam par; - path_t *path; + int i; + int8_t mat[25]; - par.matrix = matrix; - __gen_ap(par, opt); - i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length - target = xcalloc(i, 1); - path = xcalloc(i + lq, sizeof(path_t)); - // generate CIGAR + bwa_fill_scmat(opt->a, opt->b, mat); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; bsw2aux_t *q = b->aux + i; uint8_t *query; - bwtint_t k; - int path_len, beg, end; + int beg, end, score; if (p->l) continue; beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 1 : 0] + beg; - for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here - target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; - aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); - q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); + q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm); #if 0 if (name && score != p->G) { // debugging only int j, glen = 0; @@ -206,7 +191,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); } #endif - if (beg != 0 || end < lq) { // write soft clipping + if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping q->cigar = xrealloc(q->cigar, 4 * (q->n_cigar + 2)); if (beg != 0) { memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); @@ -219,7 +204,6 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 } } } - free(target); free(path); } /* this is for the debugging purpose only */ @@ -407,27 +391,6 @@ static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *c return n_cigar; } -static int compute_nm(bsw2hit_t *p, int n_cigar, const uint32_t *cigar, const uint8_t *pac, const uint8_t *seq) -{ - int k, x, n_mm = 0, i, n_gap = 0; - bwtint_t y; - x = 0; y = p->k; - for (k = 0; k < n_cigar; ++k) { - int op = cigar[k]&0xf; - int len = cigar[k]>>4; - if (op == 0) { // match - for (i = 0; i < len; ++i) { - int ref = pac[(y+i)>>2] >> (~(y+i)&3)*2 & 0x3; - if (seq[x + i] != ref) ++n_mm; - } - x += len; y += len; - } else if (op == 1) x += len, n_gap += len; - else if (op == 2) y += len, n_gap += len; - else if (op == 4) x += len; - } - return n_mm + n_gap; -} - static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) { int i; @@ -439,7 +402,7 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 } b->aux = xcalloc(b->n, sizeof(bsw2aux_t)); // generate CIGAR - gen_cigar(opt, qlen, seq, pac, b, name); + gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name); // fix CIGAR, generate mapQ, and write chromosomal position for (i = 0; i < b->n; ++i) { bsw2hit_t *p = &b->hits[i]; @@ -451,8 +414,6 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 int subo; // fix out-of-boundary CIGAR q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar); - // compute the NM tag - q->nm = compute_nm(p, q->n_cigar, q->cigar, pac, seq[p->is_rev]); // compute mapQ subo = p->G2 > opt->t? p->G2 : opt->t; if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; @@ -527,9 +488,10 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks bsw2aux_t *q = b->aux + i; int j, beg, end, type = 0; // print mandatory fields before SEQ + if (q->cigar == 0) q->flag |= 0x4; ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0)); ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); - if (p->l == 0) { // not a repetitive hit + if (p->l == 0 && q->cigar) { // not a repetitive hit ksprintf(&str, "\t%d\t", q->pqual); for (k = 0; k < q->n_cigar; ++k) ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); @@ -538,7 +500,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? "*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); // get the sequence begin and end beg = 0; end = ks->l; - if (opt->hard_clip) { + if (opt->hard_clip && q->cigar) { if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4; if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4; } diff --git a/fastmap.c b/fastmap.c index e500c23..76e3c03 100644 --- a/fastmap.c +++ b/fastmap.c @@ -27,13 +27,14 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); else if (c == 'E') opt->r = atoi(optarg); + else if (c == 'T') opt->T = atoi(optarg); else if (c == 'L') opt->pen_clip = atoi(optarg); else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; @@ -59,7 +60,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); - fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop); + fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); @@ -75,6 +76,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -H hard clipping\n"); @@ -85,7 +87,7 @@ int main_mem(int argc, char *argv[]) return 1; } - mem_fill_scmat(opt->a, opt->b, opt->mat); + bwa_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak bwa_print_sam_hdr(idx->bns, rg_line); @@ -105,6 +107,11 @@ int main_mem(int argc, char *argv[]) } while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; + if ((opt->flag & MEM_F_PE) && (n&1) == 1) { + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__); + n = n>>1<<1; + } if (!copy_comment) for (i = 0; i < n; ++i) { free(seqs[i].comment); seqs[i].comment = 0; diff --git a/ksw.c b/ksw.c index 26f908c..700afd0 100644 --- a/ksw.c +++ b/ksw.c @@ -427,7 +427,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max_ie = gscore > h1? max_ie : i; gscore = gscore > h1? gscore : h1; } - if (m == 0 || max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop) break; // drop to zero, or below Z-dropoff + if (m == 0 || (zdrop > 0 && max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop)) break; // drop to zero, or below Z-dropoff if (m > max) { max = m, max_i = i, max_j = mj; max_off = max_off > abs(mj - i)? max_off : abs(mj - i); diff --git a/main.c b/main.c index 3bda623..20f0a02 100644 --- a/main.c +++ b/main.c @@ -1,12 +1,29 @@ #include #include -#include "main.h" #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r324-beta" +#define PACKAGE_VERSION "0.7.0-r341-beta" #endif +int bwa_fa2pac(int argc, char *argv[]); +int bwa_pac2bwt(int argc, char *argv[]); +int bwa_bwtupdate(int argc, char *argv[]); +int bwa_bwt2sa(int argc, char *argv[]); +int bwa_index(int argc, char *argv[]); +int bwt_bwtgen_main(int argc, char *argv[]); + +int bwa_aln(int argc, char *argv[]); +int bwa_sai2sam_se(int argc, char *argv[]); +int bwa_sai2sam_pe(int argc, char *argv[]); + +int bwa_bwtsw2(int argc, char *argv[]); + +int main_fastmap(int argc, char *argv[]); +int main_mem(int argc, char *argv[]); + +int main_pemerge(int argc, char *argv[]); + static int usage() { fprintf(stderr, "\n"); @@ -15,12 +32,13 @@ static int usage() fprintf(stderr, "Contact: Heng Li \n\n"); fprintf(stderr, "Usage: bwa [options]\n\n"); fprintf(stderr, "Command: index index sequences in the FASTA format\n"); + fprintf(stderr, " mem BWA-MEM algorithm\n"); + fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); fprintf(stderr, " aln gapped/ungapped alignment\n"); fprintf(stderr, " samse generate alignment (single ended)\n"); fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); - fprintf(stderr, " fastmap identify super-maximal exact matches\n"); - fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); @@ -56,6 +74,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); + else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/main.h b/main.h deleted file mode 100644 index 3e70362..0000000 --- a/main.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef BWA_MAIN_H -#define BWA_MAIN_H - -#ifdef __cplusplus -extern "C" { -#endif - - int bwa_fa2pac(int argc, char *argv[]); - int bwa_pac2bwt(int argc, char *argv[]); - int bwa_bwtupdate(int argc, char *argv[]); - int bwa_bwt2sa(int argc, char *argv[]); - int bwa_index(int argc, char *argv[]); - int bwa_aln(int argc, char *argv[]); - int bwt_bwtgen_main(int argc, char *argv[]); - - int bwa_sai2sam_se(int argc, char *argv[]); - int bwa_sai2sam_pe(int argc, char *argv[]); - - int bwa_bwtsw2(int argc, char *argv[]); - - int main_fastmap(int argc, char *argv[]); - int main_mem(int argc, char *argv[]); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/pemerge.c b/pemerge.c new file mode 100644 index 0000000..dbcfaab --- /dev/null +++ b/pemerge.c @@ -0,0 +1,286 @@ +#include +#include +#include +#include +#include +#include +#include +#include "ksw.h" +#include "kseq.h" +#include "kstring.h" +#include "bwa.h" +#include "utils.h" +KSEQ_DECLARE(gzFile) + +#define MAX_SCORE_RATIO 0.9f +#define MAX_ERR 8 + +static const char *err_msg[MAX_ERR+1] = { + "successful merges", + "low-scoring pairs", + "pairs where the best SW alignment is not an overlap (long left end)", + "pairs where the best SW alignment is not an overlap (long right end)", + "pairs with large 2nd best SW score", + "pairs with gapped overlap", + "pairs where the end-to-end alignment is inconsistent with SW", + "pairs potentially with tandem overlaps", + "pairs with high sum of errors" +}; + +typedef struct { + int a, b, q, r, w; + int q_def, q_thres; + int T; + int chunk_size; + int n_threads; + int flag; // bit 1: print merged; 2: print unmerged + int8_t mat[25]; +} pem_opt_t; + +pem_opt_t *pem_opt_init() +{ + pem_opt_t *opt; + opt = xcalloc(1, sizeof(pem_opt_t)); + opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20; + opt->T = opt->a * 10; + opt->q_def = 20; + opt->q_thres = 70; + opt->chunk_size = 10000000; + opt->n_threads = 1; + opt->flag = 3; + bwa_fill_scmat(opt->a, opt->b, opt->mat); + return opt; +} + +int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2]) +{ + uint8_t *s[2], *q[2], *seq, *qual; + int i, xtra, l, l_seq, sum_q, ret = 0; + kswr_t r; + + s[0] = xmalloc(x[0].l_seq); q[0] = xmalloc(x[0].l_seq); + s[1] = xmalloc(x[1].l_seq); q[1] = xmalloc(x[1].l_seq); + for (i = 0; i < x[0].l_seq; ++i) { + int c = x[0].seq[i]; + s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c]; + q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def; + } + for (i = 0; i < x[1].l_seq; ++i) { + int c = x[1].seq[x[1].l_seq - 1 - i]; + c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c]; + s[1][i] = c < 4? 3 - c : 4; + q[1][i] = x[1].qual? x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def; + } + + xtra = KSW_XSTART | KSW_XSUBO; + r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); + ++r.qe; ++r.te; // change to the half-close-half-open coordinates + + if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment + if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end + if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end + if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large + if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps + + { // test tandem match; O(n^2) + int max_m, max_m2, min_l, max_l, max_l2; + max_m = max_m2 = 0; max_l = max_l2 = 0; + min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq; + for (l = 1; l < min_l; ++l) { + int m = 0, o = x[0].l_seq - l; + uint8_t *s0o = &s[0][o], *s1 = s[1]; + for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck! + m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i] + if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; + else if (m > max_m2) max_m2 = m, max_l2 = l; + } + if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; } + if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) { + ret = -7; goto pem_ret; + } + if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; } + } + + l = x[0].l_seq - (r.tb - r.qb); // length to merge + l_seq = x[0].l_seq + x[1].l_seq - l; + seq = xmalloc(l_seq + 1); + qual = xmalloc(l_seq + 1); + memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l); + memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l); + for (i = 0, sum_q = 0; i < l; ++i) { + int k = x[0].l_seq - l + i; + if (s[0][k] == 4) { // ambiguous + seq[k] = s[1][i]; + qual[k] = q[1][i]; + } else if (s[1][i] == 4) { // do nothing + } else if (s[0][k] == s[1][i]) { + qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i]; + } else { // s[0][k] != s[1][i] and neither is N + int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i]; + sum_q += qq >= 3? qq<<1 : 1; + seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i]; + qual[k] = abs((int)q[0][k] - (int)q[1][i]); + } + } + if (sum_q>>1 > opt->q_thres) { // too many mismatches + free(seq); free(qual); + ret = -8; goto pem_ret; + } + + for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33; + seq[l_seq] = qual[l_seq] = 0; + + free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment); + memset(&x[1], 0, sizeof(bseq1_t)); + free(x[0].seq); free(x[0].qual); + x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual; + +pem_ret: + free(s[0]); free(s[1]); free(q[0]); free(q[1]); + return ret; +} + +static inline void print_bseq(const bseq1_t *s, int rn) +{ + err_putchar(s->qual? '@' : '>'); + err_fputs(s->name, stdout); + if (rn == 1 || rn == 2) { + err_putchar('/'); err_putchar('0' + rn); err_putchar('\n'); + } else err_puts(" merged"); + err_puts(s->seq); + if (s->qual) { + err_puts("+"); err_puts(s->qual); + } +} + +typedef struct { + int n, start; + bseq1_t *seqs; + int64_t cnt[MAX_ERR+1]; + const pem_opt_t *opt; +} worker_t; + +void *worker(void *data) +{ + worker_t *w = (worker_t*)data; + int i; + for (i = w->start; i < w->n>>1; i += w->opt->n_threads) + ++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])]; + return 0; +} + +static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1]) +{ + int i, j, n = n_>>1<<1; + worker_t *w; + + w = xcalloc(opt->n_threads, sizeof(worker_t)); + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + p->start = i; p->n = n; + p->opt = opt; + p->seqs = seqs; + } + if (opt->n_threads == 1) { + worker(w); + } else { + pthread_t *tid; + tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t)); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + free(tid); + } + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j]; + } + free(w); + for (i = 0; i < n>>1; ++i) { + if (seqs[i<<1|1].l_seq != 0) { + if (opt->flag&2) { + print_bseq(&seqs[i<<1|0], 1); + print_bseq(&seqs[i<<1|1], 2); + } + } else if (opt->flag&1) + print_bseq(&seqs[i<<1|0], 0); + } + for (i = 0; i < n; ++i) { + bseq1_t *s = &seqs[i]; + free(s->name); free(s->seq); free(s->qual); free(s->comment); + } +} + +int main_pemerge(int argc, char *argv[]) +{ + int c, flag = 0, i, n, min_ovlp = 10; + int64_t cnt[MAX_ERR+1]; + bseq1_t *bseq; + gzFile fp, fp2 = 0; + kseq_t *ks, *ks2 = 0; + pem_opt_t *opt; + + opt = pem_opt_init(); + while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) { + if (c == 'm') flag |= 1; + else if (c == 'u') flag |= 2; + else if (c == 'Q') opt->q_thres = atoi(optarg); + else if (c == 't') opt->n_threads = atoi(optarg); + else if (c == 'T') min_ovlp = atoi(optarg); + } + if (flag == 0) flag = 3; + opt->flag = flag; + opt->T = opt->a * min_ovlp; + + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa pemerge [-mu] [read2.fq]\n\n"); + fprintf(stderr, "Options: -m output merged reads only\n"); + fprintf(stderr, " -u output unmerged reads only\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp); + fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres); + fprintf(stderr, "\n"); + free(opt); + return 1; + } + + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (NULL == fp) { + fprintf(stderr, "Couldn't open %s : %s\n", + strcmp(argv[optind], "-") ? argv[optind] : "stdin", + errno ? strerror(errno) : "Out of memory"); + exit(EXIT_FAILURE); + } + ks = kseq_init(fp); + if (optind + 1 < argc) { + fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r"); + if (NULL == fp) { + fprintf(stderr, "Couldn't open %s : %s\n", + strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin", + errno ? strerror(errno) : "Out of memory"); + exit(EXIT_FAILURE); + } + ks2 = kseq_init(fp2); + } + + memset(cnt, 0, 8 * (MAX_ERR+1)); + while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + process_seqs(opt, n, bseq, cnt); + free(bseq); + } + + fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); + for (i = 1; i <= MAX_ERR; ++i) + fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]); + kseq_destroy(ks); + err_gzclose(fp); + if (ks2) { + kseq_destroy(ks2); + err_gzclose(fp2); + } + free(opt); + + err_fflush(stdout); + + return 0; +} diff --git a/stdaln.c b/stdaln.c deleted file mode 100644 index 336a4e4..0000000 --- a/stdaln.c +++ /dev/null @@ -1,1071 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, 2009, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include -#include -#include -#include -#include "stdaln.h" -#include "utils.h" - -/* char -> 17 (=16+1) nucleotides */ -unsigned char aln_nt16_table[256] = { - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, - 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, - 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, - 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 -}; -char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-"; - -/* char -> 5 (=4+1) nucleotides */ -unsigned char aln_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; -char *aln_nt4_rev_table = "AGCTN-"; - -/* char -> 22 (=20+1+1) amino acids */ -unsigned char aln_aa_table[256] = { - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, - 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, - 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, - 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21 -}; -char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-"; - /* 01234567890123456789012 */ - -/* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. */ -unsigned char aln_trans_table_eu[66] = { - 11,11, 2, 2, 1, 1,15,15, 16,16,16,16, 9,12, 9, 9, - 6, 6, 3, 3, 7, 7, 7, 7, 0, 0, 0, 0, 19,19,19,19, - 5, 5, 8, 8, 1, 1, 1, 1, 14,14,14,14, 10,10,10,10, - 20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22 -}; -char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX"; - /* 01234567890123456789012345678901234567890123456789012345678901234 */ -int aln_sm_blosum62[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, - -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, - -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, - -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, - 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, - -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, - -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, - 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, - -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, - -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, - -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, - -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, - -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, - -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, - -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, - -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, - -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, - 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, - -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 -}; - -int aln_sm_blosum45[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0, - -2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1, - -1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1, - -2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1, - -1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2, - -1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1, - -1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1, - 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1, - -2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1, - -1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1, - -1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1, - -1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1, - -1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1, - -2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1, - -1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0, - -2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2, - -2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1, - 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1, - -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1 -}; - -int aln_sm_nt[] = { -/* X A G R C M S V T W K D Y H B N */ - -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2, - -2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0, - -2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0, - -2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0, - -2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0, - -2, 1,-2,-1, 1, 1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0, - -2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0, - -2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, - -2,-2,-2,-2,-1,-2,-2,-2, 2, 1, 1, 0, 1, 0, 0, 0, - -2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0, - -2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0, - -2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0, - -2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -int aln_sm_read[] = { -/* X A G R C M S V T W K D Y H B N */ - -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, - -17, 2,-17, 1,-17, 1,-17, 0,-17, 1,-17, 0,-17, 0,-17, 0, - -17,-17, 2, 1,-17,-17, 1, 0,-17,-17, 1, 0,-17,-17, 0, 0, - -17, 1, 1, 1,-17,-17,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17,-17,-17,-17, 2, 1, 1, 0,-17,-17,-17,-17, 1, 0, 0, 0, - -17, 1,-17,-17, 1, 1,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17,-17, 1,-17, 1,-17, 1, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17, 0, 0, 0, 0, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, - -17,-17,-17,-17,-17,-17,-17,-17, 2, 1, 1, 0, 1, 0, 0, 0, - -17, 1,-17,-17,-17,-17,-17, 0, 1, 1,-17, 0,-17, 0, 0, 0, - -17,-17, 1,-17,-17,-17,-17, 0, 1,-17, 1, 0,-17, 0, 0, 0, - -17, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17,-17,-17,-17, 1,-17,-17, 0, 1,-17,-17, 0, 1, 0, 0, 0, - -17, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -int aln_sm_hs[] = { -/* A G C T N */ - 91, -31,-114,-123, -44, - -31, 100,-125,-114, -42, - -123,-125, 100, -31, -42, - -114,-114, -31, 91, -42, - -44, -42, -42, -42, -43 -}; - -int aln_sm_maq[] = { - 11, -19, -19, -19, -13, - -19, 11, -19, -19, -13, - -19, -19, 11, -19, -13, - -19, -19, -19, 11, -13, - -13, -13, -13, -13, -13 -}; - -int aln_sm_blast[] = { - 1, -3, -3, -3, -2, - -3, 1, -3, -3, -2, - -3, -3, 1, -3, -2, - -3, -3, -3, 1, -2, - -2, -2, -2, -2, -2 -}; - -/********************/ -/* START OF align.c */ -/********************/ - -AlnParam aln_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }; -AlnParam aln_param_bwa = { 26, 9, 5, aln_sm_maq, 5, 50 }; -AlnParam aln_param_nt2nt = { 8, 2, 2, aln_sm_nt, 16, 75 }; -AlnParam aln_param_rd2rd = { 1, 19, 19, aln_sm_read, 16, 75 }; -AlnParam aln_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; - -AlnAln *aln_init_AlnAln() -{ - AlnAln *aa; - aa = (AlnAln*)xmalloc(sizeof(AlnAln)); - aa->path = 0; - aa->out1 = aa->out2 = aa->outm = 0; - aa->path_len = 0; - return aa; -} -void aln_free_AlnAln(AlnAln *aa) -{ - free(aa->path); free(aa->cigar32); - free(aa->out1); free(aa->out2); free(aa->outm); - free(aa); -} - -/***************************/ -/* START OF common_align.c */ -/***************************/ - -#define LOCAL_OVERFLOW_THRESHOLD 32000 -#define LOCAL_OVERFLOW_REDUCE 16000 -#define NT_LOCAL_SCORE int -#define NT_LOCAL_SHIFT 16 -#define NT_LOCAL_MASK 0xffff - -#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; - -#define set_M(MM, cur, p, sc) \ -{ \ - if ((p)->M >= (p)->I) { \ - if ((p)->M >= (p)->D) { \ - (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } else { \ - if ((p)->I > (p)->D) { \ - (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } \ -} -#define set_I(II, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_ext; \ - } \ -} -#define set_end_I(II, cur, p) \ -{ \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_end; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_end; \ - } \ - } else set_I(II, cur, p); \ -} -#define set_D(DD, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_ext; \ - } \ -} -#define set_end_D(DD, cur, p) \ -{ \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_end; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_end; \ - } \ - } else set_D(DD, cur, p); \ -} - -typedef struct -{ - unsigned char Mt:3, It:2, Dt:2; -} dpcell_t; - -typedef struct -{ - int M, I, D; -} dpscore_t; - -/* build score profile for accelerating alignment, in theory */ -void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array) -{ - int *tmp, *tmp2, i, k; - for (i = 0; i != row; ++i) { - tmp = score_matrix + i * row; - tmp2 = s_array[i]; - for (k = 0; k != len; ++k) - tmp2[k] = tmp[seq[k]]; - } -} -/*************************** - * banded global alignment * - ***************************/ -int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len) -{ - register int i, j; - dpcell_t **dpcell, *q; - dpscore_t *curr, *last, *s; - path_t *p; - int b1, b2, tmp_end; - int *mat, end, max; - unsigned char type, ctype; - - int gap_open, gap_ext, gap_end, b; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - gap_end = ap->gap_end; - b = ap->band_width; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (len1 == 0 || len2 == 0) { - *path_len = 0; - return 0; - } - /* calculate b1 and b2 */ - if (len1 > len2) { - b1 = len1 - len2 + b; - b2 = b; - } else { - b1 = b; - b2 = len2 - len1 + b; - } - if (b1 > len1) b1 = len1; - if (b2 > len2) b2 = len2; - --seq1; --seq2; - - /* allocate memory */ - end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); - dpcell = (dpcell_t**)xmalloc(sizeof(dpcell_t*) * (len2 + 1)); - for (j = 0; j <= len2; ++j) - dpcell[j] = (dpcell_t*)xmalloc(sizeof(dpcell_t) * end); - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] -= j - b2; - curr = (dpscore_t*)xmalloc(sizeof(dpscore_t) * (len1 + 1)); - last = (dpscore_t*)xmalloc(sizeof(dpscore_t) * (len1 + 1)); - - /* set first row */ - SET_INF(*curr); curr->M = 0; - for (i = 1, s = curr + 1; i < b1; ++i, ++s) { - SET_INF(*s); - set_end_D(s->D, dpcell[0] + i, s - 1); - } - s = curr; curr = last; last = s; - - /* core dynamic programming, part 1 */ - tmp_end = (b2 < len2)? b2 : len2 - 1; - for (j = 1; j <= tmp_end; ++j) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - /* last row for part 1, use set_end_D() instead of set_D() */ - if (j == len2 && b2 != len2 - 1) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_end_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - ++j; - } - - /* core dynamic programming, part 2 */ - for (; j <= len2 - b2 + 1; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - end = j + b1 - 1; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - - /* core dynamic programming, part 3 */ - for (; j < len2; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - /* last row */ - if (j == len2) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - - /* backtrace */ - i = len1; j = len2; - q = dpcell[j] + i; - s = last + len1; - max = s->M; type = q->Mt; ctype = FROM_M; - if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } - if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } - - p = path; - p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ - ++p; - do { - switch (ctype) { - case FROM_M: --i; --j; break; - case FROM_I: --j; break; - case FROM_D: --i; break; - } - q = dpcell[j] + i; - ctype = type; - switch (type) { - case FROM_M: type = q->Mt; break; - case FROM_I: type = q->It; break; - case FROM_D: type = q->Dt; break; - } - p->ctype = ctype; p->i = i; p->j = j; - ++p; - } while (i || j); - *path_len = p - path - 1; - - /* free memory */ - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] += j - b2; - for (j = 0; j <= len2; ++j) - free(dpcell[j]); - free(dpcell); - free(curr); free(last); - - return max; -} -/************************************************* - * local alignment combined with banded strategy * - *************************************************/ -int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int _thres, int *_subo) -{ - register NT_LOCAL_SCORE *s; - register int i; - int q, r, qr, tmp_len, qr_shift; - int **s_array, *score_array; - int e, f; - int is_overflow, of_base; - NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h; - int j, start_i, start_j, end_i, end_j; - path_t *p; - int score_f, score_r, score_g; - int start, end, max_score; - int thres, *suba, *ss; - - int gap_open, gap_ext; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - thres = _thres > 0? _thres : -_thres; - - if (len1 == 0 || len2 == 0) return -1; - - /* allocate memory */ - suba = (int*)xmalloc(sizeof(int) * (len2 + 1)); - eh = (NT_LOCAL_SCORE*)xmalloc(sizeof(NT_LOCAL_SCORE) * (len1 + 1)); - s_array = (int**)xmalloc(sizeof(int*) * N_MATRIX_ROW); - for (i = 0; i != N_MATRIX_ROW; ++i) - s_array[i] = (int*)xmalloc(sizeof(int) * len1); - /* initialization */ - aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - q = gap_open; - r = gap_ext; - qr = q + r; - qr_shift = (qr+1) << NT_LOCAL_SHIFT; - tmp_len = len1 + 1; - start_i = start_j = end_i = end_j = 0; - for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i) - if (max_score < score_matrix[i]) max_score = score_matrix[i]; - /* convert the coordinate */ - --seq1; --seq2; - for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; - - /* forward dynamic programming */ - for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0; - score_f = 0; - is_overflow = of_base = 0; - suba[0] = 0; - for (j = 1, ss = suba + 1; j <= len2; ++j, ++ss) { - int subo = 0; - last_h = f = 0; - score_array = s_array[seq2[j]]; - if (is_overflow) { /* adjust eh[] array if overflow occurs. */ - /* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed. - * If it is too large, this block will be excuted frequently and therefore - * slow down the whole program. - * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the - * number of assignments because it sets some cells to zero when overflow - * happens. */ - int tmp, tmp2; - score_f -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = 1, s = eh; i <= tmp_len; ++i, ++s) { - tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << NT_LOCAL_SHIFT) | tmp2; - } - } - for (i = 1, s = eh; i != tmp_len; ++i, ++s) { - /* prepare for calculate current h */ - curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; - if (curr_h < 0) curr_h = 0; - if (last_h > 0) { /* initialize f */ - f = (f > last_h - q)? f - r : last_h - qr; - if (curr_h < f) curr_h = f; - } - if (*(s+1) >= qr_shift) { /* initialize e */ - curr_last_h = *(s+1) >> NT_LOCAL_SHIFT; - e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; - if (curr_h < e) curr_h = e; - *s = (last_h << NT_LOCAL_SHIFT) | e; - } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */ - last_h = curr_h; - if (subo < curr_h) subo = curr_h; - if (score_f < curr_h) { - score_f = curr_h; end_i = i; end_j = j; - if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - *s = last_h << NT_LOCAL_SHIFT; - *ss = subo + of_base; - } - score_f += of_base; - - if (score_f < thres) { /* no matching residue at all, 090218 */ - if (path_len) *path_len = 0; - goto end_func; - } - if (path == 0) goto end_func; /* skip path-filling */ - - /* reverse dynamic programming */ - for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0; - if (end_i == 0 || end_j == 0) goto end_func; /* no local match */ - score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]]; - is_overflow = of_base = 0; - start_i = end_i; start_j = end_j; - eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */ - start = end_i - 1; - end = end_i - 3; - if (end <= 0) end = 0; - - /* second pass DP can be done in a band, speed will thus be enhanced */ - for (j = end_j - 1; j != 0; --j) { - last_h = f = 0; - score_array = s_array[seq2[j]]; - if (is_overflow) { /* adjust eh[] array if overflow occurs. */ - int tmp, tmp2; - score_r -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = start, s = eh + start + 1; i >= end; --i, --s) { - tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << NT_LOCAL_SHIFT) | tmp2; - } - } - for (i = start, s = eh + start + 1; i != end; --i, --s) { - /* prepare for calculate current h */ - curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; - if (curr_h < 0) curr_h = 0; - if (last_h > 0) { /* initialize f */ - f = (f > last_h - q)? f - r : last_h - qr; - if (curr_h < f) curr_h = f; - } - curr_last_h = *(s-1) >> NT_LOCAL_SHIFT; - e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; - if (e < 0) e = 0; - if (curr_h < e) curr_h = e; - *s = (last_h << NT_LOCAL_SHIFT) | e; - last_h = curr_h; - if (score_r < curr_h) { - score_r = curr_h; start_i = i; start_j = j; - if (score_r + of_base - qr == score_f) { - j = 1; break; - } - if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - *s = last_h << NT_LOCAL_SHIFT; - /* recalculate start and end, the boundaries of the band */ - if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start; - if (start <= 0) start = 0; - end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1; - if (end <= 0) end = 0; - } - - if (_subo) { - int tmp2 = 0, tmp = (int)(start_j - .33 * (end_j - start_j) + .499); - for (j = 1; j <= tmp; ++j) - if (tmp2 < suba[j]) tmp2 = suba[j]; - tmp = (int)(end_j + .33 * (end_j - start_j) + .499); - for (j = tmp; j <= len2; ++j) - if (tmp2 < suba[j]) tmp2 = suba[j]; - *_subo = tmp2; - } - - if (path_len == 0) { - path[0].i = start_i; path[0].j = start_j; - path[1].i = end_i; path[1].j = end_j; - goto end_func; - } - - score_r += of_base; - score_r -= qr; - -#ifdef DEBUG - /* this seems not a bug */ - if (score_f != score_r) - fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r); -#endif - - if (_thres > 0) { /* call global alignment to fill the path */ - score_g = 0; - j = (end_i - start_i > end_j - start_j)? end_i - start_i : end_j - start_j; - ++j; /* j is the maximum band_width */ - for (i = ap->band_width;; i <<= 1) { - AlnParam ap_real = *ap; - ap_real.gap_end = -1; - ap_real.band_width = i; - score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j, - end_j - start_j + 1, &ap_real, path, path_len); - if (score_g == score_r || score_f == score_g) break; - if (i > j) break; - } - if (score_r > score_g && score_f > score_g) { - fprintf(stderr, "[aln_local_core] Potential bug: (%d,%d) > %d\n", score_f, score_r, score_g); - score_f = score_r = -1; - } else score_f = score_g; - - /* convert coordinate */ - for (p = path + *path_len - 1; p >= path; --p) { - p->i += start_i - 1; - p->j += start_j - 1; - } - } else { /* just store the start and end */ - *path_len = 2; - path[1].i = start_i; path[1].j = start_j; - path->i = end_i; path->j = end_j; - } - -end_func: - /* free */ - free(eh); free(suba); - for (i = 0; i != N_MATRIX_ROW; ++i) { - ++s_array[i]; - free(s_array[i]); - } - free(s_array); - return score_f; -} -AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, - int type, int thres, int len1, int len2) -{ - unsigned char *seq11, *seq22; - int score; - int i, j, l; - path_t *p; - char *out1, *out2, *outm; - AlnAln *aa; - - if (len1 < 0) len1 = strlen(seq1); - if (len2 < 0) len2 = strlen(seq2); - - aa = aln_init_AlnAln(); - seq11 = (unsigned char*)xmalloc(sizeof(unsigned char) * len1); - seq22 = (unsigned char*)xmalloc(sizeof(unsigned char) * len2); - aa->path = (path_t*)xmalloc(sizeof(path_t) * (len1 + len2 + 1)); - - if (ap->row < 10) { /* 4-nucleotide alignment */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_nt4_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_nt4_table[(int)seq2[j]]; - } else if (ap->row < 20) { /* 16-nucleotide alignment */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_nt16_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_nt16_table[(int)seq2[j]]; - } else { /* amino acids */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_aa_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_aa_table[(int)seq2[j]]; - } - - if (type == ALN_TYPE_GLOBAL) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len); - else if (type == ALN_TYPE_LOCAL) score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, thres, &aa->subo); - else if (type == ALN_TYPE_EXTEND) score = aln_extend_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, 1, 0); - else { - free(seq11); free(seq22); free(aa->path); - aln_free_AlnAln(aa); - return 0; - } - aa->score = score; - - if (thres > 0) { - out1 = aa->out1 = (char*)xmalloc(sizeof(char) * (aa->path_len + 1)); - out2 = aa->out2 = (char*)xmalloc(sizeof(char) * (aa->path_len + 1)); - outm = aa->outm = (char*)xmalloc(sizeof(char) * (aa->path_len + 1)); - - --seq1; --seq2; - --seq11; --seq22; - - p = aa->path + aa->path_len - 1; - - for (l = 0; p >= aa->path; --p, ++l) { - switch (p->ctype) { - case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j]; - outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' '; - break; - case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break; - case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break; - } - } - out1[l] = out2[l] = outm[l] = '\0'; - ++seq11; ++seq22; - } - - free(seq11); - free(seq22); - - p = aa->path + aa->path_len - 1; - aa->start1 = p->i? p->i : 1; - aa->end1 = aa->path->i; - aa->start2 = p->j? p->j : 1; - aa->end2 = aa->path->j; - aa->cigar32 = aln_path2cigar32(aa->path, aa->path_len, &aa->n_cigar); - - return aa; -} -AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int thres) -{ - return aln_stdaln_aux(seq1, seq2, ap, type, thres, -1, -1); -} - -/* for backward compatibility */ -uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) -{ - uint32_t *cigar32; - uint16_t *cigar; - int i; - cigar32 = aln_path2cigar32(path, path_len, n_cigar); - cigar = (uint16_t*)cigar32; - for (i = 0; i < *n_cigar; ++i) - cigar[i] = (cigar32[i]&0xf)<<14 | (cigar32[i]>>4&0x3fff); - return cigar; -} - -/* newly added functions (2009-07-21) */ - -int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int G0, uint8_t *_mem) -{ - int q, r, qr; - int32_t **s_array, *score_array; - int is_overflow, of_base; - uint32_t *eh; - int i, j, end_i, end_j; - int score, start, end; - int *score_matrix, N_MATRIX_ROW; - uint8_t *mem, *_p; - - /* initialize some align-related parameters. just for compatibility */ - q = ap->gap_open; - r = ap->gap_ext; - qr = q + r; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (len1 == 0 || len2 == 0) return -1; - - /* allocate memory */ - mem = _mem? _mem : xcalloc((len1 + 2) * (N_MATRIX_ROW + 1), 4); - _p = mem; - eh = (uint32_t*)_p, _p += 4 * (len1 + 2); - s_array = xcalloc(N_MATRIX_ROW, sizeof(void*)); - for (i = 0; i != N_MATRIX_ROW; ++i) - s_array[i] = (int32_t*)_p, _p += 4 * len1; - /* initialization */ - aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - start = 1; end = 2; - end_i = end_j = 0; - score = 0; - is_overflow = of_base = 0; - /* convert the coordinate */ - --seq1; --seq2; - for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; - - /* dynamic programming */ - memset(eh, 0, 4 * (len1 + 2)); - eh[1] = (uint32_t)G0<<16; - for (j = 1; j <= len2; ++j) { - int _start, _end; - int h1 = 0, f = 0; - score_array = s_array[seq2[j]]; - /* set start and end */ - _start = j - ap->band_width; - if (_start < 1) _start = 1; - if (_start > start) start = _start; - _end = j + ap->band_width; - if (_end > len1 + 1) _end = len1 + 1; - if (_end < end) end = _end; - if (start == end) break; - /* adjust eh[] array if overflow occurs. */ - if (is_overflow) { - int tmp, tmp2; - score -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = start; i <= end; ++i) { - uint32_t *s = &eh[i]; - tmp = *s >> 16; tmp2 = *s & 0xffff; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << 16) | tmp2; - } - } - _start = _end = 0; - /* the inner loop */ - for (i = start; i < end; ++i) { - /* At the beginning of each cycle: - eh[i] -> h[j-1,i-1]<<16 | e[j,i] - f -> f[j,i] - h1 -> h[j,i-1] - */ - uint32_t *s = &eh[i]; - int h = (int)(*s >> 16); - int e = *s & 0xffff; /* this is e[j,i] */ - *s = (uint32_t)h1 << 16; /* eh[i] now stores h[j,i-1]<<16 */ - h += h? score_array[i] : 0; /* this is left_core() specific */ - /* calculate h[j,i]; don't need to test 0, as {e,f}>=0 */ - h = h > e? h : e; - h = h > f? h : f; /* h now is h[j,i] */ - h1 = h; - if (h > 0) { - if (_start == 0) _start = i; - _end = i; - if (score < h) { - score = h; end_i = i; end_j = j; - if (score > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - /* calculate e[j+1,i] and f[j,i+1] */ - h -= qr; - h = h > 0? h : 0; - e -= r; - e = e > h? e : h; - f -= r; - f = f > h? f : h; - *s |= e; - } - eh[end] = h1 << 16; - /* recalculate start and end, the boundaries of the band */ - if (_end <= 0) break; /* no cell in this row has a positive score */ - start = _start; - end = _end + 3; - } - - score += of_base - 1; - if (score <= 0) { - if (path_len) *path_len = 0; - goto end_left_func; - } - - if (path == 0) goto end_left_func; - - if (path_len == 0) { - path[0].i = end_i; path[0].j = end_j; - goto end_left_func; - } - - { /* call global alignment to fill the path */ - int score_g = 0; - j = (end_i - 1 > end_j - 1)? end_i - 1 : end_j - 1; - ++j; /* j is the maximum band_width */ - for (i = ap->band_width;; i <<= 1) { - AlnParam ap_real = *ap; - ap_real.gap_end = -1; - ap_real.band_width = i; - score_g = aln_global_core(seq1 + 1, end_i, seq2 + 1, end_j, &ap_real, path, path_len); - if (score == score_g) break; - if (i > j) break; - } - if (score > score_g) - fprintf(stderr, "[aln_left_core] no suitable bandwidth: %d < %d\n", score_g, score); - score = score_g; - } - -end_left_func: - /* free */ - free(s_array); - if (!_mem) free(mem); - return score; -} - -uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar) -{ - int i, n; - uint32_t *cigar; - unsigned char last_type; - - if (path_len == 0 || path == 0) { - *n_cigar = 0; - return 0; - } - - last_type = path->ctype; - for (i = n = 1; i < path_len; ++i) { - if (last_type != path[i].ctype) ++n; - last_type = path[i].ctype; - } - *n_cigar = n; - cigar = (uint32_t*)xmalloc(*n_cigar * 4); - - cigar[0] = 1u << 4 | path[path_len-1].ctype; - last_type = path[path_len-1].ctype; - for (i = path_len - 2, n = 0; i >= 0; --i) { - if (path[i].ctype == last_type) cigar[n] += 1u << 4; - else { - cigar[++n] = 1u << 4 | path[i].ctype; - last_type = path[i].ctype; - } - } - - return cigar; -} - -#ifdef STDALN_MAIN -int main() -{ - AlnAln *aln_local, *aln_global, *aln_left; - int i; - - aln_local = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCgCTCGGCTAGCTGT", &aln_param_blast, 0, 1); - aln_global = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 1, 1); -// aln_left = aln_stdaln( "GATGCACTGCATACGGCTCGCCTAGATCA", "GATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 2, 1); - aln_left = aln_stdaln("CACCTTCGACTCACGTCTCATTCTCGGAGTCGAGTGGACGGTCCCTCATACACGAACAGGTTC", - "CACCTTCGACTTTCACCTCTCATTCTCGGACTCGAGTGGACGGTCCCTCATCCAAGAACAGGGTCTGTGAAA", &aln_param_blast, 2, 1); - - printf(">%d,%d\t%d,%d\n", aln_local->start1, aln_local->end1, aln_local->start2, aln_local->end2); - printf("%s\n%s\n%s\n", aln_local->out1, aln_local->outm, aln_local->out2); - - printf(">%d,%d\t%d,%d\t", aln_global->start1, aln_global->end1, aln_global->start2, aln_global->end2); - for (i = 0; i != aln_global->n_cigar; ++i) - printf("%d%c", aln_global->cigar32[i]>>4, "MID"[aln_global->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aln_global->out1, aln_global->outm, aln_global->out2); - - printf(">%d\t%d,%d\t%d,%d\t", aln_left->score, aln_left->start1, aln_left->end1, aln_left->start2, aln_left->end2); - for (i = 0; i != aln_left->n_cigar; ++i) - printf("%d%c", aln_left->cigar32[i]>>4, "MID"[aln_left->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aln_left->out1, aln_left->outm, aln_left->out2); - - aln_free_AlnAln(aln_local); - aln_free_AlnAln(aln_global); - aln_free_AlnAln(aln_left); - return 0; -} -#endif diff --git a/stdaln.h b/stdaln.h deleted file mode 100644 index f0048b3..0000000 --- a/stdaln.h +++ /dev/null @@ -1,162 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - 2009-07-23, 0.10.0 - - - Use 32-bit to store CIGAR - - - Report suboptimal aligments - - - Implemented half-fixed-half-open DP - - 2009-04-26, 0.9.10 - - - Allow to set a threshold for local alignment - - 2009-02-18, 0.9.9 - - - Fixed a bug when no residue matches - - 2008-08-04, 0.9.8 - - - Fixed the wrong declaration of aln_stdaln_aux() - - - Avoid 0 coordinate for global alignment - - 2008-08-01, 0.9.7 - - - Change gap_end penalty to 5 in aln_param_bwa - - - Add function to convert path_t to the CIGAR format - - 2008-08-01, 0.9.6 - - - The first gap now costs (gap_open+gap_ext), instead of - gap_open. Scoring systems are modified accordingly. - - - Gap end is now correctly handled. Previously it is not correct. - - - Change license to MIT. - - */ - -#ifndef LH3_STDALN_H_ -#define LH3_STDALN_H_ - - -#define STDALN_VERSION 0.11.0 - -#include - -#define FROM_M 0 -#define FROM_I 1 -#define FROM_D 2 -#define FROM_S 3 - -#define ALN_TYPE_LOCAL 0 -#define ALN_TYPE_GLOBAL 1 -#define ALN_TYPE_EXTEND 2 - -/* This is the smallest integer. It might be CPU-dependent in very RARE cases. */ -#define MINOR_INF -1073741823 - -typedef struct -{ - int gap_open; - int gap_ext; - int gap_end; - - int *matrix; - int row; - int band_width; -} AlnParam; - -typedef struct -{ - int i, j; - unsigned char ctype; -} path_t; - -typedef struct -{ - path_t *path; /* for advanced users... :-) */ - int path_len; /* for advanced users... :-) */ - int start1, end1; /* start and end of the first sequence, coordinations are 1-based */ - int start2, end2; /* start and end of the second sequence, coordinations are 1-based */ - int score, subo; /* score */ - - char *out1, *out2; /* print them, and then you will know */ - char *outm; - - int n_cigar; - uint32_t *cigar32; -} AlnAln; - -#ifdef __cplusplus -extern "C" { -#endif - - AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, - int type, int do_align, int len1, int len2); - AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int do_align); - void aln_free_AlnAln(AlnAln *aa); - - int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len); - int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int _thres, int *_subo); - int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int G0, uint8_t *_mem); - uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar); - uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar); - -#ifdef __cplusplus -} -#endif - -/******************** - * global variables * - ********************/ - -extern AlnParam aln_param_bwa; /* = { 37, 9, 0, aln_sm_maq, 5, 50 }; */ -extern AlnParam aln_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ -extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */ -extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; */ -extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; */ - -/* common nucleotide score matrix for 16 bases */ -extern int aln_sm_nt[], aln_sm_bwa[]; - -/* BLOSUM62 and BLOSUM45 */ -extern int aln_sm_blosum62[], aln_sm_blosum45[]; - -/* common read for 16 bases. note that read alignment is quite different from common nucleotide alignment */ -extern int aln_sm_read[]; - -/* human-mouse score matrix for 4 bases */ -extern int aln_sm_hs[]; - -#endif