From 98f896675094c3bb12203717f29b45757e5fd056 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 12:00:24 -0500 Subject: [PATCH] r329: ditch stdaln.{c,h}; no changes to bwa-mem stdaln.{c,h} was written ten years ago. Its local and SW extension code are actually buggy (though that rarely happens and usually does not affect the results too much). ksw.{c,h} is more concise, potentially faster, less buggy, and richer in features. --- Makefile | 6 +- bwape.c | 41 ++- bwase.c | 6 +- bwtaln.c | 15 - bwtaln.h | 12 +- main.c | 2 +- stdaln.c | 1070 ------------------------------------------------------ stdaln.h | 162 --------- 8 files changed, 33 insertions(+), 1281 deletions(-) delete mode 100644 stdaln.c delete mode 100644 stdaln.h diff --git a/Makefile b/Makefile index 36f951f..96c3047 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o -AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ +AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o @@ -48,8 +48,8 @@ fastmap.o:bwt.h bwamem.h bwtaln.o:bwt.h bwtaln.h kseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h -bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h -bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h +bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h +bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h bwtsw2_main.o:bwtsw2.h clean: diff --git a/bwape.c b/bwape.c index 0b2b8d6..9fd12b1 100644 --- a/bwape.c +++ b/bwape.c @@ -8,9 +8,9 @@ #include "kvec.h" #include "bntseq.h" #include "utils.h" -#include "stdaln.h" #include "bwase.h" #include "bwa.h" +#include "ksw.h" typedef struct { int n; @@ -397,16 +397,17 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw #define SW_MIN_MAPQ 17 // cnt = n_mm<<16 | n_gapo<<8 | n_gape -bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, - int *n_cigar, uint32_t *_cnt) +bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt) { + kswr_t r; + uint32_t *cigar32 = 0; bwa_cigar_t *cigar = 0; ubyte_t *ref_seq; bwtint_t k, x, y, l; - int path_len, ret, subo; - AlnParam ap = aln_param_bwa; - path_t *path, *p; + int xtra; + int8_t mat[25]; + bwa_fill_scmat(1, 3, mat); // check whether there are too many N's if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; for (k = 0, x = 0; k < len; ++k) @@ -417,15 +418,19 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u ref_seq = (ubyte_t*)calloc(reglen, 1); for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - path = (path_t*)calloc(l+len, sizeof(path_t)); // do alignment - ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo); - if (ret < 0 || subo == ret) { // no hit or tandem hits - free(path); free(cigar); free(ref_seq); *n_cigar = 0; + xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0); + r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); + ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); + + if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score) { // poor hit or tandem hits + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } - cigar = bwa_aln_path2cigar(path, path_len, n_cigar); // check whether the alignment is good enough for (k = 0, x = y = 0; k < *n_cigar; ++k) { @@ -435,17 +440,14 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u else y += __cigar_len(c); } if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough - free(path); free(cigar); free(ref_seq); + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } { // update cigar and coordinate; - int start, end; - p = path + path_len - 1; - *beg += (p->i? p->i : 1) - 1; - start = (p->j? p->j : 1) - 1; - end = path->j; + int start = r.qb, end = r.qe + 1; + *beg += r.tb; cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); if (start) { memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); @@ -462,8 +464,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u { // set *cnt int n_mm, n_gapo, n_gape; n_mm = n_gapo = n_gape = 0; - p = path + path_len - 1; - x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0; + x = r.tb; y = r.qb; for (k = 0; k < *n_cigar; ++k) { bwa_cigar_t c = cigar[k]; if (__cigar_op(c) == FROM_M) { @@ -479,7 +480,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape; } - free(ref_seq); free(path); + free(ref_seq); return cigar; } diff --git a/bwase.c b/bwase.c index 9e2696b..eebe22b 100644 --- a/bwase.c +++ b/bwase.c @@ -4,7 +4,6 @@ #include #include #include -#include "stdaln.h" #include "bwase.h" #include "bwtaln.h" #include "bntseq.h" @@ -205,8 +204,8 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end // change "I" at either end of the read to S. just in case. This should rarely happen... - if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1]))); - if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0]))); + if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(FROM_S, (__cigar_len(cigar[*n_cigar-1]))); + if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(FROM_S, (__cigar_len(cigar[0]))); *_pos = (bwtint_t)__pos; free(ref_seq); @@ -589,5 +588,6 @@ int bwa_sai2sam_se(int argc, char *argv[]) return 0; } bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); + free(prefix); return 0; } diff --git a/bwtaln.c b/bwtaln.c index 96d4026..d132157 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -312,18 +312,3 @@ int bwa_aln(int argc, char *argv[]) free(opt); free(prefix); return 0; } - -/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, -__cigar_op and __cigar_len while keeping stdaln stand alone */ -bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar) -{ - uint32_t *cigar32; - bwa_cigar_t *cigar; - int i; - cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar); - cigar = (bwa_cigar_t*)cigar32; - for (i = 0; i < *n_cigar; ++i) - cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) ); - return cigar; -} - diff --git a/bwtaln.h b/bwtaln.h index 412cc04..556f259 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -28,6 +28,11 @@ #define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) #endif +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 +#define FROM_S 3 + typedef struct { bwtint_t w; int bid; @@ -138,13 +143,6 @@ extern "C" { void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac); - - /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, - __cigar_op and __cigar_len while keeping stdaln stand alone */ -#include "stdaln.h" - - bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar); - #ifdef __cplusplus } #endif diff --git a/main.c b/main.c index 8b49c8b..00a21b9 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r324-beta" +#define PACKAGE_VERSION "0.7.0-r329-beta" #endif static int usage() diff --git a/stdaln.c b/stdaln.c deleted file mode 100644 index cd064cf..0000000 --- a/stdaln.c +++ /dev/null @@ -1,1070 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, 2009, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#include -#include -#include -#include -#include "stdaln.h" - -/* char -> 17 (=16+1) nucleotides */ -unsigned char aln_nt16_table[256] = { - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, - 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, - 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, - 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 -}; -char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-"; - -/* char -> 5 (=4+1) nucleotides */ -unsigned char aln_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; -char *aln_nt4_rev_table = "AGCTN-"; - -/* char -> 22 (=20+1+1) amino acids */ -unsigned char aln_aa_table[256] = { - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, - 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, - 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, - 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21 -}; -char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-"; - /* 01234567890123456789012 */ - -/* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. */ -unsigned char aln_trans_table_eu[66] = { - 11,11, 2, 2, 1, 1,15,15, 16,16,16,16, 9,12, 9, 9, - 6, 6, 3, 3, 7, 7, 7, 7, 0, 0, 0, 0, 19,19,19,19, - 5, 5, 8, 8, 1, 1, 1, 1, 14,14,14,14, 10,10,10,10, - 20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22 -}; -char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX"; - /* 01234567890123456789012345678901234567890123456789012345678901234 */ -int aln_sm_blosum62[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, - -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, - -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, - -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, - 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, - -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, - -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, - 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, - -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, - -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, - -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, - -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, - -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, - -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, - -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, - -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, - -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, - 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, - -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 -}; - -int aln_sm_blosum45[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0, - -2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1, - -1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1, - -2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1, - -1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2, - -1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1, - -1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1, - 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1, - -2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1, - -1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1, - -1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1, - -1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1, - -1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1, - -2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1, - -1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0, - -2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2, - -2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1, - 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1, - -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1 -}; - -int aln_sm_nt[] = { -/* X A G R C M S V T W K D Y H B N */ - -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2, - -2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0, - -2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0, - -2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0, - -2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0, - -2, 1,-2,-1, 1, 1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0, - -2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0, - -2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, - -2,-2,-2,-2,-1,-2,-2,-2, 2, 1, 1, 0, 1, 0, 0, 0, - -2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0, - -2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0, - -2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0, - -2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -int aln_sm_read[] = { -/* X A G R C M S V T W K D Y H B N */ - -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, - -17, 2,-17, 1,-17, 1,-17, 0,-17, 1,-17, 0,-17, 0,-17, 0, - -17,-17, 2, 1,-17,-17, 1, 0,-17,-17, 1, 0,-17,-17, 0, 0, - -17, 1, 1, 1,-17,-17,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17,-17,-17,-17, 2, 1, 1, 0,-17,-17,-17,-17, 1, 0, 0, 0, - -17, 1,-17,-17, 1, 1,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17,-17, 1,-17, 1,-17, 1, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17, 0, 0, 0, 0, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, - -17,-17,-17,-17,-17,-17,-17,-17, 2, 1, 1, 0, 1, 0, 0, 0, - -17, 1,-17,-17,-17,-17,-17, 0, 1, 1,-17, 0,-17, 0, 0, 0, - -17,-17, 1,-17,-17,-17,-17, 0, 1,-17, 1, 0,-17, 0, 0, 0, - -17, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17,-17,-17,-17, 1,-17,-17, 0, 1,-17,-17, 0, 1, 0, 0, 0, - -17, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -int aln_sm_hs[] = { -/* A G C T N */ - 91, -31,-114,-123, -44, - -31, 100,-125,-114, -42, - -123,-125, 100, -31, -42, - -114,-114, -31, 91, -42, - -44, -42, -42, -42, -43 -}; - -int aln_sm_maq[] = { - 11, -19, -19, -19, -13, - -19, 11, -19, -19, -13, - -19, -19, 11, -19, -13, - -19, -19, -19, 11, -13, - -13, -13, -13, -13, -13 -}; - -int aln_sm_blast[] = { - 1, -3, -3, -3, -2, - -3, 1, -3, -3, -2, - -3, -3, 1, -3, -2, - -3, -3, -3, 1, -2, - -2, -2, -2, -2, -2 -}; - -/********************/ -/* START OF align.c */ -/********************/ - -AlnParam aln_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }; -AlnParam aln_param_bwa = { 26, 9, 5, aln_sm_maq, 5, 50 }; -AlnParam aln_param_nt2nt = { 8, 2, 2, aln_sm_nt, 16, 75 }; -AlnParam aln_param_rd2rd = { 1, 19, 19, aln_sm_read, 16, 75 }; -AlnParam aln_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; - -AlnAln *aln_init_AlnAln() -{ - AlnAln *aa; - aa = (AlnAln*)malloc(sizeof(AlnAln)); - aa->path = 0; - aa->out1 = aa->out2 = aa->outm = 0; - aa->path_len = 0; - return aa; -} -void aln_free_AlnAln(AlnAln *aa) -{ - free(aa->path); free(aa->cigar32); - free(aa->out1); free(aa->out2); free(aa->outm); - free(aa); -} - -/***************************/ -/* START OF common_align.c */ -/***************************/ - -#define LOCAL_OVERFLOW_THRESHOLD 32000 -#define LOCAL_OVERFLOW_REDUCE 16000 -#define NT_LOCAL_SCORE int -#define NT_LOCAL_SHIFT 16 -#define NT_LOCAL_MASK 0xffff - -#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; - -#define set_M(MM, cur, p, sc) \ -{ \ - if ((p)->M >= (p)->I) { \ - if ((p)->M >= (p)->D) { \ - (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } else { \ - if ((p)->I > (p)->D) { \ - (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } \ -} -#define set_I(II, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_ext; \ - } \ -} -#define set_end_I(II, cur, p) \ -{ \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_end; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_end; \ - } \ - } else set_I(II, cur, p); \ -} -#define set_D(DD, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_ext; \ - } \ -} -#define set_end_D(DD, cur, p) \ -{ \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_end; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_end; \ - } \ - } else set_D(DD, cur, p); \ -} - -typedef struct -{ - unsigned char Mt:3, It:2, Dt:2; -} dpcell_t; - -typedef struct -{ - int M, I, D; -} dpscore_t; - -/* build score profile for accelerating alignment, in theory */ -void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array) -{ - int *tmp, *tmp2, i, k; - for (i = 0; i != row; ++i) { - tmp = score_matrix + i * row; - tmp2 = s_array[i]; - for (k = 0; k != len; ++k) - tmp2[k] = tmp[seq[k]]; - } -} -/*************************** - * banded global alignment * - ***************************/ -int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len) -{ - register int i, j; - dpcell_t **dpcell, *q; - dpscore_t *curr, *last, *s; - path_t *p; - int b1, b2, tmp_end; - int *mat, end, max; - unsigned char type, ctype; - - int gap_open, gap_ext, gap_end, b; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - gap_end = ap->gap_end; - b = ap->band_width; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (len1 == 0 || len2 == 0) { - *path_len = 0; - return 0; - } - /* calculate b1 and b2 */ - if (len1 > len2) { - b1 = len1 - len2 + b; - b2 = b; - } else { - b1 = b; - b2 = len2 - len1 + b; - } - if (b1 > len1) b1 = len1; - if (b2 > len2) b2 = len2; - --seq1; --seq2; - - /* allocate memory */ - end = (b1 + b2 <= len1)? (b1 + b2 + 1) : (len1 + 1); - dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); - for (j = 0; j <= len2; ++j) - dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] -= j - b2; - curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - - /* set first row */ - SET_INF(*curr); curr->M = 0; - for (i = 1, s = curr + 1; i < b1; ++i, ++s) { - SET_INF(*s); - set_end_D(s->D, dpcell[0] + i, s - 1); - } - s = curr; curr = last; last = s; - - /* core dynamic programming, part 1 */ - tmp_end = (b2 < len2)? b2 : len2 - 1; - for (j = 1; j <= tmp_end; ++j) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - /* last row for part 1, use set_end_D() instead of set_D() */ - if (j == len2 && b2 != len2 - 1) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_end_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - ++j; - } - - /* core dynamic programming, part 2 */ - for (; j <= len2 - b2 + 1; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - end = j + b1 - 1; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - - /* core dynamic programming, part 3 */ - for (; j < len2; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - /* last row */ - if (j == len2) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - - /* backtrace */ - i = len1; j = len2; - q = dpcell[j] + i; - s = last + len1; - max = s->M; type = q->Mt; ctype = FROM_M; - if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } - if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } - - p = path; - p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ - ++p; - do { - switch (ctype) { - case FROM_M: --i; --j; break; - case FROM_I: --j; break; - case FROM_D: --i; break; - } - q = dpcell[j] + i; - ctype = type; - switch (type) { - case FROM_M: type = q->Mt; break; - case FROM_I: type = q->It; break; - case FROM_D: type = q->Dt; break; - } - p->ctype = ctype; p->i = i; p->j = j; - ++p; - } while (i || j); - *path_len = p - path - 1; - - /* free memory */ - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] += j - b2; - for (j = 0; j <= len2; ++j) - free(dpcell[j]); - free(dpcell); - free(curr); free(last); - - return max; -} -/************************************************* - * local alignment combined with banded strategy * - *************************************************/ -int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int _thres, int *_subo) -{ - register NT_LOCAL_SCORE *s; - register int i; - int q, r, qr, tmp_len, qr_shift; - int **s_array, *score_array; - int e, f; - int is_overflow, of_base; - NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h; - int j, start_i, start_j, end_i, end_j; - path_t *p; - int score_f, score_r, score_g; - int start, end, max_score; - int thres, *suba, *ss; - - int gap_open, gap_ext; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - thres = _thres > 0? _thres : -_thres; - - if (len1 == 0 || len2 == 0) return -1; - - /* allocate memory */ - suba = (int*)malloc(sizeof(int) * (len2 + 1)); - eh = (NT_LOCAL_SCORE*)malloc(sizeof(NT_LOCAL_SCORE) * (len1 + 1)); - s_array = (int**)malloc(sizeof(int*) * N_MATRIX_ROW); - for (i = 0; i != N_MATRIX_ROW; ++i) - s_array[i] = (int*)malloc(sizeof(int) * len1); - /* initialization */ - aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - q = gap_open; - r = gap_ext; - qr = q + r; - qr_shift = (qr+1) << NT_LOCAL_SHIFT; - tmp_len = len1 + 1; - start_i = start_j = end_i = end_j = 0; - for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i) - if (max_score < score_matrix[i]) max_score = score_matrix[i]; - /* convert the coordinate */ - --seq1; --seq2; - for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; - - /* forward dynamic programming */ - for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0; - score_f = 0; - is_overflow = of_base = 0; - suba[0] = 0; - for (j = 1, ss = suba + 1; j <= len2; ++j, ++ss) { - int subo = 0; - last_h = f = 0; - score_array = s_array[seq2[j]]; - if (is_overflow) { /* adjust eh[] array if overflow occurs. */ - /* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed. - * If it is too large, this block will be excuted frequently and therefore - * slow down the whole program. - * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the - * number of assignments because it sets some cells to zero when overflow - * happens. */ - int tmp, tmp2; - score_f -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = 1, s = eh; i <= tmp_len; ++i, ++s) { - tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << NT_LOCAL_SHIFT) | tmp2; - } - } - for (i = 1, s = eh; i != tmp_len; ++i, ++s) { - /* prepare for calculate current h */ - curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; - if (curr_h < 0) curr_h = 0; - if (last_h > 0) { /* initialize f */ - f = (f > last_h - q)? f - r : last_h - qr; - if (curr_h < f) curr_h = f; - } - if (*(s+1) >= qr_shift) { /* initialize e */ - curr_last_h = *(s+1) >> NT_LOCAL_SHIFT; - e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; - if (curr_h < e) curr_h = e; - *s = (last_h << NT_LOCAL_SHIFT) | e; - } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */ - last_h = curr_h; - if (subo < curr_h) subo = curr_h; - if (score_f < curr_h) { - score_f = curr_h; end_i = i; end_j = j; - if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - *s = last_h << NT_LOCAL_SHIFT; - *ss = subo + of_base; - } - score_f += of_base; - - if (score_f < thres) { /* no matching residue at all, 090218 */ - if (path_len) *path_len = 0; - goto end_func; - } - if (path == 0) goto end_func; /* skip path-filling */ - - /* reverse dynamic programming */ - for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0; - if (end_i == 0 || end_j == 0) goto end_func; /* no local match */ - score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]]; - is_overflow = of_base = 0; - start_i = end_i; start_j = end_j; - eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */ - start = end_i - 1; - end = end_i - 3; - if (end <= 0) end = 0; - - /* second pass DP can be done in a band, speed will thus be enhanced */ - for (j = end_j - 1; j != 0; --j) { - last_h = f = 0; - score_array = s_array[seq2[j]]; - if (is_overflow) { /* adjust eh[] array if overflow occurs. */ - int tmp, tmp2; - score_r -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = start, s = eh + start + 1; i >= end; --i, --s) { - tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << NT_LOCAL_SHIFT) | tmp2; - } - } - for (i = start, s = eh + start + 1; i != end; --i, --s) { - /* prepare for calculate current h */ - curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; - if (curr_h < 0) curr_h = 0; - if (last_h > 0) { /* initialize f */ - f = (f > last_h - q)? f - r : last_h - qr; - if (curr_h < f) curr_h = f; - } - curr_last_h = *(s-1) >> NT_LOCAL_SHIFT; - e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; - if (e < 0) e = 0; - if (curr_h < e) curr_h = e; - *s = (last_h << NT_LOCAL_SHIFT) | e; - last_h = curr_h; - if (score_r < curr_h) { - score_r = curr_h; start_i = i; start_j = j; - if (score_r + of_base - qr == score_f) { - j = 1; break; - } - if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - *s = last_h << NT_LOCAL_SHIFT; - /* recalculate start and end, the boundaries of the band */ - if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start; - if (start <= 0) start = 0; - end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1; - if (end <= 0) end = 0; - } - - if (_subo) { - int tmp2 = 0, tmp = (int)(start_j - .33 * (end_j - start_j) + .499); - for (j = 1; j <= tmp; ++j) - if (tmp2 < suba[j]) tmp2 = suba[j]; - tmp = (int)(end_j + .33 * (end_j - start_j) + .499); - for (j = tmp; j <= len2; ++j) - if (tmp2 < suba[j]) tmp2 = suba[j]; - *_subo = tmp2; - } - - if (path_len == 0) { - path[0].i = start_i; path[0].j = start_j; - path[1].i = end_i; path[1].j = end_j; - goto end_func; - } - - score_r += of_base; - score_r -= qr; - -#ifdef DEBUG - /* this seems not a bug */ - if (score_f != score_r) - fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r); -#endif - - if (_thres > 0) { /* call global alignment to fill the path */ - score_g = 0; - j = (end_i - start_i > end_j - start_j)? end_i - start_i : end_j - start_j; - ++j; /* j is the maximum band_width */ - for (i = ap->band_width;; i <<= 1) { - AlnParam ap_real = *ap; - ap_real.gap_end = -1; - ap_real.band_width = i; - score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j, - end_j - start_j + 1, &ap_real, path, path_len); - if (score_g == score_r || score_f == score_g) break; - if (i > j) break; - } - if (score_r > score_g && score_f > score_g) { - fprintf(stderr, "[aln_local_core] Potential bug: (%d,%d) > %d\n", score_f, score_r, score_g); - score_f = score_r = -1; - } else score_f = score_g; - - /* convert coordinate */ - for (p = path + *path_len - 1; p >= path; --p) { - p->i += start_i - 1; - p->j += start_j - 1; - } - } else { /* just store the start and end */ - *path_len = 2; - path[1].i = start_i; path[1].j = start_j; - path->i = end_i; path->j = end_j; - } - -end_func: - /* free */ - free(eh); free(suba); - for (i = 0; i != N_MATRIX_ROW; ++i) { - ++s_array[i]; - free(s_array[i]); - } - free(s_array); - return score_f; -} -AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, - int type, int thres, int len1, int len2) -{ - unsigned char *seq11, *seq22; - int score; - int i, j, l; - path_t *p; - char *out1, *out2, *outm; - AlnAln *aa; - - if (len1 < 0) len1 = strlen(seq1); - if (len2 < 0) len2 = strlen(seq2); - - aa = aln_init_AlnAln(); - seq11 = (unsigned char*)malloc(sizeof(unsigned char) * len1); - seq22 = (unsigned char*)malloc(sizeof(unsigned char) * len2); - aa->path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 1)); - - if (ap->row < 10) { /* 4-nucleotide alignment */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_nt4_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_nt4_table[(int)seq2[j]]; - } else if (ap->row < 20) { /* 16-nucleotide alignment */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_nt16_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_nt16_table[(int)seq2[j]]; - } else { /* amino acids */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_aa_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_aa_table[(int)seq2[j]]; - } - - if (type == ALN_TYPE_GLOBAL) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len); - else if (type == ALN_TYPE_LOCAL) score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, thres, &aa->subo); - else if (type == ALN_TYPE_EXTEND) score = aln_extend_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, 1, 0); - else { - free(seq11); free(seq22); free(aa->path); - aln_free_AlnAln(aa); - return 0; - } - aa->score = score; - - if (thres > 0) { - out1 = aa->out1 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - out2 = aa->out2 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - outm = aa->outm = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - - --seq1; --seq2; - --seq11; --seq22; - - p = aa->path + aa->path_len - 1; - - for (l = 0; p >= aa->path; --p, ++l) { - switch (p->ctype) { - case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j]; - outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' '; - break; - case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break; - case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break; - } - } - out1[l] = out2[l] = outm[l] = '\0'; - ++seq11; ++seq22; - } - - free(seq11); - free(seq22); - - p = aa->path + aa->path_len - 1; - aa->start1 = p->i? p->i : 1; - aa->end1 = aa->path->i; - aa->start2 = p->j? p->j : 1; - aa->end2 = aa->path->j; - aa->cigar32 = aln_path2cigar32(aa->path, aa->path_len, &aa->n_cigar); - - return aa; -} -AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int thres) -{ - return aln_stdaln_aux(seq1, seq2, ap, type, thres, -1, -1); -} - -/* for backward compatibility */ -uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) -{ - uint32_t *cigar32; - uint16_t *cigar; - int i; - cigar32 = aln_path2cigar32(path, path_len, n_cigar); - cigar = (uint16_t*)cigar32; - for (i = 0; i < *n_cigar; ++i) - cigar[i] = (cigar32[i]&0xf)<<14 | (cigar32[i]>>4&0x3fff); - return cigar; -} - -/* newly added functions (2009-07-21) */ - -int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int G0, uint8_t *_mem) -{ - int q, r, qr; - int32_t **s_array, *score_array; - int is_overflow, of_base; - uint32_t *eh; - int i, j, end_i, end_j; - int score, start, end; - int *score_matrix, N_MATRIX_ROW; - uint8_t *mem, *_p; - - /* initialize some align-related parameters. just for compatibility */ - q = ap->gap_open; - r = ap->gap_ext; - qr = q + r; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (len1 == 0 || len2 == 0) return -1; - - /* allocate memory */ - mem = _mem? _mem : calloc((len1 + 2) * (N_MATRIX_ROW + 1), 4); - _p = mem; - eh = (uint32_t*)_p, _p += 4 * (len1 + 2); - s_array = calloc(N_MATRIX_ROW, sizeof(void*)); - for (i = 0; i != N_MATRIX_ROW; ++i) - s_array[i] = (int32_t*)_p, _p += 4 * len1; - /* initialization */ - aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - start = 1; end = 2; - end_i = end_j = 0; - score = 0; - is_overflow = of_base = 0; - /* convert the coordinate */ - --seq1; --seq2; - for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; - - /* dynamic programming */ - memset(eh, 0, 4 * (len1 + 2)); - eh[1] = (uint32_t)G0<<16; - for (j = 1; j <= len2; ++j) { - int _start, _end; - int h1 = 0, f = 0; - score_array = s_array[seq2[j]]; - /* set start and end */ - _start = j - ap->band_width; - if (_start < 1) _start = 1; - if (_start > start) start = _start; - _end = j + ap->band_width; - if (_end > len1 + 1) _end = len1 + 1; - if (_end < end) end = _end; - if (start == end) break; - /* adjust eh[] array if overflow occurs. */ - if (is_overflow) { - int tmp, tmp2; - score -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = start; i <= end; ++i) { - uint32_t *s = &eh[i]; - tmp = *s >> 16; tmp2 = *s & 0xffff; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << 16) | tmp2; - } - } - _start = _end = 0; - /* the inner loop */ - for (i = start; i < end; ++i) { - /* At the beginning of each cycle: - eh[i] -> h[j-1,i-1]<<16 | e[j,i] - f -> f[j,i] - h1 -> h[j,i-1] - */ - uint32_t *s = &eh[i]; - int h = (int)(*s >> 16); - int e = *s & 0xffff; /* this is e[j,i] */ - *s = (uint32_t)h1 << 16; /* eh[i] now stores h[j,i-1]<<16 */ - h += h? score_array[i] : 0; /* this is left_core() specific */ - /* calculate h[j,i]; don't need to test 0, as {e,f}>=0 */ - h = h > e? h : e; - h = h > f? h : f; /* h now is h[j,i] */ - h1 = h; - if (h > 0) { - if (_start == 0) _start = i; - _end = i; - if (score < h) { - score = h; end_i = i; end_j = j; - if (score > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - /* calculate e[j+1,i] and f[j,i+1] */ - h -= qr; - h = h > 0? h : 0; - e -= r; - e = e > h? e : h; - f -= r; - f = f > h? f : h; - *s |= e; - } - eh[end] = h1 << 16; - /* recalculate start and end, the boundaries of the band */ - if (_end <= 0) break; /* no cell in this row has a positive score */ - start = _start; - end = _end + 3; - } - - score += of_base - 1; - if (score <= 0) { - if (path_len) *path_len = 0; - goto end_left_func; - } - - if (path == 0) goto end_left_func; - - if (path_len == 0) { - path[0].i = end_i; path[0].j = end_j; - goto end_left_func; - } - - { /* call global alignment to fill the path */ - int score_g = 0; - j = (end_i - 1 > end_j - 1)? end_i - 1 : end_j - 1; - ++j; /* j is the maximum band_width */ - for (i = ap->band_width;; i <<= 1) { - AlnParam ap_real = *ap; - ap_real.gap_end = -1; - ap_real.band_width = i; - score_g = aln_global_core(seq1 + 1, end_i, seq2 + 1, end_j, &ap_real, path, path_len); - if (score == score_g) break; - if (i > j) break; - } - if (score > score_g) - fprintf(stderr, "[aln_left_core] no suitable bandwidth: %d < %d\n", score_g, score); - score = score_g; - } - -end_left_func: - /* free */ - free(s_array); - if (!_mem) free(mem); - return score; -} - -uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar) -{ - int i, n; - uint32_t *cigar; - unsigned char last_type; - - if (path_len == 0 || path == 0) { - *n_cigar = 0; - return 0; - } - - last_type = path->ctype; - for (i = n = 1; i < path_len; ++i) { - if (last_type != path[i].ctype) ++n; - last_type = path[i].ctype; - } - *n_cigar = n; - cigar = (uint32_t*)malloc(*n_cigar * 4); - - cigar[0] = 1u << 4 | path[path_len-1].ctype; - last_type = path[path_len-1].ctype; - for (i = path_len - 2, n = 0; i >= 0; --i) { - if (path[i].ctype == last_type) cigar[n] += 1u << 4; - else { - cigar[++n] = 1u << 4 | path[i].ctype; - last_type = path[i].ctype; - } - } - - return cigar; -} - -#ifdef STDALN_MAIN -int main() -{ - AlnAln *aln_local, *aln_global, *aln_left; - int i; - - aln_local = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCgCTCGGCTAGCTGT", &aln_param_blast, 0, 1); - aln_global = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 1, 1); -// aln_left = aln_stdaln( "GATGCACTGCATACGGCTCGCCTAGATCA", "GATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 2, 1); - aln_left = aln_stdaln("CACCTTCGACTCACGTCTCATTCTCGGAGTCGAGTGGACGGTCCCTCATACACGAACAGGTTC", - "CACCTTCGACTTTCACCTCTCATTCTCGGACTCGAGTGGACGGTCCCTCATCCAAGAACAGGGTCTGTGAAA", &aln_param_blast, 2, 1); - - printf(">%d,%d\t%d,%d\n", aln_local->start1, aln_local->end1, aln_local->start2, aln_local->end2); - printf("%s\n%s\n%s\n", aln_local->out1, aln_local->outm, aln_local->out2); - - printf(">%d,%d\t%d,%d\t", aln_global->start1, aln_global->end1, aln_global->start2, aln_global->end2); - for (i = 0; i != aln_global->n_cigar; ++i) - printf("%d%c", aln_global->cigar32[i]>>4, "MID"[aln_global->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aln_global->out1, aln_global->outm, aln_global->out2); - - printf(">%d\t%d,%d\t%d,%d\t", aln_left->score, aln_left->start1, aln_left->end1, aln_left->start2, aln_left->end2); - for (i = 0; i != aln_left->n_cigar; ++i) - printf("%d%c", aln_left->cigar32[i]>>4, "MID"[aln_left->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aln_left->out1, aln_left->outm, aln_left->out2); - - aln_free_AlnAln(aln_local); - aln_free_AlnAln(aln_global); - aln_free_AlnAln(aln_left); - return 0; -} -#endif diff --git a/stdaln.h b/stdaln.h deleted file mode 100644 index f0048b3..0000000 --- a/stdaln.h +++ /dev/null @@ -1,162 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - 2009-07-23, 0.10.0 - - - Use 32-bit to store CIGAR - - - Report suboptimal aligments - - - Implemented half-fixed-half-open DP - - 2009-04-26, 0.9.10 - - - Allow to set a threshold for local alignment - - 2009-02-18, 0.9.9 - - - Fixed a bug when no residue matches - - 2008-08-04, 0.9.8 - - - Fixed the wrong declaration of aln_stdaln_aux() - - - Avoid 0 coordinate for global alignment - - 2008-08-01, 0.9.7 - - - Change gap_end penalty to 5 in aln_param_bwa - - - Add function to convert path_t to the CIGAR format - - 2008-08-01, 0.9.6 - - - The first gap now costs (gap_open+gap_ext), instead of - gap_open. Scoring systems are modified accordingly. - - - Gap end is now correctly handled. Previously it is not correct. - - - Change license to MIT. - - */ - -#ifndef LH3_STDALN_H_ -#define LH3_STDALN_H_ - - -#define STDALN_VERSION 0.11.0 - -#include - -#define FROM_M 0 -#define FROM_I 1 -#define FROM_D 2 -#define FROM_S 3 - -#define ALN_TYPE_LOCAL 0 -#define ALN_TYPE_GLOBAL 1 -#define ALN_TYPE_EXTEND 2 - -/* This is the smallest integer. It might be CPU-dependent in very RARE cases. */ -#define MINOR_INF -1073741823 - -typedef struct -{ - int gap_open; - int gap_ext; - int gap_end; - - int *matrix; - int row; - int band_width; -} AlnParam; - -typedef struct -{ - int i, j; - unsigned char ctype; -} path_t; - -typedef struct -{ - path_t *path; /* for advanced users... :-) */ - int path_len; /* for advanced users... :-) */ - int start1, end1; /* start and end of the first sequence, coordinations are 1-based */ - int start2, end2; /* start and end of the second sequence, coordinations are 1-based */ - int score, subo; /* score */ - - char *out1, *out2; /* print them, and then you will know */ - char *outm; - - int n_cigar; - uint32_t *cigar32; -} AlnAln; - -#ifdef __cplusplus -extern "C" { -#endif - - AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, - int type, int do_align, int len1, int len2); - AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int do_align); - void aln_free_AlnAln(AlnAln *aa); - - int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len); - int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int _thres, int *_subo); - int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int G0, uint8_t *_mem); - uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar); - uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar); - -#ifdef __cplusplus -} -#endif - -/******************** - * global variables * - ********************/ - -extern AlnParam aln_param_bwa; /* = { 37, 9, 0, aln_sm_maq, 5, 50 }; */ -extern AlnParam aln_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ -extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */ -extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; */ -extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; */ - -/* common nucleotide score matrix for 16 bases */ -extern int aln_sm_nt[], aln_sm_bwa[]; - -/* BLOSUM62 and BLOSUM45 */ -extern int aln_sm_blosum62[], aln_sm_blosum45[]; - -/* common read for 16 bases. note that read alignment is quite different from common nucleotide alignment */ -extern int aln_sm_read[]; - -/* human-mouse score matrix for 4 bases */ -extern int aln_sm_hs[]; - -#endif