diff --git a/.gitignore b/.gitignore index 16d123a..57cb318 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.[oa] bwa test +test64 .*.swp diff --git a/Makefile b/Makefile index b8fa824..70b89b0 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,11 @@ CC= gcc -CXX= g++ -CFLAGS= -g -Wall -O2 +CFLAGS= -g -Wall -O2 -msse2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \ - bwaseqio.o bwase.o kstring.o -AOBJS= QSufSort.o bwt_gen.o \ - is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ - bwape.o cs2nt.o \ +LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o +AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ + is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa @@ -26,7 +23,7 @@ SUBDIRS= . all:$(PROG) bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) @@ -34,35 +31,40 @@ libbwa.a:$(LOBJS) clean: rm -f gmon.out *.o a.out $(PROG) *~ *.a +depend: + ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) -- *.c ) + +# DO NOT DELETE THIS LINE -- make depend depends on it. 
+ QSufSort.o: QSufSort.h -bamlite.o: bamlite.h utils.h -bntseq.o: bntseq.h kseq.h main.h utils.h -bwa.o: bntseq.h bwa.h bwt.h bwtaln.h bwtgap.h stdaln.h utils.h -bwape.o: bntseq.h bwase.h bwt.h bwtaln.h khash.h ksort.h kvec.h stdaln.h -bwape.o: utils.h -bwase.o: bntseq.h bwase.h bwt.h bwtaln.h kstring.h stdaln.h utils.h -bwaseqio.o: bamlite.h bwt.h bwtaln.h kseq.h stdaln.h utils.h -bwt.o: bwt.h kvec.h utils.h +bamlite.o: utils.h bamlite.h +bntseq.o: bntseq.h main.h utils.h kseq.h +bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kseq.h +bwamem.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h +bwamem.o: ksort.h kbtree.h +bwamem_pair.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h kvec.h ksw.h +bwape.o: bwtaln.h bwt.h stdaln.h kvec.h bntseq.h utils.h bwase.h bwa.h +bwape.o: khash.h +bwase.o: stdaln.h bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h +bwaseqio.o: bwtaln.h bwt.h stdaln.h utils.h bamlite.h kseq.h +bwt.o: utils.h bwt.h kvec.h bwt_gen.o: QSufSort.h utils.h bwt_lite.o: bwt_lite.h utils.h -bwtaln.o: bwt.h bwtaln.h bwtgap.h stdaln.h utils.h -bwtgap.o: bwt.h bwtaln.h bwtgap.h stdaln.h utils.h +bwtaln.o: bwtaln.h bwt.h stdaln.h bwtgap.h utils.h bwa.h bntseq.h +bwtgap.o: bwtgap.h bwt.h bwtaln.h stdaln.h utils.h bwtindex.o: bntseq.h bwt.h main.h utils.h -bwtio.o: bwt.h utils.h -bwtmisc.o: bntseq.h bwt.h main.h utils.h -bwtsw2_aux.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h kseq.h ksort.h kstring.h -bwtsw2_aux.o: stdaln.h utils.h -bwtsw2_chain.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h ksort.h utils.h -bwtsw2_core.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h khash.h ksort.h kvec.h -bwtsw2_core.o: utils.h -bwtsw2_main.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h utils.h -bwtsw2_pair.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h kstring.h ksw.h utils.h -cs2nt.o: bwt.h bwtaln.h stdaln.h utils.h -fastmap.o: bntseq.h bwt.h kseq.h kvec.h utils.h +bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h stdaln.h kstring.h +bwtsw2_aux.o: bwa.h kseq.h ksort.h +bwtsw2_chain.o: bwtsw2.h 
bntseq.h bwt_lite.h bwt.h utils.h ksort.h +bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h utils.h khash.h +bwtsw2_core.o: ksort.h +bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h +bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h ksw.h +fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h utils.h kseq.h is.o: utils.h +kopen.o: utils.h kstring.o: kstring.h utils.h ksw.o: ksw.h utils.h main.o: main.h utils.h -simple_dp.o: kseq.h stdaln.h utils.h stdaln.o: stdaln.h utils.h -utils.o: utils.h +utils.o: utils.h ksort.h kseq.h diff --git a/QSufSort.c b/QSufSort.c index e437ac3..36c5a51 100644 --- a/QSufSort.c +++ b/QSufSort.c @@ -59,12 +59,9 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin qsint_t i, j; qsint_t s, negatedSortedGroupLength; qsint_t numSymbolAggregated; - qsint_t maxNumInputSymbol; qsint_t numSortedPos = 1; qsint_t newAlphabetSize; - maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; - if (!skipTransform) { /* bucketing possible*/ newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, diff --git a/bntseq.c b/bntseq.c index af83211..624b4dd 100644 --- a/bntseq.c +++ b/bntseq.c @@ -36,7 +36,7 @@ #include "utils.h" #include "kseq.h" -KSEQ_INIT(gzFile, err_gzread) +KSEQ_DECLARE(gzFile) unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -310,21 +310,26 @@ int bwa_fa2pac(int argc, char *argv[]) return 0; } +int bns_pos2rid(const bntseq_t *bns, int64_t pos_f) +{ + int left, mid, right; + if (pos_f >= bns->l_pac) return -1; + left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { // binary search + mid = (left + right) >> 1; + if (pos_f >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pos_f < bns->anns[mid+1].offset) break; // bracketed + left = mid + 1; + } else right = mid; + } + return mid; +} + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) { 
int left, mid, right, nn; - if (ref_id) { - left = 0; mid = 0; right = bns->n_seqs; - while (left < right) { - mid = (left + right) >> 1; - if (pos_f >= bns->anns[mid].offset) { - if (mid == bns->n_seqs - 1) break; - if (pos_f < bns->anns[mid+1].offset) break; // bracketed - left = mid + 1; - } else right = mid; - } - *ref_id = mid; - } + if (ref_id) *ref_id = bns_pos2rid(bns, pos_f); left = 0; right = bns->n_holes; nn = 0; while (left < right) { mid = (left + right) >> 1; @@ -343,3 +348,26 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) } return nn; } + +uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) +{ + uint8_t *seq = 0; + if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap + if (end > l_pac<<1) end = l_pac<<1; + if (beg < 0) beg = 0; + if (beg >= l_pac || end <= l_pac) { + int64_t k, l = 0; + *len = end - beg; + seq = xmalloc(end - beg); + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + seq[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + seq[l++] = _get_pac(pac, k); + } + } else *len = 0; // if bridging the forward-reverse boundary, return nothing + return seq; +} diff --git a/bntseq.h b/bntseq.h index 843db64..4061438 100644 --- a/bntseq.h +++ b/bntseq.h @@ -29,6 +29,7 @@ #define BWT_BNTSEQ_H #include +#include #include #ifndef BWA_UBYTE @@ -71,7 +72,9 @@ extern "C" { bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); + int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); + uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); #ifdef 
__cplusplus } diff --git a/bwa.c b/bwa.c index 0b6a420..55a0a9b 100644 --- a/bwa.c +++ b/bwa.c @@ -1,274 +1,313 @@ - -#include #include #include -#include -#include "utils.h" -#include "bwa.h" -#include "bwt.h" -#include "bwtgap.h" +#include +#include #include "bntseq.h" +#include "bwa.h" +#include "ksw.h" +#include "utils.h" -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif +int bwa_verbose = 3; +char bwa_rg_id[256]; -extern unsigned char nst_nt4_table[256]; -extern void seq_reverse(int len, uint8_t *seq, int is_comp); +/************************ + * Batch FASTA/Q reader * + ************************/ -bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 }; +#include "kseq.h" +KSEQ_DECLARE(gzFile) -struct bwa_idx_t { +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = xstrdup(ks->name.s); + s->comment = ks->comment.l? xstrdup(ks->comment.s) : 0; + s->seq = xstrdup(ks->seq.s); + s->qual = ks->qual.l? xstrdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? 
m<<1 : 256; + seqs = xrealloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n]); + size += seqs[n++].l_seq; + } + if (size >= chunk_size) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; + return seqs; +} + +/***************** + * CIGAR related * + *****************/ + +// Generate CIGAR when the alignment end points are known +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) +{ + uint32_t *cigar = 0; + uint8_t tmp, *rseq; + int i, w; + int64_t rlen; + *n_cigar = 0; *NM = -1; + if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range + if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + for (i = 0; i < rlen>>1; ++i) + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; + } + //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + // set the band-width + w = (int)((double)(l_query * mat[0] - q) / r + 1.); + w = w < 1? w : 1; + w = w < w_? 
w : w_; + w += abs(rlen - l_query); + // NW alignment + *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + {// compute NM + int k, x, y, n_mm = 0, n_gap = 0; + for (k = 0, x = y = 0; k < *n_cigar; ++k) { + int op = cigar[k]&0xf; + int len = cigar[k]>>4; + if (op == 0) { // match + for (i = 0; i < len; ++i) + if (query[x + i] != rseq[y + i]) ++n_mm; + x += len; y += len; + } else if (op == 1) x += len, n_gap += len; + else if (op == 2) y += len, n_gap += len; + } + *NM = n_mm + n_gap; + } + if (rb >= l_pac) // reverse back query + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + +ret_gen_cigar: + free(rseq); + return cigar; +} + +int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) +{ + int ib, ie, is_rev; + int64_t fb, fe, mid = -1; + if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary + *qb = *qe = *rb = *re = -1; + return -1; // unable to fix + } else { + fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev); + ib = bns_pos2rid(bns, fb); + if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix + fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev); + ie = bns_pos2rid(bns, fe); + if (ie - ib > 1) { // bridge three or more references + *qb = *qe = *rb = *re = -1; + return -2; // unable to fix + } else { + int l = bns->anns[ib].offset + bns->anns[ib].len - fb; + mid = is_rev? 
*re - l : *rb + l; + } + } + if (mid >= 0) { + int i, score, n_cigar, y, NM; + uint32_t *cigar; + int64_t x; + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); + for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { + int op = cigar[i]&0xf, len = cigar[i]>>4; + if (op == 0) { + if (x <= mid && mid < x + len) { + if (mid - *rb > *re - mid) { // the first part is longer + if (x == mid) { // need to check the previous operation + assert(i); // mid != *rb should always stand + if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x; + else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4); + else abort(); // should not be here + } else *qe = y + (mid - x), *re = mid; + } else *qb = y + (mid - x), *rb = mid; + break; + } else x += len, y += len; + } else if (op == 1) { // insertion + y += len; + } else if (op == 2) { // deletion + if (x <= mid && mid < x + len) { + if (mid - *rb > *re - mid) *qe = y, *re = x; + else *qb = y, *rb = x + len; + break; + } else x += len; + } else abort(); // should not be here + } + free(cigar); + } + return 1; +} + +/********************* + * Full index reader * + *********************/ + +char *bwa_idx_infer_prefix(const char *hint) +{ + char *prefix; + int l_hint; + FILE *fp; + l_hint = strlen(hint); + prefix = xmalloc(l_hint + 3 + 4 + 1); + strcpy(prefix, hint); + strcpy(prefix + l_hint, ".64.bwt"); + if ((fp = fopen(prefix, "rb")) != 0) { + fclose(fp); + prefix[l_hint + 3] = 0; + return prefix; + } else { + strcpy(prefix + l_hint, ".bwt"); + if ((fp = fopen(prefix, "rb")) == 0) { + free(prefix); + return 0; + } else { + fclose(fp); + prefix[l_hint] = 0; + return prefix; + } + } +} + +bwt_t *bwa_idx_load_bwt(const char *hint) +{ + char *tmp, *prefix; bwt_t *bwt; - bntseq_t *bns; - uint8_t *pac; -}; - -struct bwa_buf_t { - int max_buf; - bwa_pestat_t pes; - gap_stack_t *stack; - gap_opt_t *opt; - int *diff_tab; - uint8_t *buf; - int *logn; -}; - -bwa_idx_t 
*bwa_idx_load(const char *prefix) -{ - bwa_idx_t *p; - int l; - char *str; - l = strlen(prefix); - p = xcalloc(1, sizeof(bwa_idx_t)); - str = xmalloc(l + 10); - strcpy(str, prefix); - p->bns = bns_restore(str); - strcpy(str + l, ".bwt"); - p->bwt = bwt_restore_bwt(str); - str[l] = 0; - strcpy(str + l, ".sa"); - bwt_restore_sa(str, p->bwt); - free(str); - p->pac = xcalloc(p->bns->l_pac/4+1, 1); - err_fread_noeof(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac); - err_fclose(p->bns->fp_pac); - p->bns->fp_pac = 0; - return p; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } + tmp = xcalloc(strlen(prefix) + 5, 1); + strcat(strcpy(tmp, prefix), ".bwt"); // FM-index + bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) + bwt_restore_sa(tmp, bwt); + free(tmp); free(prefix); + return bwt; } -void bwa_idx_destroy(bwa_idx_t *p) +bwaidx_t *bwa_idx_load(const char *hint, int which) { - bns_destroy(p->bns); - bwt_destroy(p->bwt); - free(p->pac); - free(p); + bwaidx_t *idx; + char *prefix; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } + idx = xcalloc(1, sizeof(bwaidx_t)); + if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); + if (which & BWA_IDX_BNS) { + idx->bns = bns_restore(prefix); + if (which & BWA_IDX_PAC) { + idx->pac = xcalloc(idx->bns->l_pac/4+1, 1); + err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence + err_fclose(idx->bns->fp_pac); + idx->bns->fp_pac = 0; + } + } + free(prefix); + return idx; } -bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score) +void bwa_idx_destroy(bwaidx_t *idx) +{ + if (idx == 0) return; + if (idx->bwt) bwt_destroy(idx->bwt); + if (idx->bns) bns_destroy(idx->bns); + if (idx->pac) 
free(idx->pac); + free(idx); +} + +/*********************** + * SAM header routines * + ***********************/ + +void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) { - extern gap_opt_t *gap_init_opt(void); - extern int bwa_cal_maxdiff(int l, double err, double thres); int i; - bwa_buf_t *p; - p = xmalloc(sizeof(bwa_buf_t)); - p->stack = gap_init_stack2(max_score); - p->opt = gap_init_opt(); - p->opt->s_gapo = opt->s_gapo; - p->opt->s_gape = opt->s_gape; - p->opt->max_diff = opt->max_diff; - p->opt->max_gapo = opt->max_gapo; - p->opt->max_gape = opt->max_gape; - p->opt->seed_len = opt->seed_len; - p->opt->max_seed_diff = opt->max_seed_diff; - p->opt->fnr = opt->fnr; - p->diff_tab = xcalloc(BWA_MAX_QUERY_LEN, sizeof(int)); - for (i = 1; i < BWA_MAX_QUERY_LEN; ++i) - p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); - p->logn = xcalloc(256, sizeof(int)); - for (i = 1; i != 256; ++i) - p->logn[i] = (int)(4.343 * log(i) + 0.499); - return p; + for (i = 0; i < bns->n_seqs; ++i) + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + if (rg_line) err_printf("%s\n", rg_line); } -void bwa_buf_destroy(bwa_buf_t *p) +static char *bwa_escape(char *s) { - gap_destroy_stack(p->stack); - free(p->diff_tab); free(p->logn); free(p->opt); - free(p); -} - -bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq) -{ - extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width); - int i, seq_len, buf_len; - bwt_width_t *w, *seed_w; - uint8_t *s; - gap_opt_t opt2 = *buf->opt; - bwa_sai_t sai; - - seq_len = strlen(seq); - // estimate the buffer length - buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; - if (buf_len > buf->max_buf) { - buf->max_buf = buf_len; - kroundup32(buf->max_buf); - buf->buf = xrealloc(buf->buf, buf->max_buf); + char *p, *q; + for (p = q = s; *p; ++p) { + if (*p == '\\') { + ++p; + if (*p == 't') *q++ = '\t'; + else if (*p == 'n') *q++ = '\n'; + 
else if (*p == 'r') *q++ = '\r'; + else if (*p == '\\') *q++ = '\\'; + } else *q++ = *p; } - memset(buf->buf, 0, buf_len); - seed_w = (bwt_width_t*)buf->buf; - w = seed_w + buf->opt->seed_len; - s = (uint8_t*)(w + seq_len + 1); - if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len]; - // copy the sequence - for (i = 0; i < seq_len; ++i) - s[i] = nst_nt4_table[(int)seq[i]]; - seq_reverse(seq_len, s, 0); - // mapping - bwt_cal_width(idx->bwt, seq_len, s, w); - if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff; - if (seq_len > buf->opt->seed_len) - bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w); - for (i = 0; i < seq_len; ++i) // complement; I forgot why... - s[i] = s[i] > 3? 4 : 3 - s[i]; - sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack); - return sai; + *q = '\0'; + return s; } -static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps) +char *bwa_set_rg(const char *s) { - uint64_t x = pos, z; - int k, y = 0; - *n_mm = *n_gaps = 0; - for (k = 0; k < n_cigar; ++k) { - int l = cigar[k]>>4; - int op = cigar[k]&0xf; - if (op == 0) { // match/mismatch - for (z = 0; z < l && x + z < l_pac; ++z) { - int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; - if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm); - } - } - if (op == 1 || op == 2) (*n_gaps) += l; - if (op == 0 || op == 2) x += l; - if (op == 0 || op == 1 || op == 4) y += l; + char *p, *q, *r, *rg_line = 0; + memset(bwa_rg_id, 0, 256); + if (strstr(s, "@RG") != s) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__); + goto err_set_rg; } + rg_line = xstrdup(s); + bwa_escape(rg_line); + if ((p = strstr(rg_line, "\tID:")) == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__); + goto err_set_rg; + } + p += 4; 
+ for (q = p; *q && *q != '\t' && *q != '\n'; ++q); + if (q - p + 1 > 256) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__); + goto err_set_rg; + } + for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) + *r++ = *q; + return rg_line; + +err_set_rg: + free(rg_line); + return 0; } -void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln) -{ - extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); - extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct); - int strand, seq_len, i, n_gap, n_mm; - uint64_t pos3, pac_pos; - uint8_t *s[2]; - - memset(aln, 0, sizeof(bwa_aln_t)); - seq_len = strlen(seq); - if (seq_len<<1 > buf->max_buf) { - buf->max_buf = seq_len<<1; - kroundup32(buf->max_buf); - buf->buf = xrealloc(buf->buf, buf->max_buf); - } - s[0] = buf->buf; - s[1] = s[0] + seq_len; - for (i = 0; i < seq_len; ++i) - s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]]; - seq_reverse(seq_len, s[1], 1); - pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); - if (strand) aln->flag |= 16; - if (n_gaps) { // only for gapped alignment - int n_cigar; - bwa_cigar_t *cigar16; - cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? 
n_gaps : -n_gaps, &n_cigar, 1); - aln->n_cigar = n_cigar; - aln->cigar = xmalloc(n_cigar * 4); - for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) { - int op = cigar16[i]>>14; - int len = cigar16[i]&0x3fff; - if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR - aln->cigar[i] = len<<4 | op; - if (op == 0 || op == 2) pos3 += len; - } - free(cigar16); - } else { // ungapped - aln->n_cigar = 1; - aln->cigar = xmalloc(4); - aln->cigar[0] = seq_len<<4 | 0; - pos3 = pac_pos + seq_len; - } - aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id); - aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset; - if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence - aln->flag |= 4; // read unmapped - compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap); - aln->n_mm = n_mm; - aln->n_gap = n_gap; -} - -/************************ - * Single-end alignment * - ************************/ - -bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar) -{ - bwa_one_t *one; - int best, cnt, i, seq_len; - - seq_len = strlen(seq); - one = xcalloc(1, sizeof(bwa_one_t)); - one->sai = bwa_sai(idx, buf, seq); - if (one->sai.n == 0) return one; - // count number of hits; randomly select one alignment - best = one->sai.sai[0].score; - for (i = cnt = 0; i < one->sai.n; ++i) { - bwa_sai1_t *p = &one->sai.sai[i]; - if (p->score > best) break; - if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { - one->which = p; - one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); - } - cnt += p->l - p->k + 1; - } - one->c1 = cnt; - for (; i < one->sai.n; ++i) - cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1; - one->c2 = cnt - one->c1; - // estimate single-end mapping quality - one->mapQs = -1; - if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible? 
- else if (one->c1 > 1) one->mapQs = 0; - else { - int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape; - if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25; - else if (one->c2 == 0) one->mapQs = 37; - } - if (one->mapQs < 0) { - cnt = (one->c2 >= 255)? 255 : one->c2; - one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt]; - } - one->mapQ = one->mapQs; - // compute CIGAR on request - one->one.ref_id = -1; - if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one); - return one; -} - -void bwa_one_destroy(bwa_one_t *one) -{ - free(one->sai.sai); - free(one->one.cigar); - free(one); -} - -/************************ - * Paired-end alignment * - ************************/ - -void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2]) -{ -} - -void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2]) -{ -} diff --git a/bwa.h b/bwa.h index e8172da..81d40e0 100644 --- a/bwa.h +++ b/bwa.h @@ -2,103 +2,45 @@ #define BWA_H_ #include +#include "bntseq.h" +#include "bwt.h" -#define BWA_DEF_MAX_SCORE 2048 -#define BWA_MAX_QUERY_LEN 1024 - -// BWA index -struct bwa_idx_t; -typedef struct bwa_idx_t bwa_idx_t; - -// Buffer for BWA alignment -struct bwa_buf_t; -typedef struct bwa_buf_t bwa_buf_t; - -// BWA alignment options -typedef struct { - int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3 - int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions - int seed_len, max_seed_diff; // seed length and max differences allowed in the seed - float fnr; // parameter for automatic length-adjusted max differences -} bwa_opt_t; - -// default BWA alignment options -extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 } - -// an interval hit in the SA coordinate; basic unit in .sai files -typedef struct { - uint32_t n_mm:16, n_gapo:8, n_gape:8; - int score; - uint64_t k, l; // 
[k,l] is the SA interval; each interval has l-k+1 hits -} bwa_sai1_t; - -// all interval hits in the SA coordinate -typedef struct { - int n; // number of interval hits - bwa_sai1_t *sai; -} bwa_sai_t; - -// an alignment -typedef struct { - uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment - int32_t ref_id; // referece sequence index (the first seq is indexed by 0) - uint32_t offset; // coordinate on the reference; zero-based - uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag - uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations -} bwa_aln_t; +#define BWA_IDX_BWT 0x1 +#define BWA_IDX_BNS 0x2 +#define BWA_IDX_PAC 0x4 +#define BWA_IDX_ALL 0x7 typedef struct { - int mapQs, mapQ, c1, c2; - uint64_t sa; - bwa_sai1_t *which; - bwa_sai_t sai; - bwa_aln_t one; -} bwa_one_t; + bwt_t *bwt; // FM-index + bntseq_t *bns; // information on the reference sequences + uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base +} bwaidx_t; typedef struct { - double avg, std, ap_prior; - uint64_t low, high, high_bayesian; -} bwa_pestat_t; + int l_seq; + char *name, *comment, *seq, *qual, *sam; +} bseq1_t; + +extern int bwa_verbose; +extern char bwa_rg_id[256]; #ifdef __cplusplus extern "C" { #endif - // load a BWA index - bwa_idx_t *bwa_idx_load(const char *prefix); - void bwa_idx_destroy(bwa_idx_t *p); + bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - // allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE - bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score); - void bwa_buf_destroy(bwa_buf_t *p); + uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); + int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t 
*bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); - /** - * Find all the SA intervals - * - * @param idx BWA index; multiple threads can share the same index - * @param buf BWA alignment buffer; each thread should have its own buffer - * @param seq NULL terminated C string, consisting of A/C/G/T/N only - * - * @return SA intervals seq is matched to - */ - bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq); + char *bwa_idx_infer_prefix(const char *hint); + bwt_t *bwa_idx_load_bwt(const char *hint); - /** - * Construct an alignment in the base-pair coordinate - * - * @param idx BWA index - * @param buf BWA alignment buffer - * @param seq NULL terinated C string - * @param sa Suffix array value - * @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape - * - * @return An alignment - */ - void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln); + bwaidx_t *bwa_idx_load(const char *hint, int which); + void bwa_idx_destroy(bwaidx_t *idx); - bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar); - - void bwa_one_destroy(bwa_one_t *one); + void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); + char *bwa_set_rg(const char *s); #ifdef __cplusplus } diff --git a/bwamem.c b/bwamem.c new file mode 100644 index 0000000..5a8a4dc --- /dev/null +++ b/bwamem.c @@ -0,0 +1,791 @@ +#include +#include +#include +#include +#include +#ifdef HAVE_PTHREAD +#include +#endif + +#include "kstring.h" +#include "bwamem.h" +#include "bntseq.h" +#include "ksw.h" +#include "kvec.h" +#include "ksort.h" +#include "utils.h" + +/* Theory on probability and scoring *ungapped* alignment + * + * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution + * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate + * + * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. 
Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x + * + * If the matching score is x and mismatch penalty is -y, we can compute error rate e: + * e = .75 * exp[-log(4) * y/x] + * + * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)} + * = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l) + * + * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale: + * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x) + * + * + * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1) + * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4) + * + * When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR) + */ + +mem_opt_t *mem_opt_init() +{ + mem_opt_t *o; + o = xcalloc(1, sizeof(mem_opt_t)); + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->flag = 0; + o->min_seed_len = 19; + o->split_width = 10; + o->max_occ = 10000; + o->max_chain_gap = 10000; + o->max_ins = 10000; + o->mask_level = 0.50; + o->chain_drop_ratio = 0.50; + o->split_factor = 1.5; + o->chunk_size = 10000000; + o->n_threads = 1; + o->pen_unpaired = 9; + o->max_matesw = 100; + mem_fill_scmat(o->a, o->b, o->mat); + return o; +} + +void mem_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 
a : -b;
+		mat[k++] = 0; // ambiguous base
+	}
+	for (j = 0; j < 5; ++j) mat[k++] = 0;
+}
+
+/***************************
+ * SMEM iterator interface *
+ ***************************/
+
+/* State of an SMEM iterator walking along one query; see smem_next(). */
+struct __smem_i {
+	const bwt_t *bwt; // index passed to bwt_smem1()/bwt_sa(); not freed by smem_itr_destroy()
+	const uint8_t *query; // query set by smem_set_query(); values > 3 are skipped as ambiguous bases
+	int start, len; // start: position where the next smem_next() call begins; len: query length
+	bwtintv_v *matches; // matches; to be returned by smem_next()
+	bwtintv_v *sub; // sub-matches inside the longest match; temporary
+	bwtintv_v *tmpvec[2]; // temporary arrays
+};
+
+/*
+ * Allocate an iterator bound to $bwt together with its four interval vectors.
+ * All allocations go through xcalloc(). Release with smem_itr_destroy().
+ */
+smem_i *smem_itr_init(const bwt_t *bwt)
+{
+	smem_i *itr;
+	itr = xcalloc(1, sizeof(smem_i));
+	itr->bwt = bwt;
+	itr->tmpvec[0] = xcalloc(1, sizeof(bwtintv_v));
+	itr->tmpvec[1] = xcalloc(1, sizeof(bwtintv_v));
+	itr->matches = xcalloc(1, sizeof(bwtintv_v));
+	itr->sub = xcalloc(1, sizeof(bwtintv_v));
+	return itr;
+}
+
+/*
+ * Free the four interval vectors (element arrays and headers) and the
+ * iterator itself. $itr is dereferenced unconditionally, so it must not be
+ * NULL. The BWT and the query are left untouched.
+ */
+void smem_itr_destroy(smem_i *itr)
+{
+	free(itr->tmpvec[0]->a); free(itr->tmpvec[0]);
+	free(itr->tmpvec[1]->a); free(itr->tmpvec[1]);
+	free(itr->matches->a); free(itr->matches);
+	free(itr->sub->a); free(itr->sub);
+	free(itr);
+}
+
+/*
+ * Attach a query of $len bytes and rewind the iterator to position 0.
+ * Only the pointer is stored; the buffer is not copied.
+ */
+void smem_set_query(smem_i *itr, int len, const uint8_t *query)
+{
+	itr->query = query;
+	itr->start = 0;
+	itr->len = len;
+}
+
+/*
+ * Return the next batch of matches, or 0 once the query is exhausted.
+ *
+ * Ambiguous bases (value > 3) at the current position are skipped first;
+ * bwt_smem1() is then run from there and its return value becomes the
+ * start position of the following call. Each interval packs its query span
+ * into $info: the high 32 bits hold the begin, the low 32 bits the end.
+ *
+ * When $split_len > 0 and the longest match is at least $split_len long
+ * with x[2] <= $split_width, bwt_smem1() is rerun from the middle of that
+ * match and the sub-matches that are at least half as long as the longest
+ * match and end after the original start are merged into the result.
+ *
+ * The returned vector belongs to the iterator and is reset by the next call.
+ */
+const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
+{
+	int i, max, max_i, ori_start;
+	itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
+	if (itr->start >= itr->len || itr->start < 0) return 0;
+	while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
+	if (itr->start == itr->len) return 0;
+	ori_start = itr->start;
+	itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
+	if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
+	for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
+		bwtintv_t *p = &itr->matches->a[i];
+		int len = (uint32_t)p->info - (p->info>>32);
+		if (max < len) max = len, max_i = i;
+	}
+	if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long
+		int j;
+		bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging
+		bwtintv_t *p = &itr->matches->a[max_i];
+		bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM
+		i = j = 0; a->n = 0;
+		while (i < itr->matches->n && j < itr->sub->n) { // ordered merge
+			// merge key: begin in the high 32 bits, (len - end) in the low 32 bits
+			int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info);
+			int64_t xj = itr->sub->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->sub->a[j].info);
+			if (xi < xj) {
+				kv_push(bwtintv_t, *a, itr->matches->a[i]);
+				++i;
+			} else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) {
+				kv_push(bwtintv_t, *a, itr->sub->a[j]);
+				++j;
+			} else ++j; // sub-match too short or ending at/before the original start: drop it
+		}
+		for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]);
+		for (; j < itr->sub->n; ++j)
+			if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start)
+				kv_push(bwtintv_t, *a, itr->sub->a[j]);
+		kv_copy(bwtintv_t, *itr->matches, *a);
+	}
+	return itr->matches;
+}
+
+/********************************
+ * Chaining while finding SMEMs *
+ ********************************/
+
+/* One seed: an exact match of $len bases between query and reference. */
+typedef struct {
+	int64_t rbeg; // start on the forward-reverse reference (value returned by bwt_sa())
+	int32_t qbeg, len; // start on the query and seed length
+} mem_seed_t;
+
+/* A chain: a growable array of seeds, keyed by $pos in the B-tree. */
+typedef struct {
+	int n, m; // number of seeds stored / allocated
+	int64_t pos; // B-tree key (see chain_cmp); set to rbeg of the seed that opened the chain
+	mem_seed_t *seeds;
+} mem_chain_t;
+
+typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
+
+#include "kbtree.h"
+
+// three-way comparison of two chains by $pos: yields -1, 0 or 1
+#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
+KBTREE_INIT(chn, mem_chain_t, chain_cmp)
+
+/*
+ * Try to absorb seed $p into chain $c.
+ *
+ * Returns 1 when $p lies inside the span already covered by the chain
+ * (nothing is stored) or when it was appended to the chain; returns 0 when
+ * the caller should open a new chain for $p.
+ */
+static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p)
+{
+	int64_t qend, rend, x, y;
+	const mem_seed_t *last = &c->seeds[c->n-1];
+	qend = last->qbeg + last->len;
+	rend = last->rbeg + last->len;
+	if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
+		return 1; // contained seed; do nothing
+	x = p->qbeg - last->qbeg; // always non-negative
+	y = p->rbeg - last->rbeg;
+	// append only if $p is downstream on the reference, within the band width, and not too far from the last seed
+	if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain
+		if (c->n == c->m) {
+			c->m <<= 1;
+			c->seeds = xrealloc(c->seeds, c->m * sizeof(mem_seed_t));
+		}
+		c->seeds[c->n++] = *p;
+		return 1;
+	}
+	return 0; // request to add a new chain
+}
+
+/*
+ * Drain the iterator and file every usable seed occurrence into $tree.
+ *
+ * Seeds shorter than opt->min_seed_len or with more than opt->max_occ
+ * occurrences are ignored. Each remaining occurrence is offered to the
+ * chain returned as $lower by kb_intervalp(); if there is none or
+ * test_and_merge() declines it, the occurrence opens a new one-seed chain.
+ */
+static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
+{
+	const bwtintv_v *a;
+	int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
+	split_len = split_len < itr->len? split_len : itr->len;
+	while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM
+		int i;
+		for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
+			bwtintv_t *p = &a->a[i];
+			int slen = (uint32_t)p->info - (p->info>>32); // seed length
+			int64_t k;
+			if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
+			for (k = 0; k < p->x[2]; ++k) {
+				mem_chain_t tmp, *lower, *upper;
+				mem_seed_t s;
+				int to_add = 0;
+				s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
+				s.qbeg = p->info>>32;
+				s.len = slen;
+				if (kb_size(tree)) {
+					kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
+					if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1;
+				} else to_add = 1;
+				if (to_add) { // add the seed as a new chain
+					tmp.n = 1; tmp.m = 4;
+					tmp.seeds = xcalloc(tmp.m, sizeof(mem_seed_t));
+					tmp.seeds[0] = s;
+					kb_putp(chn, tree, &tmp);
+				}
+			}
+		}
+	}
+}
+
+/* Debug dump: print one line per chain, listing its seeds, via err_printf(). */
+void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
+{
+	int i, j;
+	for (i = 0; i < chn->n; ++i) {
+		mem_chain_t *p = &chn->a[i];
+		
err_printf("%d", p->n);
+		for (j = 0; j < p->n; ++j) {
+			bwtint_t pos;
+			int is_rev, ref_id;
+			pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev);
+			if (is_rev) pos -= p->seeds[j].len - 1;
+			bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id);
+			err_printf("\t%d,%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
+		}
+		err_putchar('\n');
+	}
+}
+
+/*
+ * Seed a query and group the seeds into chains.
+ *
+ * Returns an empty vector when $len < opt->min_seed_len. Otherwise the
+ * chains are collected in a temporary B-tree and copied out of it before
+ * the tree is destroyed; the caller owns chain.a and each chain's seeds array.
+ */
+mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
+{
+	mem_chain_v chain;
+	smem_i *itr;
+	kbtree_t(chn) *tree;
+
+	kv_init(chain);
+	if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
+	tree = kb_init(chn, KB_DEFAULT_SIZE);
+	itr = smem_itr_init(bwt);
+	smem_set_query(itr, len, seq);
+	mem_insert_seed(opt, tree, itr);
+
+	kv_resize(mem_chain_t, chain, kb_size(tree));
+
+	#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
+	__kb_traverse(mem_chain_t, tree, traverse_func);
+	#undef traverse_func
+
+	smem_itr_destroy(itr);
+	kb_destroy(chn, tree);
+	return chain;
+}
+
+/********************
+ * Filtering chains *
+ ********************/
+
+/* Per-chain summary used while filtering. */
+typedef struct {
+	int beg, end, w; // query span [beg,end) covered by the chain and the chain weight
+	void *p, *p2; // p: the chain; p2: first lower-weight chain overlapping it significantly (kept too), or 0
+} flt_aux_t;
+
+#define flt_lt(a, b) ((a).w > (b).w) // sort by decreasing weight
+KSORT_INIT(mem_flt, flt_aux_t, flt_lt)
+
+/*
+ * Drop chains that are shadowed by a clearly better overlapping chain.
+ *
+ * A chain's weight is the number of bases covered by its seeds, taken as
+ * the smaller of the query-side and the reference-side coverage. Chains are
+ * sorted by decreasing weight and $chains is reordered accordingly. A chain
+ * is dropped when it significantly overlaps (>= mask_level of the shorter
+ * query span) a better chain and its weight is below chain_drop_ratio times
+ * that chain's weight by at least 2*min_seed_len. The first chain shadowed
+ * by each kept chain is retained as well.
+ *
+ * Returns the number of chains kept. They are compacted to the front of
+ * $chains; the seeds arrays of discarded chains are freed.
+ */
+int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
+{
+	flt_aux_t *a;
+	int i, j, n;
+	if (n_chn <= 1) return n_chn; // no need to filter
+	a = xmalloc(sizeof(flt_aux_t) * n_chn);
+	for (i = 0; i < n_chn; ++i) {
+		mem_chain_t *c = &chains[i];
+		int64_t end;
+		int w = 0, tmp;
+		for (j = 0, end = 0; j < c->n; ++j) { // bases covered on the query
+			const mem_seed_t *s = &c->seeds[j];
+			if (s->qbeg >= end) w += s->len;
+			else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end;
+			end = end > s->qbeg + s->len? end : s->qbeg + s->len;
+		}
+		tmp = w; w = 0; // restart the count, otherwise the minimum below always equals the query coverage
+		for (j = 0, end = 0; j < c->n; ++j) { // bases covered on the reference
+			const mem_seed_t *s = &c->seeds[j];
+			if (s->rbeg >= end) w += s->len;
+			else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
+			end = end > s->rbeg + s->len? end : s->rbeg + s->len; // track the reference end, not the query end
+		}
+		w = w < tmp? w : tmp;
+		a[i].beg = c->seeds[0].qbeg;
+		a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len;
+		a[i].w = w; a[i].p = c; a[i].p2 = 0;
+	}
+	ks_introsort(mem_flt, n_chn, a);
+	{ // reorder chains such that the best chain appears first
+		mem_chain_t *swap;
+		swap = xmalloc(sizeof(mem_chain_t) * n_chn);
+		for (i = 0; i < n_chn; ++i) {
+			swap[i] = *((mem_chain_t*)a[i].p);
+			a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
+		}
+		memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
+		free(swap);
+	}
+	for (i = 1, n = 1; i < n_chn; ++i) { // a[0..n) holds the chains kept so far
+		for (j = 0; j < n; ++j) {
+			int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg;
+			int e_min = a[j].end < a[i].end? a[j].end : a[i].end;
+			if (e_min > b_max) { // have overlap
+				int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg;
+				if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
+					if (a[j].p2 == 0) a[j].p2 = a[i].p;
+					if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
+						break;
+				}
+			}
+		}
+		if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
+	}
+	for (i = 0; i < n; ++i) { // mark chains to be kept
+		// a kept chain is tagged by negating its (positive) seed count
+		mem_chain_t *c = (mem_chain_t*)a[i].p;
+		if (c->n > 0) c->n = -c->n;
+		c = (mem_chain_t*)a[i].p2;
+		if (c && c->n > 0) c->n = -c->n;
+	}
+	free(a);
+	for (i = 0; i < n_chn; ++i) { // free discarded chains
+		mem_chain_t *c = &chains[i];
+		if (c->n >= 0) {
+			free(c->seeds);
+			c->n = c->m = 0;
+		} else c->n = -c->n; // restore the seed count of a kept chain
+	}
+	for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains
+		if (chains[i].n > 0) {
+			if (n != i) chains[n++] = chains[i];
+			else ++n;
+		}
+	}
+	return n;
+}
+
+/******************************
+ * De-overlap single-end hits *
+ ******************************/
+
+// order: higher score first, then smaller rb, then smaller qb
+#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
+KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
+
+/*
+ * Sort $n hits with alnreg_slt and remove every hit whose score, rb and qb
+ * equal those of the hit right before it. Returns the number of hits left
+ * at the front of $a. Hits with qe <= qb are removed as well, because an
+ * empty query span is the marker used for duplicates.
+ */
+int mem_sort_and_dedup(int n, mem_alnreg_t *a)
+{
+	int m, i;
+	if (n <= 1) return n;
+	ks_introsort(mem_ars, n, a);
+	for (i = 1; i < n; ++i) { // mark identical hits
+		if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb)
+			a[i].qe = a[i].qb;
+	}
+	for (i = 1, m = 1; i < n; ++i) // exclude identical hits
+		if (a[i].qe > a[i].qb) {
+			if (m != i) a[m++] = a[i];
+			else ++m;
+		}
+	return m;
+}
+
+/*
+ * Split hits into primary and secondary ones. A hit that significantly
+ * overlaps an earlier primary hit on the query becomes secondary to it;
+ * the primary records the first such score in $sub and counts close
+ * competitors in $sub_n.
+ */
+void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
+{ // similar to the loop in mem_chain_flt()
+	int i, k, tmp;
+	kvec_t(int) z; // indices of the primary hits found so far
+	if (n == 0) return;
+	kv_init(z);
+	for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1;
+	tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
+	kv_push(int, z, 0);
+	for (i = 1; i < n; ++i) {
+		for (k = 0; k < z.n; ++k) {
+			int j = z.a[k];
+			int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
+			int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
+			if (e_min > b_max) { // have overlap
+				int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb?
a[i].qe - a[i].qb : a[j].qe - a[j].qb;
+				if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
+					if (a[j].sub == 0) a[j].sub = a[i].score;
+					if (a[j].score - a[i].score <= tmp) ++a[j].sub_n;
+					break;
+				}
+			}
+		}
+		if (k == z.n) kv_push(int, z, i);
+		else a[i].secondary = z.a[k];
+	}
+	free(z.a);
+}
+
+/****************************************
+ * Construct the alignment from a chain *
+ ****************************************/
+
+/*
+ * Gap length allowed next to $qlen query bases: (qlen*a - q)/r + 1,
+ * clamped to the range [1, 2*opt->w].
+ */
+static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
+{
+	int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
+	l = l > 1? l : 1;
+	return l < opt->w<<1? l : opt->w<<1;
+}
+
+/*
+ * Extend the seeds of chain $c into alignment regions appended to $av.
+ *
+ * The reference window that any seed extension could reach is fetched once
+ * with bns_get_seq(). Seeds are visited from longest to shortest; a seed
+ * already contained in, and close to the diagonal of, a region in $av is
+ * skipped. Every other seed is extended to the left and to the right with
+ * ksw_extend() and yields one new region, including its seed coverage.
+ *
+ * Nothing is appended when the chain is empty or when the reference window
+ * cannot be retrieved in full.
+ */
+void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
+{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds
+	int i, k;
+	int64_t rlen, rmax[2], tmp, max = 0;
+	const mem_seed_t *s;
+	uint8_t *rseq = 0;
+	uint64_t *srt;
+
+	if (c->n == 0) return;
+	// get the max possible span
+	rmax[0] = l_pac<<1; rmax[1] = 0;
+	for (i = 0; i < c->n; ++i) {
+		int64_t b, e;
+		const mem_seed_t *t = &c->seeds[i];
+		b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg));
+		e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len));
+		rmax[0] = rmax[0] < b? rmax[0] : b;
+		rmax[1] = rmax[1] > e? rmax[1] : e;
+		if (t->len > max) max = t->len;
+	}
+	rmax[0] = rmax[0] > 0? rmax[0] : 0;
+	rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
+	if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
+		if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac;
+		else rmax[0] = l_pac;
+	}
+	// retrieve the reference sequence
+	rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
+	if (rlen != rmax[1] - rmax[0]) { free(rseq); return; } // do not leak the window on a short read-back
+
+	// sort key: seed length in the high 32 bits, seed index in the low 32 bits
+	srt = xmalloc(c->n * 8);
+	for (i = 0; i < c->n; ++i)
+		srt[i] = (uint64_t)c->seeds[i].len<<32 | i;
+	ks_introsort_64(c->n, srt);
+
+	for (k = c->n - 1; k >= 0; --k) { // longest seed first
+		mem_alnreg_t *a;
+		s = &c->seeds[(uint32_t)srt[k]];
+
+		for (i = 0; i < av->n; ++i) { // test whether extension has been made before
+			mem_alnreg_t *p = &av->a[i];
+			int64_t rd;
+			int qd, w, max_gap;
+			if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained
+			// qd: distance ahead of the seed on query; rd: on reference
+			qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
+			max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
+			w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width
+			if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
+			// similar to the previous four lines, but this time we look at the region behind
+			qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
+			max_gap = cal_max_gap(opt, qd < rd? qd : rd);
+			w = max_gap < opt->w? max_gap : opt->w;
+			if (qd - rd < w && rd - qd < w) break;
+		}
+		if (i < av->n) continue; // an earlier region already accounts for this seed
+
+		a = kv_pushp(mem_alnreg_t, *av);
+		memset(a, 0, sizeof(mem_alnreg_t));
+
+		if (s->qbeg) { // left extension
+			uint8_t *rs, *qs;
+			int qle, tle;
+			// both sequences are reversed so that the extension runs away from the seed
+			qs = xmalloc(s->qbeg);
+			for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
+			tmp = s->rbeg - rmax[0];
+			rs = xmalloc(tmp);
+			for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
+			a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle);
+			a->qb = s->qbeg - qle; a->rb = s->rbeg - tle;
+			free(qs); free(rs);
+		} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
+
+		if (s->qbeg + s->len != l_query) { // right extension
+			int qle, tle, qe, re;
+			qe = s->qbeg + s->len;
+			re = s->rbeg + s->len - rmax[0];
+			a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle);
+			a->qe = qe + qle; a->re = rmax[0] + re + tle;
+		} else a->qe = l_query, a->re = s->rbeg + s->len;
+		if (bwa_verbose >= 4) err_printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
+
+		// compute seedcov
+		for (i = 0, a->seedcov = 0; i < c->n; ++i) {
+			const mem_seed_t *t = &c->seeds[i];
+			if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
+				a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
+		}
+	}
+	free(srt); free(rseq);
+}
+
+/*****************************
+ * Basic hit->SAM conversion *
+ *****************************/
+
+/*
+ * Append one SAM line for read $s to $str.
+ *
+ * @param p_       the hit to print; NULL produces an unmapped record
+ * @param is_hard  non-zero to clip with 'H' instead of 'S'
+ * @param m        the mate's hit, or NULL for single-end output; an
+ *                 unmapped read with a mapped mate borrows the mate's
+ *                 coordinates and gets no CIGAR
+ *
+ * $mat, $q, $r and $w are only forwarded to bwa_gen_cigar().
+ */
+void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
+{
+#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
+	int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1;
+	uint32_t *cigar = 0;
+	int64_t pos;
+	bwahit_t ptmp, *p = &ptmp; // work on a private copy so that *p_ is never modified
+
+	if (!p_) { // in this case, generate an unmapped alignment
+		memset(&ptmp, 0, sizeof(bwahit_t));
+		ptmp.rb = ptmp.re = -1;
+	} else ptmp = *p_;
+	p->flag |= m? 1 : 0; // is paired in sequencing
+	p->flag |= !is_mapped(p)? 4 : 0; // 0x4: this read is unmapped
+	p->flag |= m && !is_mapped(m)? 8 : 0; // 0x8: the mate is unmapped
+	if (m && !is_mapped(p) && is_mapped(m)) {
+		p->rb = m->rb; p->re = m->re; p->qb = 0; p->qe = s->l_seq;
+		copy_mate = 1;
+	}
+	p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand
+	p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand
+	kputs(s->name, str); kputc('\t', str);
+	if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate
+		int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
+		if (p->flag&0x10000) sam_flag |= 0x100;
+		if (!copy_mate) {
+			cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
+			p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened)
+		} else n_cigar = 0, cigar = 0;
+		pos = bns_depos(bns, p->rb < bns->l_pac?
p->rb : p->re - 1, &is_rev);
+		bns_cnt_ambi(bns, pos, p->re - p->rb, &rid);
+		// FLAG, RNAME, POS (1-based within the reference sequence) and MAPQ
+		kputw(sam_flag, str); kputc('\t', str);
+		kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str);
+		kputw(p->qual, str); kputc('\t', str);
+		if (n_cigar) {
+			int i, clip5, clip3;
+			// the unaligned query ends swap sides when the hit is on the reverse strand
+			clip5 = is_rev? s->l_seq - p->qe : p->qb;
+			clip3 = is_rev? p->qb : s->l_seq - p->qe;
+			if (clip5) { kputw(clip5, str); kputc("SH"[(is_hard!=0)], str); }
+			for (i = 0; i < n_cigar; ++i) {
+				kputw(cigar[i]>>4, str); kputc("MIDSH"[cigar[i]&0xf], str);
+			}
+			if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); }
+		} else kputc('*', str);
+	} else { // no coordinate
+		kputw(p->flag, str);
+		kputs("\t*\t0\t0\t*", str);
+		rid = -1;
+	}
+	if (m && is_mapped(m)) { // then print mate pos and isize
+		pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev);
+		bns_cnt_ambi(bns, pos, m->re - m->rb, &mid);
+		kputc('\t', str);
+		if (mid == rid) kputc('=', str);
+		else kputs(bns->anns[mid].name, str);
+		kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str);
+		kputc('\t', str);
+		if (mid == rid) {
+			// fold both rb values onto the forward strand, then print their signed distance
+			int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb;
+			int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb;
+			kputw(p0 - p1 + (p0 > p1? 1 : -1), str);
+		} else kputw(0, str);
+		kputc('\t', str);
+	} else kputsn("\t*\t0\t0\t", 7, str);
+	if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL
+		kputsn("*\t*", 3, str);
+	} else if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand
+		int i, qb = 0, qe = s->l_seq;
+		if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; // hard clipping prints only the aligned part
+		ks_resize(str, str->l + (qe - qb) + 1);
+		for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
+		kputc('\t', str);
+		if (s->qual) { // printf qual
+			ks_resize(str, str->l + (qe - qb) + 1);
+			for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
+			str->s[str->l] = 0;
+		} else kputc('*', str);
+	} else { // the reverse strand
+		int i, qb = 0, qe = s->l_seq;
+		if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe;
+		ks_resize(str, str->l + (qe - qb) + 1);
+		// walk backwards and complement: the read is printed reverse-complemented
+		for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]];
+		kputc('\t', str);
+		if (s->qual) { // printf qual
+			ks_resize(str, str->l + (qe - qb) + 1);
+			for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i];
+			str->s[str->l] = 0;
+		} else kputc('*', str);
+	}
+	// optional tags; NM stays -1 unless bwa_gen_cigar() was called above
+	if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); }
+	if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
+	if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
+	if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
+	if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
+	kputc('\n', str);
+	free(cigar);
+#undef is_mapped
+}
+
+/************************
+ * Integrated interface *
+ ************************/
+
+/*
+ * Estimate a single-end mapping quality in [0, 60] for region $a.
+ *
+ * The competing score is the larger of a->csub and a->sub, where a->sub
+ * equal to 0 is replaced by min_seed_len*a; 0 is returned outright when the
+ * competing score reaches a->score. Otherwise the estimate grows with
+ * log(a->seedcov), is scaled by identity^2 when the identity is below 0.95,
+ * and is lowered further when a->sub_n > 0.
+ */
+int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
+{
+	int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a;
+	double identity;
+	sub = a->csub > sub? a->csub : sub;
+	if (sub >= a->score) return 0;
+	l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; // the longer of the query and reference spans
+	mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0;
+	identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l;
+	mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq;
+	if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
+	if (mapq > 60) mapq = 60;
+	if (mapq < 0) mapq = 0;
+	return mapq;
+}
+
+/*
+ * Fill hit $h from region $a. For a secondary region (a->secondary >= 0)
+ * $sub is -1 and $flag is 0x100; otherwise $sub is the larger of a->sub and
+ * a->csub and $flag is 0. $qual is always reset to 0.
+ */
+void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h)
+{
+	h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe;
+	h->score = a->score;
+	h->sub = a->secondary >= 0? -1 : a->sub > a->csub? a->sub : a->csub;
+	h->qual = 0; // quality unset
+	h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set
+}
+
+/*
+ * Write the SAM record(s) of read $s, built from regions $a, into s->sam.
+ *
+ * Secondary regions are printed only when MEM_F_ALL is set and their score
+ * is at least half of their parent's score. $extra_flag is OR-ed into every
+ * record's flag and $m (mate hit or NULL) is forwarded to bwa_hit2sam().
+ * A read without regions yields a single unmapped record.
+ */
+void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m)
+{
+	int k;
+	kstring_t str;
+	str.l = str.m = 0; str.s = 0;
+	if (a->n > 0) {
+		int mapq0 = -1; // mapQ of the first record; later records are capped at it
+		for (k = 0; k < a->n; ++k) {
+			bwahit_t h;
+			mem_alnreg_t *p = &a->a[k];
+			if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
+			if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue;
+			mem_alnreg2hit(p, &h);
+			bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re);
+			h.flag |= extra_flag;
+			if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard)
+			h.qual = p->secondary >= 0?
0 : mem_approx_mapq_se(opt, p);
+			if (k == 0) mapq0 = h.qual;
+			else if (h.qual > mapq0) h.qual = mapq0;
+			bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
+		}
+	} else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m);
+	s->sam = str.s;
+}
+
+/*
+ * Compute the alignment regions of one read.
+ *
+ * $seq is converted in place: every byte that is not already < 4 is mapped
+ * through nst_nt4_table. The read is then seeded and chained, the chains
+ * are filtered, and every surviving chain is extended by mem_chain2aln().
+ * The regions are returned sorted and de-duplicated; the caller owns regs.a.
+ */
+mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
+{
+	int i;
+	mem_chain_v chn;
+	mem_alnreg_v regs;
+
+	for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
+		seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
+
+	chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq);
+	chn.n = mem_chain_flt(opt, chn.n, chn.a);
+	if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
+
+	kv_init(regs);
+	for (i = 0; i < chn.n; ++i) {
+		mem_chain_t *p = &chn.a[i];
+		mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
+		free(chn.a[i].seeds);
+	}
+	free(chn.a);
+	regs.n = mem_sort_and_dedup(regs.n, regs.a);
+	return regs;
+}
+
+/* Work description handed to one worker: it handles items start, start+step, ... below n. */
+typedef struct {
+	int start, step, n;
+	const mem_opt_t *opt;
+	const bwt_t *bwt;
+	const bntseq_t *bns;
+	const uint8_t *pac;
+	const mem_pestat_t *pes; // insert size statistics; filled by mem_pestat() between the two phases
+	bseq1_t *seqs;
+	mem_alnreg_v *regs; // one slot per read, shared by all workers
+} worker_t;
+
+/*
+ * Phase 1: align reads and store the regions in w->regs. In single-end
+ * mode the items are reads; in paired-end mode they are pairs.
+ */
+static void *worker1(void *data)
+{
+	worker_t *w = (worker_t*)data;
+	int i;
+	if (!(w->opt->flag&MEM_F_PE)) {
+		for (i = w->start; i < w->n; i += w->step)
+			w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
+	} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
+		for (i = w->start; i < w->n>>1; i += w->step) {
+			w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
+			w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Phase 2: turn the regions into SAM text (mem_sam_se() or mem_sam_pe())
+ * and free each region array once it has been consumed.
+ */
+static void *worker2(void *data)
+{
+	extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
+	worker_t *w = (worker_t*)data;
+	int i;
+	if (!(w->opt->flag&MEM_F_PE)) {
+		for (i = w->start; i < w->n; i += w->step) {
+			mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a);
+			mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
+			free(w->regs[i].a);
+		}
+	} else {
+		int n = 0;
+		for (i = w->start; i < w->n>>1; i += w->step) { // not implemented yet
+			n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]);
+			free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
+		}
+		fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n);
+	}
+	return 0;
+}
+
+/*
+ * Align $n reads and print their SAM records to stdout.
+ *
+ * The work is split into opt->n_threads interleaved shares. All shares
+ * complete phase 1 before mem_pestat() runs (paired-end mode only), and
+ * phase 2 starts only after that. After printing, the name, comment, seq,
+ * qual and sam members of every read are freed.
+ */
+void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
+{
+	int i;
+	worker_t *w;
+	mem_alnreg_v *regs;
+	mem_pestat_t pes[4];
+
+	w = xcalloc(opt->n_threads, sizeof(worker_t));
+	regs = xmalloc(n * sizeof(mem_alnreg_v));
+	for (i = 0; i < opt->n_threads; ++i) {
+		worker_t *p = &w[i];
+		p->start = i; p->step = opt->n_threads; p->n = n;
+		p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac;
+		p->seqs = seqs; p->regs = regs;
+		p->pes = &pes[0];
+	}
+#ifdef HAVE_PTHREAD
+	if (opt->n_threads == 1) {
+		worker1(w);
+		if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
+		worker2(w);
+	} else {
+		pthread_t *tid;
+		tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t));
+		for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]);
+		for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
+		if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
+		for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]);
+		for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
+		free(tid);
+	}
+#else
+	// without threads, run every share in turn; running only w[0] would skip all reads outside its stride
+	for (i = 0; i < opt->n_threads; ++i) worker1(&w[i]);
+	if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
+	for (i = 0; i < opt->n_threads; ++i) worker2(&w[i]);
+#endif
+	for (i = 0; i < n; ++i) {
+		err_fputs(seqs[i].sam, stdout);
+		free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam);
+	}
+	free(regs); free(w);
+}
diff --git a/bwamem.h b/bwamem.h
new file mode 100644
index 0000000..8a7c7b8
--- /dev/null
+++ b/bwamem.h
@@ -0,0 +1,133 @@
+#ifndef BWAMEM_H_
+#define BWAMEM_H_
+
+#include "bwt.h"
+#include "bntseq.h"
+#include "bwa.h"
+
+#define MEM_MAPQ_COEF 30.0
+#define MEM_MAPQ_MAX 60
+
+struct __smem_i;
+typedef struct __smem_i smem_i;
+
+#define MEM_F_HARDCLIP 0x1
+#define MEM_F_PE 0x2
+#define MEM_F_NOPAIRING 0x4
+#define MEM_F_ALL 0x8
+#define MEM_F_NO_MULTI 0x10
+
+typedef struct {
+	int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
+	int w; // band width
+	int flag; // see MEM_F_* macros
+	int min_seed_len; // minimum seed length
+	float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
+	int split_width; // split into a seed if its occurrence is smaller than this value
+	int max_occ; // skip a seed if its occurrence is larger than this value
+	int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
+	int n_threads; // number of threads
+	int chunk_size; // process chunk_size-bp sequences in a batch
+	float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
+	float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
+	int pen_unpaired; // phred-scaled penalty for unpaired reads
+	int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
+	int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
+	int8_t mat[25]; //
scoring matrix; mat[0] == 0 if unset
+} mem_opt_t;
+
+typedef struct {
+	int64_t rb, re; // [rb,re): reference sequence in the alignment
+	int qb, qe; // [qb,qe): query sequence in the alignment
+	int score; // best SW score
+	int sub; // 2nd best SW score
+	int csub; // SW score of a tandem hit
+	int sub_n; // approximate number of suboptimal hits
+	int seedcov; // length of regions covered by seeds
+	int secondary; // index of the parent hit shadowing the current hit; <0 if primary
+} mem_alnreg_t;
+
+typedef struct {
+	int low, high, failed;
+	double avg, std;
+} mem_pestat_t;
+
+typedef struct {
+	int64_t rb, re;
+	int qb, qe, flag, qual;
+	// optional info
+	int score, sub;
+} bwahit_t;
+
+typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	smem_i *smem_itr_init(const bwt_t *bwt);
+	void smem_itr_destroy(smem_i *itr);
+	void smem_set_query(smem_i *itr, int len, const uint8_t *query);
+	const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width);
+
+	mem_opt_t *mem_opt_init(void);
+	void mem_fill_scmat(int a, int b, int8_t mat[25]);
+
+	/**
+	 * Align a batch of sequences and generate the alignments in the SAM format
+	 *
+	 * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam.
+	 * Note that $seqs[i].sam may consist of several SAM lines if the
+	 * corresponding sequence has multiple primary hits.
+	 *
+	 * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
+	 * sequences must be interleaved: $n must be an even number and the 2i-th
+	 * sequence and the (2i+1)-th sequence constitute a read pair. In this
+	 * mode, there should be enough (typically >50) unique pairs for the
+	 * routine to infer the orientation and insert size.
+	 *
+	 * @param opt    alignment parameters
+	 * @param bwt    FM-index of the reference sequence
+	 * @param bns    Information of the reference
+	 * @param pac    2-bit encoded reference
+	 * @param n      number of query sequences
+	 * @param seqs   query sequences; $seqs[i].seq/sam to be modified after the call
+	 */
+	void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
+
+	/**
+	 * Find the aligned regions for one query sequence
+	 *
+	 * Note that this routine does not generate CIGAR. CIGAR should be
+	 * generated later by bwa_gen_cigar() defined in bwa.c.
+	 *
+	 * @param opt    alignment parameters
+	 * @param bwt    FM-index of the reference sequence
+	 * @param bns    Information of the reference
+	 * @param pac    2-bit encoded reference
+	 * @param l_seq  length of query sequence
+	 * @param seq    query sequence; conversion ACGTN/acgtn=>01234 to be applied
+	 *
+	 * @return       list of aligned regions.
+	 */
+	mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
+
+	/**
+	 * Infer the insert size distribution from interleaved alignment regions
+	 *
+	 * This function can be called after mem_align1(), as long as paired-end
+	 * reads are properly interleaved.
+	 *
+	 * @param opt    alignment parameters
+	 * @param l_pac  length of concatenated reference sequence
+	 * @param n      number of query sequences; must be an even number
+	 * @param regs   region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
+	 * @param pes    inferred insert size distribution (output)
+	 */
+	void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/bwamem_pair.c b/bwamem_pair.c
new file mode 100644
index 0000000..2fc26b6
--- /dev/null
+++ b/bwamem_pair.c
@@ -0,0 +1,314 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include "kstring.h"
+#include "bwamem.h"
+#include "kvec.h"
+#include "utils.h"
+#include "ksw.h"
+
+#define MIN_RATIO 0.8 // a pair is sampled only if each end's competing score is at most MIN_RATIO times its best score
+#define MIN_DIR_CNT 10 // minimum number of sampled pairs needed to analyse an orientation
+#define MIN_DIR_RATIO 0.05 // an orientation with fewer pairs than this fraction of the most frequent one is dropped
+#define OUTLIER_BOUND 2.0 // inter-quartile multiple bounding the values used for mean and std.dev
+#define MAPPING_BOUND 3.0 // inter-quartile multiple bounding the insert size of proper pairs
+#define MAX_STDDEV 4.0 // std.dev multiple also used to widen the proper-pair bounds
+
+/*
+ * Classify the relative placement of two hits given their forward-reverse
+ * coordinates $b1 and $b2. *dist receives the distance between $b1 and $b2
+ * after $b2 has been projected onto the strand of $b1. The return value is
+ * in 0..3 and indexes the per-orientation arrays (FF, FR, RF, RR).
+ */
+static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
+{
+	int64_t p2;
+	int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); // 1 when the hit lies on the reverse half
+	p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand
+	*dist = p2 > b1? p2 - b1 : b1 - p2;
+	return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3);
+}
+
+/*
+ * Score of the best competitor of the top region r->a[0]: the first later
+ * region that overlaps it on the query by at least mask_level of the
+ * shorter span, or min_seed_len*a when there is none.
+ * NOTE(review): presumably $r is sorted by decreasing score; confirm with the caller.
+ */
+static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
+{
+	int j;
+	for (j = 1; j < r->n; ++j) { // choose unique alignment
+		int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb;
+		int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe;
+		if (e_min > b_max) { // have overlap
+			int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb;
+			if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap
+		}
+	}
+	return j < r->n?
r->a[j].score : opt->min_seed_len * opt->a;
+}
+
+/*
+ * Infer the insert size distribution of each of the four orientations.
+ *
+ * A pair is sampled when both ends have regions and neither end has a
+ * competing hit scoring above MIN_RATIO of its best hit. For every
+ * orientation with at least MIN_DIR_CNT samples, the quartiles give the
+ * outlier bounds; mean and std.dev are computed inside these bounds, and
+ * the proper-pair range [low, high] is the wider of the inter-quartile
+ * range scaled by MAPPING_BOUND and avg -/+ MAX_STDDEV * std.
+ * Orientations with too few samples, in absolute terms or relative to the
+ * most frequent orientation, get pes[d].failed set.
+ *
+ * Progress is reported on stderr. See bwamem.h for the parameters.
+ */
+void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
+{
+	int i, d, max;
+	uint64_v isize[4]; // sampled insert sizes, one vector per orientation
+	memset(pes, 0, 4 * sizeof(mem_pestat_t));
+	memset(isize, 0, sizeof(isize));
+	for (i = 0; i < n>>1; ++i) {
+		int dir;
+		int64_t is;
+		mem_alnreg_v *r[2];
+		r[0] = (mem_alnreg_v*)&regs[i<<1|0];
+		r[1] = (mem_alnreg_v*)&regs[i<<1|1];
+		if (r[0]->n == 0 || r[1]->n == 0) continue;
+		if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
+		if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
+		dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
+		if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
+	}
+	// the counts are size_t; cast them to match the %ld conversions
+	if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, (long)isize[0].n, (long)isize[1].n, (long)isize[2].n, (long)isize[3].n);
+	for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
+		mem_pestat_t *r = &pes[d];
+		uint64_v *q = &isize[d];
+		int p25, p50, p75, x;
+		if (q->n < MIN_DIR_CNT) {
+			fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+			r->failed = 1;
+			free(q->a); // the samples are not needed any more; q->n is still read below
+			continue;
+		} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+		ks_introsort_64(q->n, q->a);
+		p25 = q->a[(int)(.25 * q->n + .499)];
+		p50 = q->a[(int)(.50 * q->n + .499)];
+		p75 = q->a[(int)(.75 * q->n + .499)];
+		r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
+		if (r->low < 1) r->low = 1;
+		r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
+		fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
+		fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
+		for (i = x = 0, r->avg = 0; i < q->n; ++i)
+			if (q->a[i] >= r->low && q->a[i] <= r->high)
+				r->avg += q->a[i], ++x;
+		r->avg /= x;
+		for (i = 0, r->std = 0; i < q->n; ++i)
+			if (q->a[i] >= r->low && q->a[i] <= r->high)
+				r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
+		r->std = sqrt(r->std / x);
+		fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
+		r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
+		r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
+		// widen the range so that it reaches at least MAX_STDDEV standard deviations on both sides
+		if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499);
+		if (r->high < r->avg + MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
+		if (r->low < 1) r->low = 1;
+		fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
+		free(q->a);
+	}
+	for (d = 0, max = 0; d < 4; ++d)
+		max = max > isize[d].n? max : isize[d].n;
+	for (d = 0; d < 4; ++d)
+		if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
+			pes[d].failed = 1;
+			fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+		}
+}
+
+int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
+{
+	int i, r, skip[4], n = 0;
+	for (r = 0; r < 4; ++r)
+		skip[r] = pes[r].failed? 1 : 0;
+	for (i = 0; i < ma->n; ++i) { // check which orinentation has been found
+		int64_t dist;
+		r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
+		if (dist >= pes[r].low && dist <= pes[r].high)
+			skip[r] = 1;
+	}
+	if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW
+	for (r = 0; r < 4; ++r) {
+		int is_rev, is_larger;
+		uint8_t *seq, *rev = 0, *ref;
+		int64_t rb, re, len;
+		if (skip[r]) continue;
+		is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
+		is_larger = !(r>>1); // whether the mate has larger coordinate
+		if (is_rev) {
+			rev = xmalloc(l_ms); // this is the reverse complement of $ms
+			for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4;
+			seq = rev;
+		} else seq = (uint8_t*)ms;
+		if (!is_rev) {
+			rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high;
+			re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length
+		} else {
+			rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands
+			re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low;
+		}
+		if (rb < 0) rb = 0;
+		if (re > l_pac<<1) re = l_pac<<1;
+		ref = bns_get_seq(l_pac, pac, rb, re, &len);
+		if (len == re - rb) { // no funny things happening
+			kswr_t aln;
+			mem_alnreg_t b;
+			int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250?
KSW_XBYTE : 0) | opt->min_seed_len; + aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); + memset(&b, 0, sizeof(mem_alnreg_t)); + if (aln.score >= opt->min_seed_len) { + b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; + b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; + b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; + b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; + b.score = aln.score; + b.csub = aln.score2; + b.secondary = -1; + b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1; +// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); + kv_push(mem_alnreg_t, *ma, b); // make room for a new element + // move b s.t. ma is sorted + for (i = 0; i < ma->n - 1; ++i) // find the insertion point + if (ma->a[i].score < b.score) break; + tmp = i; + for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; + ma->a[i] = b; + } + ++n; + } + if (rev) free(rev); + free(ref); + } + return n; +} + +int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) +{ + extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); + pair64_v v, u; + int r, i, k, y[4], ret; // y[] keeps the last hit + kv_init(v); kv_init(u); + for (r = 0; r < 2; ++r) { // loop through read number + for (i = 0; i < a[r].n; ++i) { + pair64_t key; + mem_alnreg_t *e = &a[r].a[i]; + key.x = e->rb < l_pac? 
e->rb : (l_pac<<1) - 1 - e->rb; // forward position + key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; + kv_push(pair64_t, v, key); + } + } + ks_introsort_128(v.n, v.a); + y[0] = y[1] = y[2] = y[3] = -1; + //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); + for (i = 0; i < v.n; ++i) { + for (r = 0; r < 2; ++r) { // loop through direction + int dir = r<<1 | (v.a[i].y>>1&1), which; + if (pes[dir].failed) continue; // invalid orientation + which = r<<1 | ((v.a[i].y&1)^1); + if (y[which] < 0) continue; // no previous hits + for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) + int64_t dist; + int q; + double ns; + pair64_t *p; + if ((v.a[k].y&3) != which) continue; + dist = (int64_t)v.a[i].x - v.a[k].x; + //printf("%d: %lld\n", k, dist); + if (dist > pes[dir].high) break; + if (dist < pes[dir].low) continue; + ns = (dist - pes[dir].avg) / pes[dir].std; + q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4) + if (q < 0) q = 0; + p = kv_pushp(pair64_t, u); + p->y = (uint64_t)k<<32 | i; + p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU); + //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); + } + } + y[v.a[i].y&3] = i; + } + if (u.n) { // found at least one proper pair + int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; + ks_introsort_128(u.n, u.a); + i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; + z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair + z[v.a[k].y&1] = v.a[k].y<<32>>34; + ret = u.a[u.n-1].x >> 32; + *sub = u.n > 1? 
u.a[u.n-2].x>>32 : 0; + for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i) + if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub; + } else ret = 0, *sub = 0, *n_sub = 0; + free(u.a); free(v.a); + return ret; +} + +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) +{ + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); + extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m); + extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); + extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); + extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m); + + int n = 0, i, j, z[2], o, subo, n_sub; + kstring_t str; + mem_alnreg_v b[2]; + bwahit_t h[2]; + + str.l = str.m = 0; str.s = 0; + // perform SW for the best alignment + kv_init(b[0]); kv_init(b[1]); + for (i = 0; i < 2; ++i) + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); + for (i = 0; i < 2; ++i) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) + n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + free(b[0].a); free(b[1].a); + mem_mark_primary_se(opt, a[0].n, a[0].a); + mem_mark_primary_se(opt, a[1].n, a[1].a); + if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; + // pairing single-end hits + if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { + int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; + // check if an end has multiple hits even after mate-SW + for (i = 0; i < 2; ++i) { + for (j = 1; j < a[i].n; ++j) + if (a[i].a[j].secondary < 0) break; + 
is_multi[i] = j < a[i].n? 1 : 0; + } + if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score + // compute mapQ for the best SE hit + score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; + subo = subo > score_un? subo : score_un; + q_pe = (o - subo) * 6; + if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); + if (q_pe < 0) q_pe = 0; + if (q_pe > 60) q_pe = 60; + // the following assumes no split hits + if (o > score_un) { // paired alignment is preferred + mem_alnreg_t *c[2]; + c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]]; + for (i = 0; i < 2; ++i) { + if (c[i]->secondary >= 0) + c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; + q_se[i] = mem_approx_mapq_se(opt, c[i]); + } + q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40; + q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40; + extra_flag |= 2; + // cap at the tandem repeat score + q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6; + q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? 
q_se[1] : (c[1]->score - c[1]->csub) * 6; + } else { // the unpaired alignment is preferred + z[0] = z[1] = 0; + q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); + q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); + } + mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re); + mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = xstrdup(str.s); str.l = 0; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s; + } else goto no_pairing; + return n; + +no_pairing: + for (i = 0; i < 2; ++i) { + if (a[i].n) { + mem_alnreg2hit(&a[i].a[0], &h[i]); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); + } else h[i].rb = h[i].re = -1; + } + mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); + mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); + return n; +} diff --git a/bwape.c b/bwape.c index f16d684..f0ecb7a 100644 --- a/bwape.c +++ b/bwape.c @@ -10,6 +10,7 @@ #include "utils.h" #include "stdaln.h" #include "bwase.h" +#include "bwa.h" typedef struct { int n; @@ -21,24 +22,15 @@ typedef struct { bwtint_t low, high, high_bayesian; } isize_info_t; -typedef struct { - uint64_t x, y; -} b128_t; - -#define b128_lt(a, b) ((a).x < (b).x) #define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) #define b128_hash(a) ((uint32_t)(a).x) #include "khash.h" -KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq) - -#include "ksort.h" -KSORT_INIT(b128, b128_t, b128_lt) -KSORT_INIT_GENERIC(uint64_t) +KHASH_INIT(b128, 
pair64_t, poslist_t, 1, b128_hash, b128_eq) typedef struct { - kvec_t(b128_t) arr; - kvec_t(b128_t) pos[2]; + pair64_v arr; + pair64_v pos[2]; kvec_t(bwt_aln1_t) aln[2]; } pe_data_t; @@ -69,19 +61,6 @@ pe_opt_t *bwa_init_pe_opt() po->ap_prior = 1e-5; return po; } - -static inline uint64_t hash_64(uint64_t key) -{ - key += ~(key << 32); - key ^= (key >> 22); - key += ~(key << 13); - key ^= (key >> 8); - key += (key << 3); - key ^= (key >> 15); - key += ~(key << 27); - key ^= (key >> 31); - return key; -} /* static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); { @@ -120,7 +99,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double free(isizes); return -1; } - ks_introsort(uint64_t, tot, isizes); + ks_introsort_64(tot, isizes); p25 = isizes[(int)(tot*0.25 + 0.5)]; p50 = isizes[(int)(tot*0.50 + 0.5)]; p75 = isizes[(int)(tot*0.75 + 0.5)]; @@ -170,7 +149,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, { int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; uint64_t o_score, subo_score; - b128_t last_pos[2][2], o_pos[2]; + pair64_t last_pos[2][2], o_pos[2]; max_len = p[0]->full_len; if (max_len < p[1]->full_len) max_len = p[1]->full_len; if (low_bound < max_len) low_bound = max_len; @@ -206,11 +185,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, o_score = subo_score = (uint64_t)-1; o_n = subo_n = 0; - ks_introsort(b128, d->arr.n, d->arr.a); + ks_introsort_128(d->arr.n, d->arr.a); for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; if (opt->type == BWA_PET_STD) { for (i = 0; i < d->arr.n; ++i) { - b128_t x = d->arr.a[i]; + pair64_t x = d->arr.a[i]; int strand = x.y>>1&1; if (strand == 1) { // reverse strand, then check int y = 1 - (x.y&1); @@ -221,19 +200,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, last_pos[x.y&1][1] = x; } } 
- } else if (opt->type == BWA_PET_SOLID) { - for (i = 0; i < d->arr.n; ++i) { - b128_t x = d->arr.a[i]; - int strand = x.y>>1&1; - if ((strand^x.y)&1) { // push - int y = 1 - (x.y&1); - __pairing_aux(last_pos[y][1], x); - __pairing_aux(last_pos[y][0], x); - } else { // check - last_pos[x.y&1][0] = last_pos[x.y&1][1]; - last_pos[x.y&1][1] = x; - } - } } else { fprintf(stderr, "[paring] not implemented yet!\n"); exit(1); @@ -345,7 +311,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) { // only when both ends mapped - b128_t x; + pair64_t x; int j, k; long long n_occ[2]; for (j = 0; j < 2; ++j) { @@ -360,7 +326,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw bwt_aln1_t *r = d->aln[j].a + k; bwtint_t l; if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table - b128_t key; + pair64_t key; int ret; key.x = r->k; key.y = r->l; khint_t iter = kh_put(b128, g_hash, key, &ret); @@ -377,14 +343,14 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw for (l = 0; l < kh_val(g_hash, iter).n; ++l) { x.x = kh_val(g_hash, iter).a[l]>>1; x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; - kv_push(b128_t, d->arr, x); + kv_push(pair64_t, d->arr, x); } } else { // then calculate on the fly for (l = r->k; l <= r->l; ++l) { int strand; x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); x.y = k<<2 | strand<<1 | j; - kv_push(b128_t, d->arr, x); + kv_push(pair64_t, d->arr, x); } } } @@ -576,11 +542,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, ++n_tot[is_singleton]; cigar[0] = cigar[1] = 0; n_cigar[0] = n_cigar[1] = 0; - if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered + if (popt->type != BWA_PET_STD) continue; // other types of 
pairing is not considered for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified ubyte_t *seq; if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip - if (popt->type == BWA_PET_STD) { + { // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate __set_rght_coor(beg[k], end[k], p[1-k], p[k]); seq = p[k]->rseq; @@ -589,17 +555,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, seq = p[k]->seq; seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly } - } else { // BWA_PET_SOLID - if (p[1-k]->strand == 0) { // R3-F3 pairing - if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->rseq; - seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed - } else { // F3-R3 pairing - if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->seq; - } } // perform SW alignment cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); @@ -656,14 +611,14 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, return pacseq; } -void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) +void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, j, n_seqs, tot_seqs = 0; bwa_seq_t *seqs[2]; bwa_seqio_t *ks[2]; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa[2]; gap_opt_t opt, opt0; khint_t iter; @@ -688,10 
+643,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f opt0 = opt; err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); - if (!(opt.mode & BWA_MODE_COMPREAD)) { - popt->type = BWA_PET_SOLID; - ntbns = bwa_open_nt(prefix); - } else { // for Illumina alignment only + { // for Illumina alignment only if (popt->is_preload) { strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); @@ -702,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f } // core loop - bwa_print_sam_SQ(bns); + bwa_print_sam_hdr(bns, rg_line); bwa_print_sam_PG(); while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; @@ -724,7 +676,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... 
"); for (j = 0; j < 2; ++j) - bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); if (pac == 0) free(pacseq); @@ -749,7 +701,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // destroy bns_destroy(bns); - if (ntbns) bns_destroy(ntbns); for (i = 0; i < 2; ++i) { bwa_seq_close(ks[i]); err_fclose(fp_sa[i]); @@ -764,21 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f int bwa_sai2sam_pe(int argc, char *argv[]) { - extern char *bwa_rg_line, *bwa_rg_id; - extern int bwa_set_rg(const char *s); - extern char *bwa_infer_prefix(const char *hint); int c; pe_opt_t *popt; - char *prefix; + char *prefix, *rg_line = 0; popt = bwa_init_pe_opt(); while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { switch (c) { case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'a': popt->max_isize = atoi(optarg); break; case 'o': popt->max_occ = atoi(optarg); break; @@ -812,13 +757,11 @@ int bwa_sai2sam_pe(int argc, char *argv[]) fprintf(stderr, "\n"); return 1; } - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - free(bwa_rg_line); free(bwa_rg_id); return 0; } - bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt); - free(bwa_rg_line); free(bwa_rg_id); free(prefix); - free(popt); + bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); + free(prefix); free(popt); return 0; } diff --git a/bwase.c b/bwase.c index 84430fe..e5204e9 100644 --- a/bwase.c +++ b/bwase.c @@ -10,9 +10,9 @@ #include "bntseq.h" #include "utils.h" #include "kstring.h" +#include "bwa.h" int g_log_n[256]; 
-char *bwa_rg_line, *bwa_rg_id; void bwa_print_sam_PG(); @@ -71,8 +71,8 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma } rest -= q->l - q->k + 1; } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. - int j, i, k; - for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) { + int j, i; + for (j = rest, i = q->l - q->k + 1; j > 0; --j) { double p = 1.0, x = drand48(); while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; @@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s) s->len = s->full_len; } -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns) +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) { - ubyte_t *pacseq, *ntpac = 0; + ubyte_t *pacseq; int i, j; kstring_t *str; - if (ntbns) { // in color space - ntpac = (ubyte_t*)xcalloc(ntbns->l_pac/4+1, 1); - err_rewind(ntbns->fp_pac); - err_fread_noeof(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac); - } - if (!_pacseq) { pacseq = (ubyte_t*)xcalloc(bns->l_pac/4+1, 1); err_rewind(bns->fp_pac); @@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); } -#if 0 - if (ntbns) { // in color space - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - bwa_cs2nt_core(s, bns->l_pac, ntpac); - for (j = 0; j < s->n_multi; ++j) { - bwt_multi1_t *q = s->multi + j; - int n_cigar; - if (q->gap == 0) continue; - free(q->cigar); - q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand? 
1 : -1) * q->gap, &n_cigar, 0); - q->n_cigar = n_cigar; - } - if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again - free(s->cigar); - s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); - } - } - } -#endif // generate MD tag str = (kstring_t*)xcalloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { @@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t if (s->type != BWA_TYPE_NO_MATCH) { int nm; s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, - bns->l_pac, ntbns? ntpac : pacseq, str, &nm); + bns->l_pac, pacseq, str, &nm); s->nm = nm; } } free(str->s); free(str); // correct for trimmed reads - if (!ntbns) // trimming is only enabled for Illumina reads - for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); + for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); if (!_pacseq) free(pacseq); - free(ntpac); } int64_t pos_end(const bwa_seq_t *p) @@ -462,11 +432,11 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in // print mate coordinate if (mate && mate->type != BWA_TYPE_NO_MATCH) { - int m_seqid, m_is_N; + int m_seqid; long long isize; am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality // redundant calculation here, but should not matter too much - m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); + bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); isize = (seqid == m_seqid)? 
pos_5(mate) - pos_5(p) : 0; if (p->type == BWA_TYPE_NO_MATCH) isize = 0; @@ -482,7 +452,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in err_printf("%s", p->qual); } else err_printf("*"); - if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); if (p->type != BWA_TYPE_NO_MATCH) { @@ -532,74 +502,20 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality err_printf("%s", p->qual); } else err_printf("*"); - if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); err_putchar('\n'); } } -bntseq_t *bwa_open_nt(const char *prefix) -{ - bntseq_t *ntbns; - char *str; - str = (char*)xcalloc(strlen(prefix) + 10, 1); - strcat(strcpy(str, prefix), ".nt"); - ntbns = bns_restore(str); - free(str); - return ntbns; -} - -void bwa_print_sam_SQ(const bntseq_t *bns) -{ - int i; - for (i = 0; i < bns->n_seqs; ++i) - err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); - if (bwa_rg_line) err_printf("%s\n", bwa_rg_line); -} - void bwase_initialize() { int i; for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); } -char *bwa_escape(char *s) -{ - char *p, *q; - for (p = q = s; *p; ++p) { - if (*p == '\\') { - ++p; - if (*p == 't') *q++ = '\t'; - else if (*p == 'n') *q++ = '\n'; - else if (*p == 'r') *q++ = '\r'; - else if (*p == '\\') *q++ = '\\'; - } else *q++ = *p; - } - *q = '\0'; - return s; -} - -int bwa_set_rg(const char *s) -{ - char *p, *q, *r; - if (strstr(s, "@RG") != s) return -1; - if (bwa_rg_line) free(bwa_rg_line); - if (bwa_rg_id) free(bwa_rg_id); - bwa_rg_line = xstrdup(s); - 
bwa_rg_id = 0; - bwa_escape(bwa_rg_line); - p = strstr(bwa_rg_line, "\tID:"); - if (p == 0) return -1; - p += 4; - for (q = p; *q && *q != '\t' && *q != '\n'; ++q); - bwa_rg_id = xcalloc(q - p + 1, 1); - for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) - *r++ = *q; - return 0; -} - -void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) +void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, n_seqs, tot_seqs = 0, m_aln; @@ -607,7 +523,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa; gap_opt_t opt; @@ -619,9 +535,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f m_aln = 0; err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); - if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac - ntbns = bwa_open_nt(prefix); - bwa_print_sam_SQ(bns); + bwa_print_sam_hdr(bns, rg_line); //bwa_print_sam_PG(); // set ks ks = bwa_open_reads(opt.mode, fn_fa); @@ -648,7 +562,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); - bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] print alignments... 
"); @@ -662,7 +576,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f // destroy bwa_seq_close(ks); - if (ntbns) bns_destroy(ntbns); bns_destroy(bns); err_fclose(fp_sa); free(aln); @@ -670,17 +583,13 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f int bwa_sai2sam_se(int argc, char *argv[]) { - extern char *bwa_infer_prefix(const char *hint); int c, n_occ = 3; - char *prefix; + char *prefix, *rg_line = 0; while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { switch (c) { case 'h': break; case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'n': n_occ = atoi(optarg); break; case 'f': xreopen(optarg, "w", stdout); break; @@ -692,12 +601,10 @@ int bwa_sai2sam_se(int argc, char *argv[]) fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); return 1; } - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - free(bwa_rg_line); free(bwa_rg_id); return 0; } - bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ); - free(bwa_rg_line); free(bwa_rg_id); + bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); return 0; } diff --git a/bwase.h b/bwase.h index f8e9b0a..26a9f68 100644 --- a/bwase.h +++ b/bwase.h @@ -14,7 +14,7 @@ extern "C" { // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); // Refine the approximate position of the sequence to an actual placement for the sequence. 
- void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); + void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); // Backfill certain alignment properties mainly centering around number of matches. void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); // Calculate the end position of a read given a certain sequence. diff --git a/bwaseqio.c b/bwaseqio.c index 8d69b37..57ed654 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -5,7 +5,7 @@ #include "bamlite.h" #include "kseq.h" -KSEQ_INIT(gzFile, err_gzread) +KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; diff --git a/bwt.c b/bwt.c index eb85bb0..412dce5 100644 --- a/bwt.c +++ b/bwt.c @@ -45,6 +45,14 @@ void bwt_gen_cnt_table(bwt_t *bwt) } } +static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA +{ + bwtint_t x = k - (k > bwt->primary); + x = bwt_B0(bwt, x); + x = bwt->L2[x] + bwt_occ(bwt, k, x); + return k == bwt->primary? 
0 : x; +} + // bwt->bwt and bwt->occ must be precalculated void bwt_cal_sa(bwt_t *bwt, int intv) { @@ -93,21 +101,20 @@ static inline int __occ_aux(uint64_t y, int c) bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) { - bwtint_t n, l, j; - uint32_t *p; + bwtint_t n; + uint32_t *p, *end; if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; if (k == (bwtint_t)(-1)) return 0; - if (k >= bwt->primary) --k; // because $ is not in bwt + k -= (k >= bwt->primary); // because $ is not in bwt // retrieve Occ at k/OCC_INTERVAL n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; p += sizeof(bwtint_t); // jump to the start of the first BWT cell // calculate Occ up to the last k/32 - j = k >> 5 << 5; - for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2) - n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1); + for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); // calculate Occ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); @@ -156,20 +163,20 @@ void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) { - bwtint_t l, j, x; - uint32_t *p; + bwtint_t x; + uint32_t *p, tmp, *end; if (k == (bwtint_t)(-1)) { memset(cnt, 0, 4 * sizeof(bwtint_t)); return; } - if (k >= bwt->primary) --k; // because $ is not in bwt + k -= (k >= bwt->primary); // because $ is not in bwt p = bwt_occ_intv(bwt, k); memcpy(cnt, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); - j = k >> 4 << 4; - for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p) - x += __occ_aux4(bwt, *p); - x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) + end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop + for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p); + tmp = *p & 
~((1U<<((~k&15)<<1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; } @@ -177,29 +184,30 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) { bwtint_t _k, _l; - _k = (k >= bwt->primary)? k-1 : k; - _l = (l >= bwt->primary)? l-1 : l; - if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + _k = k - (k >= bwt->primary); + _l = l - (l >= bwt->primary); + if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { bwt_occ4(bwt, k, cntk); bwt_occ4(bwt, l, cntl); } else { - bwtint_t i, j, x, y; - uint32_t *p; - if (k >= bwt->primary) --k; // because $ is not in bwt - if (l >= bwt->primary) --l; + bwtint_t x, y; + uint32_t *p, tmp, *endk, *endl; + k -= (k >= bwt->primary); // because $ is not in bwt + l -= (l >= bwt->primary); p = bwt_occ_intv(bwt, k); memcpy(cntk, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) // prepare cntk[] - j = k >> 4 << 4; - for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p) - x += __occ_aux4(bwt, *p); + endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); + endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4)); + for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p); y = x; - x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + tmp = *p & ~((1U<<((~k&15)<<1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k&15); // calculate cntl[] and finalize cntk[] - j = l >> 4 << 4; - for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); - y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15); + for (; p < endl; ++p) y += __occ_aux4(bwt, *p); + tmp = *p & ~((1U<<((~l&15)<<1)) - 1); + y += __occ_aux4(bwt, tmp) - (~l&15); memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] 
+= x>>16&0xff; cntk[3] += x>>24; cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; @@ -273,7 +281,7 @@ static void bwt_reverse_intvs(bwtintv_v *p) } } -int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret; bwtintv_t ik, ok[4]; @@ -281,45 +289,45 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem mem->n = 0; if (q[x] > 3) return x + 1; + if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 kv_init(a[0]); kv_init(a[1]); - prev = tmpvec[0]? tmpvec[0] : &a[0]; - curr = tmpvec[1]? tmpvec[1] : &a[1]; - bwt_set_intv(bwt, q[x], ik); + prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided + curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base ik.info = x + 1; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] < 4) { - c = 3 - q[i]; + if (q[i] < 4) { // an A/C/G/T base + c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); - if (ok[c].x[2] != ik.x[2]) // change of the interval size + if (ok[c].x[2] != ik.x[2]) { // change of the interval size kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] == 0) break; // cannot be extended + if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further + } ik = ok[c]; ik.info = i + 1; } else { // an ambiguous base kv_push(bwtintv_t, *curr, ik); - break; // cannot be extended; in this case, ia[0].info; // this will be the returned value swap = curr; curr = prev; prev = swap; for (i = x - 1; i >= -1; --i) { // backward search for MEMs - if (q[i] > 3) break; - c = i < 0? 0 : q[i]; + c = i < 0? -1 : q[i] < 4? 
q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; bwt_extend(bwt, p, ok, 1); - if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further - if (curr->n == 0) { // curr->n to make sure there is no longer matches + if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough + if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; kv_push(bwtintv_t, *mem, ik); } } // otherwise the match is contained in another longer match - } - if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) { ok[c].info = p->info; kv_push(bwtintv_t, *curr, ok[c]); } @@ -329,7 +337,85 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem } bwt_reverse_intvs(mem); // s.t. 
sorted by the start coordinate - if (tmpvec[0] == 0) free(a[0].a); - if (tmpvec[1] == 0) free(a[1].a); + if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a); + if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } + +/************************* + * Read/write BWT and SA * + *************************/ + +void bwt_dump_bwt(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp); + err_fflush(fp); + err_fclose(fp); +} + +void bwt_dump_sa(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + err_fflush(fp); + err_fclose(fp); +} + +void bwt_restore_sa(const char *fn, bwt_t *bwt) +{ + char skipped[256]; + FILE *fp; + bwtint_t primary; + + fp = xopen(fn, "rb"); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); + err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip + err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); + + bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; + bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa[0] = -1; + + err_fread_noeof(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + err_fclose(fp); +} + +bwt_t *bwt_restore_bwt(const char *fn) +{ + bwt_t *bwt; + FILE *fp; + + bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t)); + fp = xopen(fn, "rb"); + err_fseek(fp, 0, SEEK_END); + bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2; + 
bwt->bwt = (uint32_t*)xcalloc(bwt->bwt_size, 4); + err_fseek(fp, 0, SEEK_SET); + err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fread_noeof(bwt->bwt, 4, bwt->bwt_size, fp); + bwt->seq_len = bwt->L2[4]; + err_fclose(fp); + bwt_gen_cnt_table(bwt); + + return bwt; +} + +void bwt_destroy(bwt_t *bwt) +{ + if (bwt == 0) return; + free(bwt->sa); free(bwt->bwt); + free(bwt); +} diff --git a/bwt.h b/bwt.h index 5823f82..e7b0f97 100644 --- a/bwt.h +++ b/bwt.h @@ -30,8 +30,10 @@ #include -// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line -#define OCC_INTERVAL 0x80 +// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 +#define OCC_INTV_SHIFT 7 +#define OCC_INTERVAL (1LL<>((~(k)&0xf)<<1)&3) -// inverse Psi function -#define bwt_invPsi(bwt, k) \ - (((k) == (bwt)->primary)? 0 : \ - ((k) < (bwt)->primary)? \ - (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ - : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) - #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0) #ifdef __cplusplus @@ -121,7 +116,9 @@ extern "C" { * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. * Return the end of the longest exact match starting from _x_. 
*/ - int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + + // SMEM iterator interface #ifdef __cplusplus } diff --git a/bwt_gen.c b/bwt_gen.c index 48bd662..e9a5c93 100644 --- a/bwt_gen.c +++ b/bwt_gen.c @@ -1449,7 +1449,7 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB } err_fseek(packedFile, -1, SEEK_END); - packedFileLen = ftell(packedFile); + packedFileLen = err_ftell(packedFile); err_fread_noeof(&lastByteLength, sizeof(unsigned char), 1, packedFile); totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); diff --git a/bwtaln.c b/bwtaln.c index 109f964..2b6a643 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -11,6 +11,7 @@ #include "bwtaln.h" #include "bwtgap.h" #include "utils.h" +#include "bwa.h" #ifdef HAVE_PTHREAD #include @@ -219,32 +220,6 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) bwa_seq_close(ks); } -char *bwa_infer_prefix(const char *hint) -{ - char *prefix; - int l_hint; - FILE *fp; - l_hint = strlen(hint); - prefix = xmalloc(l_hint + 3 + 4 + 1); - strcpy(prefix, hint); - strcpy(prefix + l_hint, ".64.bwt"); - if ((fp = fopen(prefix, "rb")) != 0) { - fclose(fp); - prefix[l_hint + 3] = 0; - return prefix; - } else { - strcpy(prefix + l_hint, ".bwt"); - if ((fp = fopen(prefix, "rb")) == 0) { - free(prefix); - return 0; - } else { - fclose(fp); - prefix[l_hint] = 0; - return prefix; - } - } -} - int bwa_aln(int argc, char *argv[]) { int c, opte = -1; @@ -252,7 +227,7 @@ int bwa_aln(int argc, char *argv[]) char *prefix; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), 
opt->max_diff = -1; @@ -272,7 +247,6 @@ int bwa_aln(int argc, char *argv[]) case 'L': opt->mode |= BWA_MODE_LOGGAP; break; case 'R': opt->max_top2 = atoi(optarg); break; case 'q': opt->trim_qual = atoi(optarg); break; - case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; case 'f': xreopen(optarg, "wb", stdout); break; case 'b': opt->mode |= BWA_MODE_BAM; break; @@ -310,7 +284,6 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); fprintf(stderr, " -B INT length of barcode\n"); -// fprintf(stderr, " -c input sequences are in the color space\n"); fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); @@ -330,7 +303,7 @@ int bwa_aln(int argc, char *argv[]) k = l; } } - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); free(opt); return 0; diff --git a/bwtaln.h b/bwtaln.h index 39eaf4b..412cc04 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -107,7 +107,6 @@ typedef struct { } gap_opt_t; #define BWA_PET_STD 1 -#define BWA_PET_SOLID 2 typedef struct { int max_isize, force_isize; diff --git a/bwtindex.c b/bwtindex.c index 6d0604e..0d2a832 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -36,17 +36,160 @@ #include "main.h" #include "utils.h" -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is); -void bwa_pac_rev_core(const char *fn, const char *fn_rev); +#ifdef _DIVBWT +#include "divsufsort.h" +#endif -int bwa_index(int argc, char *argv[]) +int is_bwt(ubyte_t *T, int n); + +int64_t bwa_seq_len(const char *fn_pac) { + FILE *fp; + int64_t 
pac_len; + ubyte_t c; + fp = xopen(fn_pac, "rb"); + err_fseek(fp, -1, SEEK_END); + pac_len = err_ftell(fp); + err_fread_noeof(&c, 1, 1, fp); + err_fclose(fp); + return (pac_len - 1) * 4 + (int)c; +} + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) +{ + bwt_t *bwt; + ubyte_t *buf, *buf2; + int i, pac_size; + FILE *fp; + + // initialization + bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t)); + bwt->seq_len = bwa_seq_len(fn_pac); + bwt->bwt_size = (bwt->seq_len + 15) >> 4; + fp = xopen(fn_pac, "rb"); + + // prepare sequence + pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); + buf2 = (ubyte_t*)xcalloc(pac_size, 1); + err_fread_noeof(buf2, 1, pac_size, fp); + err_fclose(fp); + memset(bwt->L2, 0, 5 * 4); + buf = (ubyte_t*)xcalloc(bwt->seq_len + 1, 1); + for (i = 0; i < bwt->seq_len; ++i) { + buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; + ++bwt->L2[1+buf[i]]; + } + for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; + free(buf2); + + // Burrows-Wheeler Transform + if (use_is) { + bwt->primary = is_bwt(buf, bwt->seq_len); + } else { +#ifdef _DIVBWT + bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); +#else + err_fatal_simple("libdivsufsort is not compiled in."); +#endif + } + bwt->bwt = (u_int32_t*)xcalloc(bwt->bwt_size, 4); + for (i = 0; i < bwt->seq_len; ++i) + bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); + free(buf); + return bwt; +} + +int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required! 
+{ + bwt_t *bwt; + int c, use_is = 1; + while ((c = getopt(argc, argv, "d")) >= 0) { + switch (c) { + case 'd': use_is = 0; break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); + return 1; + } + bwt = bwt_pac2bwt(argv[optind], use_is); + bwt_dump_bwt(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +void bwt_bwtupdate_core(bwt_t *bwt) +{ + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size + buf = (uint32_t*)xcalloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + for (i = k = 0; i < bwt->seq_len; ++i) { + if (i % OCC_INTERVAL == 0) { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) + } + if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 + ++c[bwt_B00(bwt, i)]; + } + // the last element + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); + // update bwt + free(bwt->bwt); bwt->bwt = buf; +} + +int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command +{ + bwt_t *bwt; + if (argc < 2) { + fprintf(stderr, "Usage: bwa bwtupdate \n"); + return 1; + } + bwt = bwt_restore_bwt(argv[1]); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(argv[1], bwt); + bwt_destroy(bwt); + return 0; +} + +int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command +{ + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_sa(bwt, sa_intv); + bwt_dump_sa(argv[optind+1], bwt); + 
bwt_destroy(bwt); + return 0; +} + +int bwa_index(int argc, char *argv[]) // the "index" command +{ + extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); + char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 0, is_color = 0, is_64 = 0; + int c, algo_type = 0, is_64 = 0; clock_t t; int64_t l_pac; - while ((c = getopt(argc, argv, "6ca:p:")) >= 0) { + while ((c = getopt(argc, argv, "6a:p:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; @@ -55,7 +198,6 @@ int bwa_index(int argc, char *argv[]) else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; case 'p': prefix = xstrdup(optarg); break; - case 'c': is_color = 1; break; case '6': is_64 = 1; break; default: return 1; } @@ -67,7 +209,6 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); -// fprintf(stderr, " -c build color-space index\n"); fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n"); @@ -83,29 +224,13 @@ int bwa_index(int argc, char *argv[]) str2 = (char*)xcalloc(strlen(prefix) + 10, 1); str3 = (char*)xcalloc(strlen(prefix) + 10, 1); - if (is_color == 0) { // nucleotide indexing + { // nucleotide indexing gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); err_gzclose(fp); - } else { // color indexing - gzFile fp = xzopen(argv[optind], "r"); - strcat(strcpy(str, prefix), ".nt"); - t = clock(); - fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... 
"); - l_pac = bns_fasta2bntseq(fp, str, 0); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - err_gzclose(fp); - { - char *tmp_argv[3]; - tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix; - t = clock(); - fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... "); - bwa_pac2cspac(3, tmp_argv); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { diff --git a/bwtio.c b/bwtio.c deleted file mode 100644 index ca5a6c0..0000000 --- a/bwtio.c +++ /dev/null @@ -1,79 +0,0 @@ -#include -#include -#include -#include "bwt.h" -#include "utils.h" - -void bwt_dump_bwt(const char *fn, const bwt_t *bwt) -{ - FILE *fp = NULL; - fp = xopen(fn, "wb"); - err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp); - err_fflush(fp); - err_fclose(fp); -} - -void bwt_dump_sa(const char *fn, const bwt_t *bwt) -{ - FILE *fp; - fp = xopen(fn, "wb"); - err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); - err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); - err_fflush(fp); - err_fclose(fp); -} - -void bwt_restore_sa(const char *fn, bwt_t *bwt) -{ - char skipped[256]; - FILE *fp; - bwtint_t primary; - - fp = xopen(fn, "rb"); - err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); - xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); - err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip - err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); - xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); - - bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / 
bwt->sa_intv; - bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t)); - bwt->sa[0] = -1; - - err_fread_noeof(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); - err_fclose(fp); -} - -bwt_t *bwt_restore_bwt(const char *fn) -{ - bwt_t *bwt; - FILE *fp; - - bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t)); - fp = xopen(fn, "rb"); - err_fseek(fp, 0, SEEK_END); - bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2; - bwt->bwt = (uint32_t*)xcalloc(bwt->bwt_size, 4); - err_fseek(fp, 0, SEEK_SET); - err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp); - err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp); - err_fread_noeof(bwt->bwt, 4, bwt->bwt_size, fp); - bwt->seq_len = bwt->L2[4]; - err_fclose(fp); - bwt_gen_cnt_table(bwt); - - return bwt; -} - -void bwt_destroy(bwt_t *bwt) -{ - if (bwt == 0) return; - free(bwt->sa); free(bwt->bwt); - free(bwt); -} diff --git a/bwtmisc.c b/bwtmisc.c deleted file mode 100644 index ccc82eb..0000000 --- a/bwtmisc.c +++ /dev/null @@ -1,231 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#include -#include -#include -#include -#include "bntseq.h" -#include "utils.h" -#include "main.h" -#include "bwt.h" - -#ifdef _DIVBWT -#include "divsufsort.h" -#endif - -int is_bwt(ubyte_t *T, int n); - -int64_t bwa_seq_len(const char *fn_pac) -{ - FILE *fp; - int64_t pac_len; - ubyte_t c; - fp = xopen(fn_pac, "rb"); - err_fseek(fp, -1, SEEK_END); - pac_len = err_ftell(fp); - err_fread_noeof(&c, 1, 1, fp); - err_fclose(fp); - return (pac_len - 1) * 4 + (int)c; -} - -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) -{ - bwt_t *bwt; - ubyte_t *buf, *buf2; - int i, pac_size; - FILE *fp; - - // initialization - bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t)); - bwt->seq_len = bwa_seq_len(fn_pac); - bwt->bwt_size = (bwt->seq_len + 15) >> 4; - fp = xopen(fn_pac, "rb"); - - // prepare sequence - pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 
0 : 1); - buf2 = (ubyte_t*)xcalloc(pac_size, 1); - err_fread_noeof(buf2, 1, pac_size, fp); - err_fclose(fp); - memset(bwt->L2, 0, 5 * 4); - buf = (ubyte_t*)xcalloc(bwt->seq_len + 1, 1); - for (i = 0; i < bwt->seq_len; ++i) { - buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; - ++bwt->L2[1+buf[i]]; - } - for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; - free(buf2); - - // Burrows-Wheeler Transform - if (use_is) { - bwt->primary = is_bwt(buf, bwt->seq_len); - } else { -#ifdef _DIVBWT - bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); -#else - err_fatal_simple("libdivsufsort is not compiled in."); -#endif - } - bwt->bwt = (u_int32_t*)xcalloc(bwt->bwt_size, 4); - for (i = 0; i < bwt->seq_len; ++i) - bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); - free(buf); - return bwt; -} - -int bwa_pac2bwt(int argc, char *argv[]) -{ - bwt_t *bwt; - int c, use_is = 1; - while ((c = getopt(argc, argv, "d")) >= 0) { - switch (c) { - case 'd': use_is = 0; break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); - return 1; - } - bwt = bwt_pac2bwt(argv[optind], use_is); - bwt_dump_bwt(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -} - -#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) - -void bwt_bwtupdate_core(bwt_t *bwt) -{ - bwtint_t i, k, c[4], n_occ; - uint32_t *buf; - - n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; - bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size - buf = (uint32_t*)xcalloc(bwt->bwt_size, 4); // will be the new bwt - c[0] = c[1] = c[2] = c[3] = 0; - for (i = k = 0; i < bwt->seq_len; ++i) { - if (i % OCC_INTERVAL == 0) { - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) - } - if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 - ++c[bwt_B00(bwt, i)]; - } - // the last element - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - xassert(k + sizeof(bwtint_t) == bwt->bwt_size, 
"inconsistent bwt_size"); - // update bwt - free(bwt->bwt); bwt->bwt = buf; -} - -int bwa_bwtupdate(int argc, char *argv[]) -{ - bwt_t *bwt; - if (argc < 2) { - fprintf(stderr, "Usage: bwa bwtupdate \n"); - return 1; - } - bwt = bwt_restore_bwt(argv[1]); - bwt_bwtupdate_core(bwt); - bwt_dump_bwt(argv[1], bwt); - bwt_destroy(bwt); - return 0; -} - -const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; - -/* this function is not memory efficient, but this will make life easier - Ideally we should also change .amb files as one 'N' in the nucleotide - sequence leads to two ambiguous colors. I may do this later... */ -uint8_t *bwa_pac2cspac_core(const bntseq_t *bns) -{ - uint8_t *pac, *cspac; - bwtint_t i; - int c1, c2; - pac = (uint8_t*)xcalloc(bns->l_pac/4 + 1, 1); - cspac = (uint8_t*)xcalloc(bns->l_pac/4 + 1, 1); - err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); - err_rewind(bns->fp_pac); - c1 = pac[0]>>6; cspac[0] = c1<<6; - for (i = 1; i < bns->l_pac; ++i) { - c2 = pac[i>>2] >> (~i&3)*2 & 3; - cspac[i>>2] |= nst_color_space_table[(1< \n"); - return 1; - } - bns = bns_restore(argv[1]); - cspac = bwa_pac2cspac_core(bns); - bns_dump(bns, argv[2]); - // now write cspac - str = (char*)xcalloc(strlen(argv[2]) + 5, 1); - strcat(strcpy(str, argv[2]), ".pac"); - fp = xopen(str, "wb"); - err_fwrite(cspac, 1, bns->l_pac/4 + 1, fp); - ct = bns->l_pac % 4; - err_fwrite(&ct, 1, 1, fp); - err_fflush(fp); - err_fclose(fp); - bns_destroy(bns); - free(cspac); - return 0; -} - -int bwa_bwt2sa(int argc, char *argv[]) -{ - bwt_t *bwt; - int c, sa_intv = 32; - while ((c = getopt(argc, argv, "i:")) >= 0) { - switch (c) { - case 'i': sa_intv = atoi(optarg); break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); - return 1; - } - bwt = bwt_restore_bwt(argv[optind]); - bwt_cal_sa(bwt, sa_intv); - bwt_dump_sa(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -} diff --git 
a/bwtsw2_aux.c b/bwtsw2_aux.c index 2228054..3b2e5eb 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -13,9 +13,10 @@ #include "bwtsw2.h" #include "stdaln.h" #include "kstring.h" +#include "bwa.h" #include "kseq.h" -KSEQ_INIT(gzFile, err_gzread) +KSEQ_DECLARE(gzFile) #include "ksort.h" #define __left_lt(a, b) ((a).end > (b).end) @@ -186,14 +187,14 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 bsw2aux_t *q = b->aux + i; uint8_t *query; bwtint_t k; - int score, path_len, beg, end; + int path_len, beg, end; if (p->l) continue; beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 1 : 0] + beg; for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; - score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); + aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); #if 0 if (name && score != p->G) { // debugging only @@ -747,7 +748,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * // print and reset for (i = 0; i < _seq->n; ++i) { bsw2seq1_t *p = _seq->seq + i; - if (p->sam) printf("%s", p->sam); + if (p->sam) err_printf("%s", p->sam); free(p->name); free(p->seq); free(p->qual); free(p->sam); p->tid = -1; p->l = 0; p->name = p->seq = p->qual = p->sam = 0; @@ -756,28 +757,18 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * _seq->n = 0; } -static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p) -{ - p->tid = -1; - p->l = ks->seq.l; - p->name = xstrdup(ks->name.s); - p->seq = xstrdup(ks->seq.s); - p->qual = ks->qual.l? xstrdup(ks->qual.s) : 0; - p->comment = ks->comment.l? 
xstrdup(ks->comment.s) : 0; - p->sam = 0; -} - void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) { gzFile fp, fp2; kseq_t *ks, *ks2; - int l, size = 0, is_pe = 0; + int l, is_pe = 0, i, n; uint8_t *pac; bsw2seq_t *_seq; + bseq1_t *bseq; pac = xcalloc(bns->l_pac/4+1, 1); for (l = 0; l < bns->n_seqs; ++l) - printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); fp = xzopen(fn, "r"); ks = kseq_init(fp); @@ -787,34 +778,25 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c ks2 = kseq_init(fp2); is_pe = 1; } else fp2 = 0, ks2 = 0, is_pe = 0; - while (kseq_read(ks) >= 0) { - if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/') - ks->name.l -= 2, ks->name.s[ks->name.l] = 0; - if (_seq->n == _seq->max) { - _seq->max = _seq->max? _seq->max<<1 : 1024; + while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + int size = 0; + if (n > _seq->max) { + _seq->max = n; + kroundup32(_seq->max); _seq->seq = xrealloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); } - kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]); - size += ks->seq.l; - if (ks2) { - if (kseq_read(ks2) >= 0) { - if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/') - ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0; - kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge - size += ks->seq.l; - } else { - fprintf(stderr, "[%s] The second query file has fewer reads. 
Switched to the single-end mode for the following batches.\n", __func__); - is_pe = 0; - } - } - if (size > opt->chunk_size * opt->n_threads) { - fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target, is_pe); - size = 0; + _seq->n = n; + for (i = 0; i < n; ++i) { + bseq1_t *b = &bseq[i]; + bsw2seq1_t *p = &_seq->seq[i]; + p->tid = -1; p->l = b->l_seq; + p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0; + size += p->l; } + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); + free(bseq); + process_seqs(_seq, opt, bns, pac, target, is_pe); } - fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target, is_pe); // free free(pac); free(_seq->seq); free(_seq); diff --git a/bwtsw2_main.c b/bwtsw2_main.c index e3f57f8..ab126f2 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -6,14 +6,12 @@ #include "bwt.h" #include "bwtsw2.h" #include "utils.h" +#include "bwa.h" int bwa_bwtsw2(int argc, char *argv[]) { - extern char *bwa_infer_prefix(const char *hint); bsw2opt_t *opt; - bwt_t *target; - char buf[1024], *prefix; - bntseq_t *bns; + bwaidx_t *idx; int c; opt = bsw2_init_opt(); @@ -81,19 +79,10 @@ int bwa_bwtsw2(int argc, char *argv[]) opt->t *= opt->a; opt->coef *= opt->a; - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { - fprintf(stderr, "[%s] fail to locate the index\n", __func__); - return 0; - } - strcpy(buf, prefix); target = bwt_restore_bwt(strcat(buf, ".bwt")); - strcpy(buf, prefix); bwt_restore_sa(strcat(buf, ".sa"), target); - bns = bns_restore(prefix); - - bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0); - - bns_destroy(bns); - bwt_destroy(target); - free(opt); free(prefix); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 0; + bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? 
argv[optind+2] : 0); + bwa_idx_destroy(idx); + free(opt); return 0; } diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index d195a09..84c30e3 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -7,6 +7,7 @@ #include "bntseq.h" #include "bwtsw2.h" #include "kstring.h" +#include "utils.h" #ifndef _NO_SSE2 #include "ksw.h" #else @@ -25,7 +26,6 @@ typedef struct { bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) { - extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, k, x, p25, p50, p75, tmp, max_len = 0; uint64_t *isize; bsw2pestat_t r; @@ -45,7 +45,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; isize[k++] = l; } - ks_introsort_uint64_t(k, isize); + ks_introsort_64(k, isize); p25 = isize[(int)(.25 * k + .499)]; p50 = isize[(int)(.50 * k + .499)]; p75 = isize[(int)(.75 * k + .499)]; @@ -75,9 +75,9 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) r.low = tmp > max_len? tmp : max_len; if (r.low < 1) r.low = 1; r.high = (int)(p75 + 3. * (p75 - p25) + .499); - if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499); + if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499); r.low = tmp > max_len? tmp : max_len; - if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. 
+ .499); + if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499); ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); free(isize); return r; @@ -128,35 +128,24 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b seq[i] = nst_nt4_table[(int)mseq[i]]; } #ifndef _NO_SSE2 - { - ksw_query_t *q; - ksw_aux_t aux[2]; - // forward Smith-Waterman - aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0]; - q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); - ksw_sse2(q, end - beg, ref, &aux[0]); - free(q); - if (aux[0].score < opt->t) { - free(seq); - return; - } - ++aux[0].qe; ++aux[0].te; - // reverse Smith-Waterman - seq_reverse(aux[0].qe, seq, 0); - seq_reverse(aux[0].te, ref, 0); - q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat); - ksw_sse2(q, aux[0].te, ref, &aux[1]); - free(q); - ++aux[1].qe; ++aux[1].te; - // write output - a->G = aux[0].score; - a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2; + { // FIXME!!! The following block has not been tested since the update of the ksw library + int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
KSW_XBYTE : 0) | opt->t; + kswr_t aln; + aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0); + a->G = aln.score; + a->G2 = aln.score2; + if (a->G < opt->t) a->G = 0; if (a->G2 < opt->t) a->G2 = 0; if (a->G2) a->flag |= BSW2_FLAG_TANDEM; - a->k = beg + (aux[0].te - aux[1].te); - a->len = aux[1].te; - a->beg = aux[0].qe - aux[1].qe; - a->end = aux[0].qe; + a->k = beg + aln.tb; + a->len = aln.te - aln.tb + 1; + a->beg = aln.qb; + a->end = aln.qe + 1; + /* + printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n'); + printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n'); + printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len); + */ } #else { @@ -169,6 +158,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2); if (a->G < opt->t) a->G = 0; if (a->G2 < opt->t) a->G2 = 0; + if (a->G2) a->flag |= BSW2_FLAG_TANDEM; a->k = beg + path[0].i - 1; a->len = path[1].i - path[0].i + 1; a->beg = path[0].j - 1; diff --git a/cs2nt.c b/cs2nt.c deleted file mode 100644 index 3084f11..0000000 --- a/cs2nt.c +++ /dev/null @@ -1,192 +0,0 @@ -#include -#include -#include -#include "bwtaln.h" -#include "stdaln.h" -#include "utils.h" - -/* - Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we - decode as ATTGAC(RBGOG), there are one color change and one nt change; - if we decode as ATTAAC(RBRBG), there are two color changes. - - In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM - as the penalty; otherwise, we will use color quality as the - penalty. This means we always prefer two consistent color changes over - a nt change, but if a color has high quality, we may prefer one nt - change. - - In the above example, the penalties of the two types of decoding are - q(B)+25 and q(B)+q(O), respectively. 
If q(O)>25, we prefer the first; - otherwise the second. Note that no matter what we choose, the fourth - base will get a low nt quality. - */ - -#define COLOR_MM 19 -#define NUCL_MM 25 - -static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 }; - -/* - {A,C,G,T,N} -> {0,1,2,3,4} - nt_ref[0..size]: nucleotide reference: 0/1/2/3/4 - cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N - nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned) - btarray[0..4*size]: backtrack array (working space) - */ -void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray) -{ - int h[8], curr, last; - int x, y, xmin, hmin, k; - - // h[0..3] and h[4..7] are the current and last best score array, depending on curr and last - - // recursion: initial value - if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2); - else { - for (x = 0; x != 4; ++x) h[x] = NUCL_MM; - h[nt_ref[0]] = 0; - } - // recursion: main loop - curr = 1; last = 0; - for (k = 1; k <= size; ++k) { - for (x = 0; x != 4; ++x) { - int min = 0x7fffffff, ymin = 0; - for (y = 0; y != 4; ++y) { - int s = h[last<<2|y]; - if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<= 0; --k) - nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]]; -} -/* - nt_read[0..size]: nucleotide read sequence: 0/1/2/3 - cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N - tarray[0..size*2-1]: temporary array - */ -uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray) -{ - int k, c1, c2; - uint8_t *t2array = tarray + size; - // get the color sequence of nt_read - c1 = nt_read[0]; - for (k = 1; k <= size; ++k) { - c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case - tarray[k-1] = (c1 >= 4 || c2 >= 4)? 
4 : nst_ntnt2cs_table[1<>6 && tarray[k] == cs_read[k]>>6) { - q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10; - } else if (tarray[k-1] == cs_read[k-1]>>6) { - q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f); - } else if (tarray[k] == cs_read[k]>>6) { - q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f); - } // else, q = 0 - if (q < 0) q = 0; - if (q > 60) q = 60; - t2array[k] = nt_read[k]<<6 | q; - if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0; - } - return t2array + 1; // of size-2 -} - -// this function will be called when p->seq has been reversed by refine_gapped() -void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac) -{ - uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read; - int i, len; - uint8_t *seq; - - // set temporary arrays - if (p->type == BWA_TYPE_NO_MATCH) return; - len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space - ta = (uint8_t*)xmalloc(len * 7); - nt_ref = ta; - cs_read = nt_ref + len; - nt_read = cs_read + len; - btarray = nt_read + len; - tarray = nt_read + len; - -#define __gen_csbase(_cs, _i, _seq) do { \ - int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \ - if (q > 60) q = 60; \ - if (_seq[_i] > 3) q = 63; \ - (_cs) = _seq[_i]<<6 | q; \ - } while (0) - - // generate len, nt_ref[] and cs_read - seq = p->strand? p->rseq : p->seq; - nt_ref[0] = p->pos? 
bns_pac(pac, p->pos-1) : 4; - if (p->cigar == 0) { // no gap or clipping - len = p->len; - for (i = 0; i < p->len; ++i) { - __gen_csbase(cs_read[i], i, seq); - nt_ref[i+1] = bns_pac(pac, p->pos + i); - } - } else { - int k, z; - bwtint_t x, y; - x = p->pos; y = 0; - for (k = z = 0; k < p->n_cigar; ++k) { - int l = __cigar_len(p->cigar[k]); - if (__cigar_op(p->cigar[k]) == FROM_M) { - for (i = 0; i < l; ++i, ++x, ++y) { - __gen_csbase(cs_read[z], y, seq); - nt_ref[z+1] = bns_pac(pac, x); - ++z; - } - } else if (__cigar_op(p->cigar[k]) == FROM_I) { - for (i = 0; i < l; ++i, ++y) { - __gen_csbase(cs_read[z], y, seq); - nt_ref[z+1] = 4; - ++z; - } - } else if (__cigar_op(p->cigar[k]) == FROM_S) y += l; - else x += l; - } - len = z; - } - - cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray); - new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray); - - // update p - p->len = p->full_len = len - 1; - for (i = 0; i < p->len; ++i) { - if ((new_nt_read[i]&0x3f) == 63) { - p->qual[i] = 33; seq[i] = 4; - } else { - p->qual[i] = (new_nt_read[i]&0x3f) + 33; - seq[i] = new_nt_read[i]>>6; - } - } - p->qual[p->len] = seq[p->len] = 0; - if (p->strand) { - memcpy(p->seq, seq, p->len); - seq_reverse(p->len, p->seq, 1); - seq_reverse(p->len, p->qual, 0); - } else { - memcpy(p->rseq, seq, p->len); - seq_reverse(p->len, p->rseq, 1); - } - free(ta); -} diff --git a/fastmap.c b/fastmap.c index 5cb83fc..ec517e2 100644 --- a/fastmap.c +++ b/fastmap.c @@ -2,115 +2,174 @@ #include #include #include -#include "bntseq.h" -#include "bwt.h" +#include "bwa.h" +#include "bwamem.h" #include "kvec.h" +#include "utils.h" #include "kseq.h" #include "utils.h" -KSEQ_INIT(gzFile, err_gzread) +KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; -typedef struct { - const bwt_t *bwt; - const uint8_t *query; - int start, len; - bwtintv_v *tmpvec[2], *matches; -} smem_i; +void *kopen(const char *fn, int *_fd); +int kclose(void *a); -smem_i *smem_iter_init(const bwt_t *bwt) +int main_mem(int argc, 
char *argv[]) { - smem_i *iter; - iter = xcalloc(1, sizeof(smem_i)); - iter->bwt = bwt; - iter->tmpvec[0] = xcalloc(1, sizeof(bwtintv_v)); - iter->tmpvec[1] = xcalloc(1, sizeof(bwtintv_v)); - iter->matches = xcalloc(1, sizeof(bwtintv_v)); - return iter; -} + mem_opt_t *opt; + int fd, fd2, i, c, n, copy_comment = 0; + gzFile fp, fp2 = 0; + kseq_t *ks, *ks2 = 0; + bseq1_t *seqs; + bwaidx_t *idx; + char *rg_line = 0; + void *ko = 0, *ko2 = 0; -void smem_iter_destroy(smem_i *iter) -{ - free(iter->tmpvec[0]->a); - free(iter->tmpvec[1]->a); - free(iter->matches->a); - free(iter); -} + opt = mem_opt_init(); + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) { + if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 'w') opt->w = atoi(optarg); + else if (c == 'A') opt->a = atoi(optarg); + else if (c == 'B') opt->b = atoi(optarg); + else if (c == 'O') opt->q = atoi(optarg); + else if (c == 'E') opt->r = atoi(optarg); + else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; + else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; + else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; + else if (c == 'a') opt->flag |= MEM_F_ALL; + else if (c == 'p') opt->flag |= MEM_F_PE; + else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; + else if (c == 'c') opt->max_occ = atoi(optarg); + else if (c == 'v') bwa_verbose = atoi(optarg); + else if (c == 'r') opt->split_factor = atof(optarg); + else if (c == 'C') copy_comment = 1; + else if (c == 'R') { + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak + } else if (c == 's') opt->split_width = atoi(optarg); + } + if (opt->n_threads < 1) opt->n_threads = 1; + if (optind + 1 >= argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); + fprintf(stderr, "Algorithm options:\n\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); + fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); + fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); + fprintf(stderr, " -P skip pairing; perform mate SW only\n"); + fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); + fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); + fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); + fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); + fprintf(stderr, "\nInput/output options:\n\n"); + fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); + fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, "\n"); + 
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, " -H hard clipping\n"); + fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); + fprintf(stderr, "\n"); + free(opt); + return 1; + } -void smem_set_query(smem_i *iter, int len, const uint8_t *query) -{ - iter->query = query; - iter->start = 0; - iter->len = len; -} + mem_fill_scmat(opt->a, opt->b, opt->mat); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + bwa_print_sam_hdr(idx->bns, rg_line); -int smem_next(smem_i *iter) -{ - iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0; - if (iter->start >= iter->len || iter->start < 0) return -1; - while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases - if (iter->start == iter->len) return -1; - iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec); - return iter->start; + ko = kopen(argv[optind + 1], &fd); + fp = gzdopen(fd, "r"); + ks = kseq_init(fp); + if (optind + 2 < argc) { + if (opt->flag&MEM_F_PE) { + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__); + } else { + ko2 = kopen(argv[optind + 2], &fd2); + fp2 = gzdopen(fd2, "r"); + ks2 = kseq_init(fp2); + opt->flag |= MEM_F_PE; + } + } + while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + int64_t size = 0; + if (!copy_comment) + for (i = 0; i < n; ++i) { + free(seqs[i].comment); seqs[i].comment = 0; + } + for (i = 0; i < n; ++i) size += seqs[i].l_seq; + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); + mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, 
n, seqs); + free(seqs); + } + + free(opt); + bwa_idx_destroy(idx); + kseq_destroy(ks); + err_gzclose(fp); kclose(ko); + if (ks2) { + kseq_destroy(ks2); + err_gzclose(fp2); kclose(ko2); + } + return 0; } int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0; kseq_t *seq; bwtint_t k; gzFile fp; - bwt_t *bwt; - bntseq_t *bns; - smem_i *iter; + smem_i *itr; + const bwtintv_v *a; + bwaidx_t *idx; - while ((c = getopt(argc, argv, "w:l:s")) >= 0) { + while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) { switch (c) { - case 's': print_seq = 1; break; + case 's': split_width = atoi(optarg); break; + case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] \n", split_width, min_len, min_iwidth); return 1; } fp = xzopen(argv[optind + 1], "r"); seq = kseq_init(fp); - { // load the packed sequences, BWT and SA - char *tmp = xcalloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(tmp, argv[optind]), ".bwt"); - bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, argv[optind]), ".sa"); - bwt_restore_sa(tmp, bwt); - free(tmp); - bns = bns_restore(argv[optind]); - } - iter = smem_iter_init(bwt); + idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS); + itr = smem_itr_init(idx->bwt); while (kseq_read(seq) >= 0) { - printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); + err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { err_putchar('\t'); err_puts(seq->seq.s); } else err_putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s); - while (smem_next(iter) > 0) { - for (i = 0; i < 
iter->matches->n; ++i) { - bwtintv_t *p = &iter->matches->a[i]; + smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); + while ((a = smem_next(itr, min_len<<1, split_width)) != 0) { + for (i = 0; i < a->n; ++i) { + bwtintv_t *p = &a->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; - printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); if (p->x[2] <= min_iwidth) { for (k = 0; k < p->x[2]; ++k) { bwtint_t pos; int len, is_rev, ref_id; len = (uint32_t)p->info - (p->info>>32); - pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); + pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev); if (is_rev) pos -= len - 1; - bns_cnt_ambi(bns, pos, len, &ref_id); - printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + bns_cnt_ambi(idx->bns, pos, len, &ref_id); + err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1); } } else err_puts("\t*"); err_putchar('\n'); @@ -119,9 +178,8 @@ int main_fastmap(int argc, char *argv[]) err_puts("//"); } - smem_iter_destroy(iter); - bns_destroy(bns); - bwt_destroy(bwt); + smem_itr_destroy(itr); + bwa_idx_destroy(idx); kseq_destroy(seq); err_gzclose(fp); return 0; diff --git a/kbtree.h b/kbtree.h new file mode 100644 index 0000000..bab4f0a --- /dev/null +++ b/kbtree.h @@ -0,0 +1,385 @@ +/*- + * Copyright 1997-1999, 2001, John-Mark Gurney. + * 2008-2009, Attractive Chaos + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef __AC_KBTREE_H +#define __AC_KBTREE_H + +#include +#include +#include +#include "utils.h" + +typedef struct { + int32_t is_internal:1, n:31; +} kbnode_t; + +#define __KB_KEY(type, x) ((type*)((char*)x + 4)) +#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr)) + +#define __KB_TREE_T(name) \ + typedef struct { \ + kbnode_t *root; \ + int off_key, off_ptr, ilen, elen; \ + int n, t; \ + int n_keys, n_nodes; \ + } kbtree_##name##_t; + +#define __KB_INIT(name, key_t) \ + kbtree_##name##_t *kb_init_##name(int size) \ + { \ + kbtree_##name##_t *b; \ + b = (kbtree_##name##_t*)xcalloc(1, sizeof(kbtree_##name##_t)); \ + b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ + if (b->t < 2) { \ + free(b); return 0; \ + } \ + b->n = 2 * b->t - 1; \ + b->off_ptr = 4 + b->n * sizeof(key_t); \ + b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ + b->elen = (b->off_ptr + 3) >> 2 << 2; \ + b->root = 
(kbnode_t*)xcalloc(1, b->ilen); \ + ++b->n_nodes; \ + return b; \ + } + +#define __kb_destroy(b) do { \ + int i, max = 8; \ + kbnode_t *x, **top, **stack = 0; \ + if (b) { \ + top = stack = (kbnode_t**)xcalloc(max, sizeof(kbnode_t*)); \ + *top++ = (b)->root; \ + while (top != stack) { \ + x = *--top; \ + if (x->is_internal == 0) { free(x); continue; } \ + for (i = 0; i <= x->n; ++i) \ + if (__KB_PTR(b, x)[i]) { \ + if (top - stack == max) { \ + max <<= 1; \ + stack = (kbnode_t**)xrealloc(stack, max * sizeof(kbnode_t*)); \ + top = stack + (max>>1); \ + } \ + *top++ = __KB_PTR(b, x)[i]; \ + } \ + free(x); \ + } \ + } \ + free(b); free(stack); \ + } while (0) + +#define __kb_get_first(key_t, b, ret) do { \ + kbnode_t *__x = (b)->root; \ + while (__KB_PTR(b, __x)[0] != 0) \ + __x = __KB_PTR(b, __x)[0]; \ + (ret) = __KB_KEY(key_t, __x)[0]; \ + } while (0) + +#define __KB_GET_AUX0(name, key_t, __cmp) \ + static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin, end, n = x->n >> 1; \ + if (x->n == 0) return -1; \ + if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \ + begin = 0; end = n; \ + } else { begin = n; end = x->n - 1; } \ + rr = r? r : &tr; \ + n = end; \ + while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \ + return n; \ + } + +#define __KB_GET_AUX1(name, key_t, __cmp) \ + static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin = 0, end = x->n; \ + if (x->n == 0) return -1; \ + rr = r? 
r : &tr; \ + while (begin < end) { \ + int mid = (begin + end) >> 1; \ + if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \ + else end = mid; \ + } \ + if (begin == x->n) { *rr = 1; return x->n - 1; } \ + if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \ + return begin; \ + } + +#define __KB_GET(name, key_t) \ + static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \ + if (x->is_internal == 0) return 0; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + return 0; \ + } \ + static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_getp_##name(b, &k); \ + } + +#define __KB_INTERVAL(name, key_t) \ + static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + *lower = *upper = 0; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) { \ + *lower = *upper = &__KB_KEY(key_t, x)[i]; \ + return; \ + } \ + if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \ + if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \ + if (x->is_internal == 0) return; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + } \ + static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \ + { \ + kb_intervalp_##name(b, &k, lower, upper); \ + } + +#define __KB_PUT(name, key_t, __cmp) \ + /* x must be an internal node */ \ + static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \ + { \ + kbnode_t *z; \ + z = (kbnode_t*)xcalloc(1, y->is_internal? 
b->ilen : b->elen); \ + ++b->n_nodes; \ + z->is_internal = y->is_internal; \ + z->n = b->t - 1; \ + memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \ + if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \ + y->n = b->t - 1; \ + memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \ + __KB_PTR(b, x)[i + 1] = z; \ + memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \ + ++x->n; \ + } \ + static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \ + { \ + int i = x->n - 1; \ + if (x->is_internal == 0) { \ + i = __kb_getp_aux_##name(x, k, 0); \ + if (i != x->n - 1) \ + memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + __KB_KEY(key_t, x)[i + 1] = *k; \ + ++x->n; \ + } else { \ + i = __kb_getp_aux_##name(x, k, 0) + 1; \ + if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \ + __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \ + if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \ + } \ + __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \ + } \ + } \ + static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *r, *s; \ + ++b->n_keys; \ + r = b->root; \ + if (r->n == 2 * b->t - 1) { \ + ++b->n_nodes; \ + s = (kbnode_t*)xcalloc(1, b->ilen); \ + b->root = s; s->is_internal = 1; s->n = 0; \ + __KB_PTR(b, s)[0] = r; \ + __kb_split_##name(b, s, 0, r); \ + r = s; \ + } \ + __kb_putp_aux_##name(b, r, k); \ + } \ + static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + kb_putp_##name(b, &k); \ + } + + +#define __KB_DEL(name, key_t) \ + static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \ + { \ + int yn, zn, i, r = 0; \ + kbnode_t *xp, *y, *z; \ + key_t kp; \ + if (x == 0) return *k; \ + if (s) { 
/* s can only be 0, 1 or 2 */ \ + r = x->is_internal == 0? 0 : s == 1? 1 : -1; \ + i = s == 1? x->n - 1 : -1; \ + } else i = __kb_getp_aux_##name(x, k, &r); \ + if (x->is_internal == 0) { \ + if (s == 2) ++i; \ + kp = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + --x->n; \ + return kp; \ + } \ + if (r == 0) { \ + if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \ + return kp; \ + } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i + 1]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \ + return kp; \ + } else if (yn == b->t - 1 && zn == b->t - 1) { \ + y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \ + __KB_KEY(key_t, y)[y->n++] = *k; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \ + y->n += z->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(z); \ + return __kb_delp_aux_##name(b, y, k, s); \ + } \ + } \ + ++i; \ + if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \ + if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \ + memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \ + __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \ + if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \ + --y->n; ++xp->n; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \ + __KB_KEY(key_t, xp)[xp->n++] 
= __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \ + if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \ + --y->n; \ + memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \ + } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \ + __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + y->n += xp->n; \ + memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \ + --x->n; \ + free(xp); \ + xp = y; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \ + __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \ + xp->n += y->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(y); \ + } \ + } \ + return __kb_delp_aux_##name(b, xp, k, s); \ + } \ + static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *x; \ + key_t ret; \ + ret = __kb_delp_aux_##name(b, b->root, k, 0); \ + --b->n_keys; \ + if (b->root->n == 0 && b->root->is_internal) { \ + --b->n_nodes; \ + x = b->root; \ + b->root = __KB_PTR(b, x)[0]; \ + free(x); \ + } \ + return ret; \ + } \ + static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_delp_##name(b, &k); \ + } + +typedef struct { + kbnode_t 
*x; + int i; +} __kbstack_t; + +#define __kb_traverse(key_t, b, __func) do { \ + int __kmax = 8; \ + __kbstack_t *__kstack, *__kp; \ + __kp = __kstack = (__kbstack_t*)xcalloc(__kmax, sizeof(__kbstack_t)); \ + __kp->x = (b)->root; __kp->i = 0; \ + for (;;) { \ + while (__kp->x && __kp->i <= __kp->x->n) { \ + if (__kp - __kstack == __kmax - 1) { \ + __kmax <<= 1; \ + __kstack = (__kbstack_t*)xrealloc(__kstack, __kmax * sizeof(__kbstack_t)); \ + __kp = __kstack + (__kmax>>1) - 1; \ + } \ + (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \ + ++__kp; \ + } \ + --__kp; \ + if (__kp >= __kstack) { \ + if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \ + ++__kp->i; \ + } else break; \ + } \ + free(__kstack); \ + } while (0) + +#define KBTREE_INIT(name, key_t, __cmp) \ + __KB_TREE_T(name) \ + __KB_INIT(name, key_t) \ + __KB_GET_AUX1(name, key_t, __cmp) \ + __KB_GET(name, key_t) \ + __KB_INTERVAL(name, key_t) \ + __KB_PUT(name, key_t, __cmp) \ + __KB_DEL(name, key_t) + +#define KB_DEFAULT_SIZE 512 + +#define kbtree_t(name) kbtree_##name##_t +#define kb_init(name, s) kb_init_##name(s) +#define kb_destroy(name, b) __kb_destroy(b) +#define kb_get(name, b, k) kb_get_##name(b, k) +#define kb_put(name, b, k) kb_put_##name(b, k) +#define kb_del(name, b, k) kb_del_##name(b, k) +#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u) +#define kb_getp(name, b, k) kb_getp_##name(b, k) +#define kb_putp(name, b, k) kb_putp_##name(b, k) +#define kb_delp(name, b, k) kb_delp_##name(b, k) +#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u) + +#define kb_size(b) ((b)->n_keys) + +#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b))) +#define kb_str_cmp(a, b) strcmp(a, b) + +#endif diff --git a/khash.h b/khash.h index fae5008..e206d35 100644 --- a/khash.h +++ b/khash.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2009 by attractor + Copyright (c) 2008, 2009, 2011 by Attractive 
Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -33,7 +33,6 @@ int main() { khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); - if (!ret) kh_del(32, h, k); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); @@ -47,6 +46,29 @@ int main() { */ /* + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + 2009-09-26 (0.2.4): * Improve portability @@ -86,11 +108,9 @@ int main() { @header Generic hash table library. 
- - @copyright Heng Li */ -#define AC_VERSION_KHASH_H "0.2.4" +#define AC_VERSION_KHASH_H "0.2.6" #include #include @@ -112,24 +132,14 @@ typedef unsigned long long khint64_t; #endif #ifdef _MSC_VER -#define inline __inline +#define kh_inline __inline +#else +#define kh_inline inline #endif typedef khint32_t khint_t; typedef khint_t khiter_t; -#define __ac_HASH_PRIME_SIZE 32 -static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = -{ - 0ul, 3ul, 11ul, 23ul, 53ul, - 97ul, 193ul, 389ul, 769ul, 1543ul, - 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, - 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, - 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, - 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, - 3221225473ul, 4294967291ul -}; - #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) @@ -138,88 +148,128 @@ static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 +#else +#define __ac_inc(k, m) ((((k)>>3 ^ (k)<<3) | 1) & (m)) /* odd probe step, masked to the table size; fully parenthesized */ +#endif + +#define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) xcalloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) xmalloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) xrealloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + static const double __ac_HASH_UPPER = 0.77; -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - static inline kh_##name##_t *kh_init_##name() { \ - return (kh_##name##_t*)xcalloc(1, sizeof(kh_##name##_t)); \ +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ } \ - static inline void kh_destroy_##name(kh_##name##_t *h) \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ - free(h->keys); free(h->flags); \ - free(h->vals); \ - free(h); \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ } \ } \ - static inline void 
kh_clear_##name(kh_##name##_t *h) \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ - memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ - static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ - khint_t inc, k, i, last; \ - k = __hash_func(key); i = k % h->n_buckets; \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ + khint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ + i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
*/ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ - khint_t t = __ac_HASH_PRIME_SIZE - 1; \ - while (__ac_prime_list[t] > new_n_buckets) --t; \ - new_n_buckets = __ac_prime_list[t+1]; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ - else { \ - new_flags = (khint32_t*)xmalloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { \ - h->keys = (khkey_t*)xrealloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)xrealloc(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(new_flags); return -1; } /* release the working flags; the table itself is untouched */ \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(new_flags); return -1; } /* release the working flags; keys stay valid (only grown) */ \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ } \ } \ - if (j) { \ + if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ - while (1) { \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t inc, k, i; \ k = __hash_func(key); \ - i = k % new_n_buckets; \ - inc = 1 + k % 
(new_n_buckets - 1); \ - while (!__ac_isempty(new_flags, i)) { \ - if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ - else i += inc; \ - } \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); \ - } else { \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ @@ -227,35 +277,39 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (h->n_buckets > new_n_buckets) { \ - h->keys = (khkey_t*)xrealloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)xrealloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - free(h->flags); \ + kfree(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ + return 0; \ } \ - static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ - if (h->n_occupied >= h->upper_bound) { \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ - else kh_resize_##name(h, h->n_buckets + 1); \ 
- } \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ - khint_t inc, k, i, site, last; \ - x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ - if (__ac_isempty(h->flags, i)) x = i; \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ + inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ + i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ @@ -264,20 +318,20 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (__ac_isempty(h->flags, x)) { \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ *ret = 2; \ - } else *ret = 0; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ - static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ 
__ac_set_isdel_true(h->flags, x); \ @@ -285,6 +339,17 @@ static const double __ac_HASH_UPPER = 0.77; } \ } +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @@ -312,10 +377,10 @@ static const double __ac_HASH_UPPER = 0.77; @param s Pointer to a null terminated string @return The hash value */ -static inline khint_t __ac_X31_hash_string(const char *s) +static kh_inline khint_t __ac_X31_hash_string(const char *s) { - khint_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } /*! @function @@ -329,9 +394,21 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k)) /* hash the macro argument itself, not a caller-scope variable named key */ + /* --- END OF HASH FUNCTIONS --- */ -/* Other necessary macros... */ +/* Other convenient macros... */ /*! @abstract Type of the hash table. @@ -397,7 +474,6 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_del(name, h, k) kh_del_##name(h, k) - /*! @function @abstract Test whether a bucket contains data. 
@param h Pointer to the hash table [khash_t(name)*] @@ -456,6 +532,34 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_n_buckets(h) ((h)->n_buckets) +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + /* More conenient interfaces */ /*! 
@function diff --git a/kopen.c b/kopen.c new file mode 100644 index 0000000..c1c43a8 --- /dev/null +++ b/kopen.c @@ -0,0 +1,372 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#include +#include +#endif + +#include "utils.h" + +#ifdef _WIN32 +#define _KO_NO_NET +#endif + +#ifndef _KO_NO_NET +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); + if (ret == -1) perror("select"); + return ret; +} + +static int socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) do { perror(func); if (fd != -1) close(fd); if (res) freeaddrinfo(res); return -1; } while (0) /* report first, then release the socket and address list if they exist */ + + int on = 1, fd = -1; + struct linger lng = { 0, 0 }; + struct addrinfo hints, *res = 0; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + return fd; +#undef __err_connect +} + +static int write_bytes(int fd, const char *buf, size_t len) +{ + ssize_t bytes; + do { + bytes = write(fd, buf, len); + if (bytes >= 0) { + len -= bytes; buf += bytes; /* after a short write, continue with the unsent tail */ + } else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { + return -1; + } + } while (len > 0); + + return 0; +} + +static int http_open(const char *fn) +{ + char *p, *proxy, *q, *http_host, *host, *port, *path, *buf; + int fd, 
ret, l; + ssize_t bytes = 0, bufsz = 0x10000; + + /* parse URL; adapted from khttp_parse_url() in knetfile.c */ + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + http_host = xcalloc(l + 1, 1); + strncpy(http_host, fn + 7, l); + http_host[l] = 0; + for (q = http_host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + // get http_proxy + proxy = getenv("http_proxy"); + // set host, port and path + if (proxy == 0) { + host = xstrdup(http_host); // when there is no proxy, server name is identical to http_host name. + port = xstrdup(*q? q : "80"); + path = xstrdup(*p? p : "/"); + } else { + host = (strstr(proxy, "http://") == proxy)? xstrdup(proxy + 7) : xstrdup(proxy); + for (q = host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + port = xstrdup(*q? q : "80"); + path = xstrdup(fn); + } + + /* connect; adapted from khttp_connect() in knetfile.c */ + l = 0; + fd = socket_connect(host, port); + buf = xcalloc(bufsz, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. 
+ l += snprintf(buf + l, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", + path, http_host); + if (write_bytes(fd, buf, l) != 0) { + close(fd); + fd = -1; + goto out; + } + l = 0; + retry: + while (l < bufsz && (bytes = read(fd, buf + l, 1)) > 0) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) goto retry; + + buf[l] = 0; + if (bytes < 0 || l < 14) { // prematured header + close(fd); + fd = -1; + goto out; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret != 200) { + close(fd); + fd = -1; + } + out: + free(buf); free(http_host); free(host); free(port); free(path); + return fd; +} + +typedef struct { + int max_response, ctrl_fd; + char *response; +} ftpaux_t; + +static int kftp_get_response(ftpaux_t *aux) +{ + unsigned char c; + int n = 0; + char *p; + if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0; + while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + if (n >= aux->max_response) { + aux->max_response = aux->max_response? aux->max_response<<1 : 256; + aux->response = xrealloc(aux->response, aux->max_response); + } + aux->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(aux->response[0]) && isdigit(aux->response[1]) && isdigit(aux->response[2]) + && aux->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + aux->response[n-2] = 0; + return strtol(aux->response, &p, 0); +} + +static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get) +{ + if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0) return -1; + return is_get? 
kftp_get_response(aux) : 0; +} + +static int ftp_open(const char *fn) +{ + char *p, *host = 0, *port = 0, *retr = 0; + char host2[80], port2[10]; + int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4]; + ftpaux_t aux; + + /* parse URL */ + if (strstr(fn, "ftp://") != fn) return -1; /* not an FTP URL; 0 would be mistaken for a valid descriptor */ + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return -1; /* no path component */ + l = p - fn - 6; + port = xstrdup("21"); + host = xcalloc(l + 1, 1); + strncpy(host, fn + 6, l); + retr = xcalloc(strlen(p) + 8, 1); + sprintf(retr, "RETR %s\r\n", p); + + /* connect to ctrl */ + memset(&aux, 0, sizeof(ftpaux_t)); + aux.ctrl_fd = socket_connect(host, port); + if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */ + + /* connect to the data stream */ + kftp_get_response(&aux); + kftp_send_cmd(&aux, "USER anonymous\r\n", 1); + kftp_send_cmd(&aux, "PASS kopen@\r\n", 1); + kftp_send_cmd(&aux, "TYPE I\r\n", 1); + kftp_send_cmd(&aux, "PASV\r\n", 1); + for (p = aux.response; p && *p && *p != '('; ++p); /* response is NULL if nothing was ever read */ + if (p == 0 || *p != '(') goto ftp_open_end; + ++p; + if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) goto ftp_open_end; /* malformed PASV reply */ + memcpy(pasv_ip, v, 4 * sizeof(int)); + pasv_port = ((v[4]&0xff)<<8) + (v[5]&0xff); /* both octets masked so the port always fits port2[] */ + kftp_send_cmd(&aux, retr, 0); + sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]); + sprintf(port2, "%d", pasv_port); + fd = socket_connect(host2, port2); + if (fd == -1) goto ftp_open_end; + ret = kftp_get_response(&aux); + if (ret != 150) { + close(fd); + fd = -1; + } + /* the control connection is closed at ftp_open_end so that error paths release it too */ + +ftp_open_end: + if (aux.ctrl_fd != -1) close(aux.ctrl_fd); free(host); free(port); free(retr); free(aux.response); + return fd; +} +#endif /* !defined(_KO_NO_NET) */ + +static char **cmd2argv(const char *cmd) +{ + int i, beg, end, argc; + char **argv, *str; + end = strlen(cmd); + for (i = end - 1; i >= 0; --i) + if (!isspace(cmd[i])) break; + end = i + 1; + for (beg = 0; beg < end; ++beg) + if (!isspace(cmd[beg])) break; + if (beg == end) return 0; + for (i = beg + 1, argc = 0; i < end; ++i) + if (isspace(cmd[i]) && 
!isspace(cmd[i-1])) + ++argc; + argv = (char**)xcalloc(argc + 2, sizeof(void*)); + argv[0] = str = (char*)xcalloc(end - beg + 1, 1); + strncpy(argv[0], cmd + beg, end - beg); + for (i = argc = 1; i < end - beg; ++i) + if (isspace(str[i])) str[i] = 0; + else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i]; + return argv; +} + +#define KO_STDIN 1 +#define KO_FILE 2 +#define KO_PIPE 3 +#define KO_HTTP 4 +#define KO_FTP 5 + +typedef struct { + int type, fd; + pid_t pid; +} koaux_t; + +void *kopen(const char *fn, int *_fd) +{ + koaux_t *aux = 0; + *_fd = -1; + if (strstr(fn, "http://") == fn) { + aux = xcalloc(1, sizeof(koaux_t)); + aux->type = KO_HTTP; + aux->fd = http_open(fn); + } else if (strstr(fn, "ftp://") == fn) { + aux = xcalloc(1, sizeof(koaux_t)); + aux->type = KO_FTP; + aux->fd = ftp_open(fn); + } else if (strcmp(fn, "-") == 0) { + aux = xcalloc(1, sizeof(koaux_t)); + aux->type = KO_STDIN; + aux->fd = STDIN_FILENO; + } else { + const char *p, *q; + for (p = fn; *p; ++p) + if (!isspace(*p)) break; + if (*p == '<') { // pipe open + int need_shell, pfd[2]; + pid_t pid; + // a simple check to see if we need to invoke a shell; not always working + for (q = p + 1; *q; ++q) + if (ispunct(*q) && *q != '.' 
&& *q != '_' && *q != '-' && *q != ':') + break; + need_shell = (*q != 0); + if (pipe(pfd) != 0) return 0; + pid = vfork(); + if (pid == -1) { /* vfork() error */ + close(pfd[0]); close(pfd[1]); + return 0; + } + if (pid == 0) { /* the child process */ + char **argv; /* FIXME: I do not know if this will lead to a memory leak */ + close(pfd[0]); + dup2(pfd[1], STDOUT_FILENO); + close(pfd[1]); + if (!need_shell) { + argv = cmd2argv(p + 1); + execvp(argv[0], argv); + free(argv[0]); free(argv); + } else execl("/bin/sh", "sh", "-c", p + 1, NULL); + exit(1); + } else { /* parent process */ + close(pfd[1]); + aux = xcalloc(1, sizeof(koaux_t)); + aux->type = KO_PIPE; + aux->fd = pfd[0]; + aux->pid = pid; + } + } else { +#ifdef _WIN32 + *_fd = open(fn, O_RDONLY | O_BINARY); +#else + *_fd = open(fn, O_RDONLY); +#endif + if (*_fd >= 0) { /* open() signals failure with -1; descriptor 0 is valid */ + aux = xcalloc(1, sizeof(koaux_t)); + aux->type = KO_FILE; + aux->fd = *_fd; + } + } + } + if (aux) *_fd = aux->fd; /* aux is NULL when the plain-file open failed; *_fd is already -1 then */ + return aux; +} + +int kclose(void *a) +{ + koaux_t *aux = (koaux_t*)a; if (aux == 0) return 0; /* kopen() returns NULL on failure; nothing to release */ + if (aux->type == KO_PIPE) { + int status; + pid_t pid; + pid = waitpid(aux->pid, &status, WNOHANG); + if (pid != aux->pid) kill(aux->pid, 15); + } + free(aux); + return 0; +} + +#ifdef _KO_MAIN +#define BUF_SIZE 0x10000 +int main(int argc, char *argv[]) +{ + void *x; + int l, fd; + unsigned char buf[BUF_SIZE]; + FILE *fp; + if (argc == 1) { + fprintf(stderr, "Usage: kopen \n"); + return 1; + } + x = kopen(argv[1], &fd); + fp = fdopen(fd, "r"); + if (fp == 0) { + fprintf(stderr, "ERROR: fail to open the input\n"); + return 1; + } + do { + if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0) + fwrite(buf, 1, l, stdout); + } while (l == BUF_SIZE); + fclose(fp); + kclose(x); + return 0; +} +#endif diff --git a/kseq.h b/kseq.h index 0fb1847..55405a8 100644 --- a/kseq.h +++ b/kseq.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, by Heng Li + Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this 
software and associated documentation files (the @@ -23,6 +23,8 @@ SOFTWARE. */ +/* Last Modified: 05MAR2012 */ + #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -31,9 +33,14 @@ #include #include "utils.h" +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ - char *buf; \ + unsigned char *buf; \ int begin, end, is_eof; \ type_t f; \ } kstream_t; @@ -46,7 +53,7 @@ { \ kstream_t *ks = (kstream_t*)xcalloc(1, sizeof(kstream_t)); \ ks->f = f; \ - ks->buf = (char*)xmalloc(__bufsize); \ + ks->buf = (unsigned char*)xmalloc(__bufsize); \ return ks; \ } \ static inline void ks_destroy(kstream_t *ks) \ @@ -83,10 +90,10 @@ typedef struct __kstring_t { #endif #define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ - str->l = 0; \ + str->l = append? str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ @@ -98,14 +105,20 @@ typedef struct __kstring_t { if (ks->end == 0) break; \ } else break; \ } \ - if (delimiter) { \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ - } else { \ + } else if (delimiter == KS_SEP_SPACE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i])) break; \ - } \ - if (str->m - str->l < i - ks->begin + 1) { \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! 
*/ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ str->s = (char*)xrealloc(str->s, str->m); \ @@ -118,9 +131,15 @@ typedef struct __kstring_t { break; \ } \ } \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)xcalloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ return str->l; \ - } + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ @@ -128,19 +147,16 @@ typedef struct __kstring_t { __KS_GETC(__read, __bufsize) \ __KS_GETUNTIL(__read, __bufsize) -#define __KSEQ_BASIC(type_t) \ - static inline kseq_t *kseq_init(type_t fd) \ +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ { \ kseq_t *s = (kseq_t*)xcalloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ - static inline void kseq_rewind(kseq_t *ks) \ - { \ - ks->last_char = 0; \ - ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ - } \ - static inline void kseq_destroy(kseq_t *ks) \ + SCOPE void kseq_destroy(kseq_t *ks) \ { \ if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ @@ -153,44 +169,46 @@ typedef struct __kstring_t { -1 end-of-file -2 truncated quality string */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char 
has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)xmalloc(seq->seq.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)xrealloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)xrealloc(seq->qual.s, seq->qual.m); \ - } \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k 
*/ \ + seq->seq.s = (char*)xrealloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)xrealloc(seq->qual.s, seq->qual.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* we should not stop here */ \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ - return seq->seq.l; \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ @@ -200,10 +218,19 @@ typedef struct __kstring_t { kstream_t *f; \ } kseq_t; -#define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(type_t) \ - __KSEQ_READ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); #endif diff --git a/ksort.h b/ksort.h index 68f9407..9f334e2 100644 --- a/ksort.h +++ b/ksort.h @@ -140,7 +140,7 @@ typedef struct { tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); 
\ } \ } \ - inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ { \ type_t *i, *j, swap_tmp; \ for (i = s + 1; i < t; ++i) \ diff --git a/kstring.c b/kstring.c index 2b30038..ce9dbd3 100644 --- a/kstring.c +++ b/kstring.c @@ -27,7 +27,7 @@ int ksprintf(kstring_t *s, const char *fmt, ...) int main() { kstring_t *s; - s = (kstring_t*)calloc(1, sizeof(kstring_t)); + s = (kstring_t*)xcalloc(1, sizeof(kstring_t)); ksprintf(s, "abcdefg: %d", 100); printf("%s\n", s->s); free(s); diff --git a/kstring.h b/kstring.h index 88cf93a..136a0fc 100644 --- a/kstring.h +++ b/kstring.h @@ -17,19 +17,33 @@ typedef struct __kstring_t { } kstring_t; #endif -static inline int kputs(const char *p, kstring_t *s) +static inline void ks_resize(kstring_t *s, size_t size) +{ + if (s->m < size) { + s->m = size; + kroundup32(s->m); + s->s = (char*)xrealloc(s->s, s->m); + } +} + +static inline int kputsn(const char *p, int l, kstring_t *s) { - int l = strlen(p); if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)xrealloc(s->s, s->m); } - strcpy(s->s + s->l, p); + memcpy(s->s + s->l, p, l); s->l += l; + s->s[s->l] = 0; return l; } +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + static inline int kputc(int c, kstring_t *s) { if (s->l + 1 >= s->m) { @@ -42,6 +56,40 @@ static inline int kputc(int c, kstring_t *s) return c; } +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)xrealloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)xrealloc(s->s, s->m); + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + int ksprintf(kstring_t *s, const char *fmt, ...); #endif diff --git a/ksw.c b/ksw.c index 270ecfb..32b8a3a 100644 --- a/ksw.c +++ b/ksw.c @@ -23,7 +23,6 @@ SOFTWARE. */ -#ifndef _NO_SSE2 #include #include #include @@ -38,22 +37,35 @@ #define UNLIKELY(x) (x) #endif -struct _ksw_query_t { +const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; + +struct _kswq_t { int qlen, slen; uint8_t shift, mdiff, max, size; __m128i *qp, *H0, *H1, *E, *Hmax; }; -ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) { - ksw_query_t *q; + kswq_t *q; int slen, a, tmp, p; size = size > 1? 
2 : 1; p = 8 * (3 - size); // # values per __m128i slen = (qlen + p - 1) / p; // segmented length - q = xmalloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory - q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory + q = (kswq_t*)xmalloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory q->H0 = q->qp + slen * m; q->H1 = q->H0 + slen; q->E = q->H1 + slen; @@ -92,11 +104,12 @@ ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const in return q; } -int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) { - int slen, i, m_b, n_b, te = -1, gmax = 0; + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; + kswr_t r; #define __max_16(ret, xx) do { \ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ @@ -107,10 +120,13 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) / } while (0) // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi8(a->gapo + a->gape); - gape = _mm_set1_epi8(a->gape); + gapoe = _mm_set1_epi8(_gapo + _gape); + gape = _mm_set1_epi8(_gape); shift = _mm_set1_epi8(q->shift); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; @@ -166,11 +182,11 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) / end_loop16: //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); __max_16(imax, max); // imax is the maximum number in max - if (imax >= a->T) { // write the b array; this condition adds branching unfornately + if (imax >= minsc) { // write the b array; this condition adds branching unfornately if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append if (n_b == m_b) { m_b = m_b? m_b<<1 : 8; - b = xrealloc(b, 8 * m_b); + b = (uint64_t*)xrealloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -179,34 +195,38 @@ end_loop16: gmax = imax; te = i; // te is the end position on the target for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); - if (gmax + q->shift >= 255) break; + if (gmax + q->shift >= 255 || gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; // swap H0 and H1 } - a->score = gmax; a->te = te; - { // get a->qe, the end of query match; find the 2nd best score + r.score = gmax + q->shift < 255? 
gmax : 255; + r.te = te; + if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score int max = -1, low, high, qlen = slen * 16; uint8_t *t = (uint8_t*)Hmax; - for (i = 0, a->qe = -1; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen; + for (i = 0; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; //printf("%d,%d\n", max, gmax); - i = (a->score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0, a->score2 = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) - a->score2 = b[i]>>32, a->te2 = e; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } } } free(b); - return a->score + q->shift >= 255? 255 : a->score; + return r; } -int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) { - int slen, i, m_b, n_b, te = -1, gmax = 0; + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; + kswr_t r; #define __max_8(ret, xx) do { \ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ @@ -216,10 +236,13 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // } while (0) // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi16(a->gapo + a->gape); - gape = _mm_set1_epi16(a->gape); + gapoe = _mm_set1_epi16(_gapo + _gape); + gape = _mm_set1_epi16(_gape); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; for (i = 0; i < slen; ++i) { @@ -261,11 +284,11 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // } end_loop8: __max_8(imax, max); - if (imax >= a->T) { + if (imax >= minsc) { if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { if (n_b == m_b) { m_b = m_b? m_b<<1 : 8; - b = xrealloc(b, 8 * m_b); + b = (uint64_t*)xrealloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -274,31 +297,238 @@ end_loop8: gmax = imax; te = i; for (j = 0; LIKELY(j < slen); ++j) _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; } - a->score = gmax; a->te = te; + r.score = gmax; r.te = te; { int max = -1, low, high, qlen = slen * 8; uint16_t *t = (uint16_t*)Hmax; - for (i = 0, a->qe = -1; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen; - i = (a->score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0, a->score2 = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) - a->score2 = b[i]>>32, a->te2 = e; + for (i = 0, r.qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } } } free(b); - return a->score; + return r; } -int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) +static void revseq(int l, uint8_t *s) { - if (q->size == 1) return 
ksw_sse2_16(q, tlen, target, a); - else return ksw_sse2_8(q, tlen, target, a); + int i, t; + for (i = 0; i < l>>1; ++i) + t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; +} + +kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) +{ + int size; + kswq_t *q; + kswr_t r, rr; + kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int); + + q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); + if (qry && *qry == 0) *qry = q; + func = q->size == 2? ksw_i16 : ksw_u8; + size = q->size; + r = func(q, tlen, target, gapo, gape, xtra); + if (qry == 0) free(q); + if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; + revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end + q = ksw_qinit(size, r.qe + 1, query, m, mat); + rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); + revseq(r.qe + 1, query); revseq(r.te + 1, target); + free(q); + if (r.score == rr.score) + r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; + return r; +} + +/******************** + *** SW extension *** + ********************/ + +typedef struct { + int32_t h, e; +} eh_t; + +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle) +{ + eh_t *eh; // score array + int8_t *qp; // query profile + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; + if (h0 < 0) h0 = 0; + // allocate memory + qp = xmalloc(qlen * m); + eh = xcalloc(qlen + 1, 8); + // generate the query profile + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; + } + // fill the first row + eh[0].h = h0; eh[1].h = h0 > gapoe? 
h0 - gapoe : 0; + for (j = 2; j <= qlen && eh[j-1].h > gape; ++j) + eh[j].h = eh[j-1].h - gape; + // adjust $w if it is too large + k = m * m; + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i]? max : mat[i]; + max_gap = (int)((double)(qlen * max - gapo) / gape + 1.); + max_gap = max_gap > 1? max_gap : 1; + w = w < max_gap? w : max_gap; + // DP loop + max = h0, max_i = max_j = -1; + beg = 0, end = qlen; + for (i = 0; LIKELY(i < tlen); ++i) { + int f = 0, h1, m = 0, mj = -1; + int8_t *q = &qp[target[i] * qlen]; + // compute the first column + h1 = h0 - (gapo + gape * (i + 1)); + if (h1 < 0) h1 = 0; + // apply the band and the constraint (if provided) + if (beg < i - w) beg = i - w; + if (end > i + w + 1) end = i + w + 1; + if (end > qlen) end = qlen; + for (j = beg; LIKELY(j < end); ++j) { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape + eh_t *p = &eh[j]; + int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) + p->h = h1; // set H(i,j-1) for the next row + h += q[j]; + h = h > e? h : e; + h = h > f? h : f; + h1 = h; // save H(i,j) to h1 for the next column + mj = m > h? mj : j; + m = m > h? m : h; // m is stored at eh[mj+1] + h -= gapoe; + h = h > 0? h : 0; + e -= gape; + e = e > h? e : h; // computed E(i+1,j) + p->e = e; // save E(i+1,j) for the next row + f -= gape; + f = f > h? 
f : h; // computed F(i,j+1) + } + eh[end].h = h1; eh[end].e = 0; + if (m == 0) break; + if (m > max) max = m, max_i = i, max_j = mj; + // update beg and end for the next round + for (j = mj; j >= beg && eh[j].h; --j); + beg = j + 1; + for (j = mj + 2; j <= end && eh[j].h; ++j); + end = j; + //beg = 0; end = qlen; // uncomment this line for debugging + } + free(eh); free(qp); + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; + return max; +} + +/******************** + * Global alignment * + ********************/ + +#define MINUS_INF -0x40000000 + +static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) +{ + if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { + if (*n_cigar == *m_cigar) { + *m_cigar = *m_cigar? (*m_cigar)<<1 : 4; + cigar = xrealloc(cigar, (*m_cigar) << 2); + } + cigar[(*n_cigar)++] = len<<4 | op; + } else cigar[(*n_cigar)-1] += len<<4; + return cigar; +} + +int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) +{ + eh_t *eh; + int8_t *qp; // query profile + int i, j, k, gapoe = gapo + gape, score, n_col; + uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex + if (n_cigar_) *n_cigar_ = 0; + // allocate memory + n_col = qlen < 2*w+1? 
qlen : 2*w+1; // maximum #columns of the backtrack matrix + z = xmalloc(n_col * tlen); + qp = xmalloc(qlen * m); + eh = xcalloc(qlen + 1, 8); + // generate the query profile + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; + } + // fill the first row + eh[0].h = 0; eh[0].e = MINUS_INF; + for (j = 1; j <= qlen && j <= w; ++j) + eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF; + for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band + // DP loop + for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop + int32_t f = MINUS_INF, h1, beg, end; + int8_t *q = &qp[target[i] * qlen]; + uint8_t *zi = &z[i * n_col]; + beg = i > w? i - w : 0; + end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence + h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; + for (j = beg; LIKELY(j < end); ++j) { + // This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except: + // 1) not checking h>0; 2) recording direction for backtracking + eh_t *p = &eh[j]; + int32_t h = p->h, e = p->e; + uint8_t d; // direction + p->h = h1; + h += q[j]; + d = h >= e? 0 : 1; + h = h >= e? h : e; + d = h >= f? d : 2; + h = h >= f? h : f; + h1 = h; + h -= gapoe; + e -= gape; + d |= e > h? 1<<2 : 0; + e = e > h? e : h; + p->e = e; + f -= gape; + d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two + f = f > h? f : h; + zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell + } + eh[end].h = h1; eh[end].e = MINUS_INF; + } + score = eh[qlen].h; + if (n_cigar_ && cigar_) { // backtrack + int n_cigar = 0, m_cigar = 0, which = 0; + uint32_t *cigar = 0, tmp; + i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell + while (i >= 0 && k >= 0) { + which = z[i * n_col + (k - (i > w? 
i - w : 0))] >> (which<<1) & 3; + if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; + else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; + else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; + } + if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); + if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); + for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR + tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; + *n_cigar_ = n_cigar, *cigar_ = cigar; + } + free(eh); free(qp); free(z); + return score; } /******************************************* @@ -334,30 +564,33 @@ unsigned char seq_nt4_table[256] = { int main(int argc, char *argv[]) { - int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2; + int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0; int8_t mat[25]; - ksw_aux_t a; + int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; + uint8_t *rseq = 0; gzFile fpt, fpq; kseq_t *kst, *ksq; + // parse command line - a.gapo = 5; a.gape = 2; a.T = 10; - while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) { + while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) { switch (c) { case 'a': sa = atoi(optarg); break; case 'b': sb = atoi(optarg); break; - case 'q': a.gapo = atoi(optarg); break; - case 'r': a.gape = atoi(optarg); break; - case 't': a.T = atoi(optarg); break; + case 'q': gapo = atoi(optarg); break; + case 'r': gape = atoi(optarg); break; + case 't': minsc = atoi(optarg); break; case 'f': forward_only = 1; break; - case 's': size = atoi(optarg); break; + case '1': xtra |= KSW_XBYTE; break; } } if (optind + 2 > argc) { - fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] \n", size, sa, sb, a.gapo, a.gape); + fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] \n", sa, sb, gapo, gape, minsc); return 1; } + if (minsc > 0xffff) minsc = 0xffff; + xtra |= KSW_XSUBO | minsc; // initialize scoring 
matrix - for (i = k = 0; i < 5; ++i) { + for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? sa : -sb; mat[k++] = 0; // ambiguous base @@ -368,35 +601,34 @@ int main(int argc, char *argv[]) fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); // all-pair alignment while (kseq_read(ksq) > 0) { - ksw_query_t *q[2]; - for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; - q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); + kswq_t *q[2] = {0, 0}; + kswr_t r; + for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; if (!forward_only) { // reverse - for (i = 0; i < ksq->seq.l/2; ++i) { - int t = ksq->seq.s[i]; - ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i]; - ksq->seq.s[ksq->seq.l-1-i] = t; + if ((int)ksq->seq.m > max_rseq) { + max_rseq = ksq->seq.m; + rseq = (uint8_t*)xrealloc(rseq, max_rseq); } - for (i = 0; i < ksq->seq.l; ++i) - ksq->seq.s[i] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i]; - q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); - } else q[1] = 0; + for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) + rseq[j] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; + } gzrewind(fpt); kseq_rewind(kst); while (kseq_read(kst) > 0) { - int s; - for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; - s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a); - printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); - if (q[1]) { - s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a); - printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); + for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; + r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]); + if (r.score >= minsc) + err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2); + if (rseq) { + r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]); + if (r.score >= minsc) + err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2); } } free(q[0]); free(q[1]); } + free(rseq); kseq_destroy(kst); err_gzclose(fpt); kseq_destroy(ksq); err_gzclose(fpq); return 0; } -#endif // _KSW_MAIN -#endif // _NO_SSE2 +#endif diff --git a/ksw.h b/ksw.h index d93d6a9..5162dc0 100644 --- a/ksw.h +++ b/ksw.h @@ -1,51 +1,69 @@ #ifndef __AC_KSW_H #define __AC_KSW_H -struct _ksw_query_t; -typedef struct _ksw_query_t ksw_query_t; +#include + +#define KSW_XBYTE 0x10000 +#define KSW_XSTOP 0x20000 +#define KSW_XSUBO 0x40000 +#define KSW_XSTART 0x80000 + +struct _kswq_t; +typedef struct _kswq_t kswq_t; typedef struct { - // input - unsigned gapo, gape; // the first gap costs gapo+gape - unsigned T; // threshold - // output - int score, te, qe, score2, te2; -} ksw_aux_t; + int score; // best score + int te, qe; // target end and query end + int score2, te2; // second best score and 
ending position on the target + int tb, qb; // target start and query start +} kswr_t; #ifdef __cplusplus extern "C" { #endif /** - * Initialize the query data structure + * Aligning two sequences * - * @param size Number of bytes used to store a score; valid valures are 1 or 2 - * @param qlen Length of the query sequence - * @param query Query sequence - * @param m Size of the alphabet - * @param mat Scoring matrix in a one-dimension array + * @param qlen length of the query sequence (typically + Copyright (c) 2008, by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -76,15 +76,15 @@ int main() { (v).a[(v).n++] = (x); \ } while (0) -#define kv_pushp(type, v) (((v).n == (v).m)? \ +#define kv_pushp(type, v) ((((v).n == (v).m)? \ ((v).m = ((v).m? (v).m<<1 : 2), \ (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \ - : 0), ((v).a + ((v).n++)) + : 0), &(v).a[(v).n++]) -#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ +#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \ - : (v).n <= (size_t)(i)? (v).n = (i) \ - : 0), (v).a[(i)] + : (v).n <= (size_t)(i)? 
(v).n = (i) + 1 \ + : 0), (v).a[(i)]) #endif diff --git a/main.c b/main.c index 73cbcd9..764c0d2 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r132" +#define PACKAGE_VERSION "0.6.2-r301-beta" #endif static int usage() @@ -20,21 +20,20 @@ static int usage() fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); - fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); - fprintf(stderr, " stdsw standard SW/NW alignment\n"); fprintf(stderr, "\n"); return 1; } void bwa_print_sam_PG() { - printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION); + err_printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION); } int main(int argc, char *argv[]) @@ -50,15 +49,13 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); - else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); - else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1); - else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1); else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], 
"dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); + else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/main.h b/main.h index 026a80b..3e70362 100644 --- a/main.h +++ b/main.h @@ -6,7 +6,6 @@ extern "C" { #endif int bwa_fa2pac(int argc, char *argv[]); - int bwa_pac2cspac(int argc, char *argv[]); int bwa_pac2bwt(int argc, char *argv[]); int bwa_bwtupdate(int argc, char *argv[]); int bwa_bwt2sa(int argc, char *argv[]); @@ -17,11 +16,10 @@ extern "C" { int bwa_sai2sam_se(int argc, char *argv[]); int bwa_sai2sam_pe(int argc, char *argv[]); - int bwa_stdsw(int argc, char *argv[]); - int bwa_bwtsw2(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]); + int main_mem(int argc, char *argv[]); #ifdef __cplusplus } diff --git a/simple_dp.c b/simple_dp.c deleted file mode 100644 index 4c6a156..0000000 --- a/simple_dp.c +++ /dev/null @@ -1,162 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "stdaln.h" -#include "utils.h" - -#include "kseq.h" -KSEQ_INIT(gzFile, err_gzread) - -typedef struct { - int l; - unsigned char *s; - char *n; -} seq1_t; - -typedef struct { - int n_seqs, m_seqs; - seq1_t *seqs; -} seqs_t; - -unsigned char aln_rev_table[256] = { - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', - 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', - 'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N', - 'N','N','y','s', 'a','N','b','w', 
'x','r','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N' -}; - -static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0; -static AlnParam g_aln_param; - -static void revseq(int len, uint8_t *seq) -{ - int i; - for (i = 0; i < len>>1; ++i) { - uint8_t tmp = aln_rev_table[seq[len-1-i]]; - seq[len-1-i] = aln_rev_table[seq[i]]; - seq[i] = tmp; - } - if (len&1) seq[i] = aln_rev_table[seq[i]]; -} - -static seqs_t *load_seqs(const char *fn) -{ - seqs_t *s; - seq1_t *p; - gzFile fp; - int l; - kseq_t *seq; - - fp = xzopen(fn, "r"); - seq = kseq_init(fp); - s = (seqs_t*)xcalloc(1, sizeof(seqs_t)); - s->m_seqs = 256; - s->seqs = (seq1_t*)xcalloc(s->m_seqs, sizeof(seq1_t)); - while ((l = kseq_read(seq)) >= 0) { - if (s->n_seqs == s->m_seqs) { - s->m_seqs <<= 1; - s->seqs = (seq1_t*)xrealloc(s->seqs, s->m_seqs * sizeof(seq1_t)); - } - p = s->seqs + (s->n_seqs++); - p->l = seq->seq.l; - p->s = (unsigned char*)xmalloc(p->l + 1); - memcpy(p->s, seq->seq.s, p->l); - p->s[p->l] = 0; - p->n = xstrdup((const char*)seq->name.s); - } - kseq_destroy(seq); - err_gzclose(fp); - fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); - return s; -} - -static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand) -{ - int i; - for (i = 0; i < ss->n_seqs; ++i) { - AlnAln *aa; - seq1_t *p = ss->seqs + i; - g_aln_param.band_width = l + p->l; - aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, 
g_thres, l, p->l); - if (aa->score >= g_thres || g_is_global) { - printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand, - aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo); - // NB: I put the short sequence as the first sequence in SW, an insertion to - // the reference becomes a deletion from the short sequence. Therefore, I use - // "MDI" here rather than "MID", and print ->out2 first rather than ->out1. - for (i = 0; i != aa->n_cigar; ++i) - printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1); - } - aln_free_AlnAln(aa); - } -} - -static void aln_seqs(const seqs_t *ss, const char *fn) -{ - gzFile fp; - kseq_t *seq; - int l; - - fp = xzopen(fn, "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+'); - if (g_strand&2) { - revseq(l, (uint8_t*)seq->seq.s); - aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-'); - } - } - kseq_destroy(seq); - err_gzclose(fp); -} - -int bwa_stdsw(int argc, char *argv[]) -{ - int c; - seqs_t *ss; - - while ((c = getopt(argc, argv, "gT:frp")) >= 0) { - switch (c) { - case 'g': g_is_global = 1; break; - case 'T': g_thres = atoi(optarg); break; - case 'f': g_strand |= 1; break; - case 'r': g_strand |= 2; break; - case 'p': g_aa = 1; break; - } - } - if (g_strand == 0) g_strand = 3; - if (g_aa) g_strand = 1; - if (optind + 1 >= argc) { - fprintf(stderr, "\nUsage: bwa stdsw [options] \n\n"); - fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres); - fprintf(stderr, " -p protein alignment (suppressing -r)\n"); - fprintf(stderr, " -f forward strand only\n"); - fprintf(stderr, " -r reverse strand only\n"); - fprintf(stderr, " -g global alignment\n\n"); - fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n"); - fprintf(stderr, " sequences and ONE long sequence. 
It outputs the suboptimal score on the long\n"); - fprintf(stderr, " sequence.\n\n"); - return 1; - } - g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast; - g_aln_param.gap_end = 0; - ss = load_seqs(argv[optind]); - aln_seqs(ss, argv[optind+1]); - return 0; -} diff --git a/solid2fastq.pl b/solid2fastq.pl deleted file mode 100755 index c60ad81..0000000 --- a/solid2fastq.pl +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/perl -w - -# Author: lh3 -# Note: Ideally, this script should be written in C. It is a bit slow at present. -# Also note that this script is different from the one contained in MAQ. - -use strict; -use warnings; -use Getopt::Std; - -my %opts; -my $version = '0.1.4'; -my $usage = qq{ -Usage: solid2fastq.pl - -Note: is the string showed in the `# Title:' line of a - ".csfasta" read file. Then F3.csfasta is read sequence - file and F3_QV.qual is the quality file. If - R3.csfasta is present, this script assumes reads are - paired; otherwise reads will be regarded as single-end. - - The read name will be :panel_x_y/[12] with `1' for R3 - tag and `2' for F3. Usually you may want to use short - to save diskspace. Long also causes troubles to maq. - -}; - -getopts('', \%opts); -die($usage) if (@ARGV != 2); -my ($title, $pre) = @ARGV; -my (@fhr, @fhw); -my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual'); -my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0; -if ($is_paired) { # paired end - for (0 .. 
3) { - my $fn = "$title$fn_suff[$_]"; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo - open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die; - open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; - my (@df, @dr); - @df = &read1(1); @dr = &read1(2); - while (@df && @dr) { - if ($df[0] eq $dr[0]) { # mate pair - print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; - @df = &read1(1); @dr = &read1(2); - } else { - if ($df[0] le $dr[0]) { - print {$fhw[2]} $df[1]; - @df = &read1(1); - } else { - print {$fhw[2]} $dr[1]; - @dr = &read1(2); - } - } - } - if (@df) { - print {$fhw[2]} $df[1]; - while (@df = &read1(1, $fhr[0], $fhr[1])) { - print {$fhw[2]} $df[1]; - } - } - if (@dr) { - print {$fhw[2]} $dr[1]; - while (@dr = &read1(2, $fhr[2], $fhr[3])) { - print {$fhw[2]} $dr[1]; - } - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[$_]) for (0 .. $#fhw); -} else { # single end - for (0 .. 1) { - my $fn = "$title$fn_suff[$_]"; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; - my @df; - while (@df = &read1(1, $fhr[0], $fhr[1])) { - print {$fhw[2]} $df[1]; - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[2]); -} - -sub read1 { - my $i = shift(@_); - my $j = ($i-1)<<1; - my ($key, $seq); - my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); - while (<$fhs>) { - my $t = <$fhq>; - if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { - $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines - die(qq/** unmatched read name: '$_' != '$_'\n/) unless ($_ eq $t); - my $name = "$pre:$1_$2_$3/$i"; - $_ = substr(<$fhs>, 2); - tr/0123./ACGTN/; - my $s = $_; - $_ = <$fhq>; - s/-1\b/0/eg; - s/^(\d+)\s*//; - s/(\d+)\s*/chr($1+33)/eg; - $seq = qq/\@$name\n$s+\n$_\n/; - last; - } - } - return defined($seq)? 
($key, $seq) : (); -} diff --git a/stdaln.c b/stdaln.c index 1a8a3d1..336a4e4 100644 --- a/stdaln.c +++ b/stdaln.c @@ -543,13 +543,12 @@ int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, int start, end, max_score; int thres, *suba, *ss; - int gap_open, gap_ext, b; + int gap_open, gap_ext; int *score_matrix, N_MATRIX_ROW; /* initialize some align-related parameters. just for compatibility */ gap_open = ap->gap_open; gap_ext = ap->gap_ext; - b = ap->band_width; score_matrix = ap->matrix; N_MATRIX_ROW = ap->row; thres = _thres > 0? _thres : -_thres; @@ -863,7 +862,7 @@ uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, path_t *path, int *path_len, int G0, uint8_t *_mem) { - int q, r, qr, tmp_len; + int q, r, qr; int32_t **s_array, *score_array; int is_overflow, of_base; uint32_t *eh; @@ -890,7 +889,6 @@ int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2 s_array[i] = (int32_t*)_p, _p += 4 * len1; /* initialization */ aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - tmp_len = len1 + 1; start = 1; end = 2; end_i = end_j = 0; score = 0; diff --git a/utils.c b/utils.c index 93c17c9..ad2f734 100644 --- a/utils.c +++ b/utils.c @@ -41,6 +41,18 @@ #include #include "utils.h" +#include "ksort.h" +#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) +KSORT_INIT(128, pair64_t, pair64_lt) +KSORT_INIT(64, uint64_t, ks_lt_generic) + +#include "kseq.h" +KSEQ_INIT2(, gzFile, err_gzread) + +/******************** + * System utilities * + ********************/ + FILE *err_xopen_core(const char *func, const char *fn, const char *mode) { FILE *fp = 0; @@ -51,6 +63,7 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode) } return fp; } + FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) { if (freopen(fn, 
mode, fp) == 0) { @@ -58,6 +71,7 @@ FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE } return fp; } + gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) { gzFile fp; @@ -109,12 +123,10 @@ void _err_fatal_simple_core(const char *func, const char *msg) size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { - size_t ret = fwrite(ptr, size, nmemb, stream); - if (ret != nmemb) - { - _err_fatal_simple("fwrite", strerror(errno)); - } - return ret; + size_t ret = fwrite(ptr, size, nmemb, stream); + if (ret != nmemb) + _err_fatal_simple("fwrite", strerror(errno)); + return ret; } size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream) @@ -163,36 +175,26 @@ long err_ftell(FILE *stream) int err_printf(const char *format, ...) { - va_list arg; - int done; - - va_start(arg, format); - done = vfprintf(stdout, format, arg); - int saveErrno = errno; - va_end(arg); - - if (done < 0) - { - _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno)); - } - return done; + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stdout, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno)); + return done; } int err_fprintf(FILE *stream, const char *format, ...) 
{ - va_list arg; - int done; - - va_start(arg, format); - done = vfprintf(stream, format, arg); - int saveErrno = errno; - va_end(arg); - - if (done < 0) - { - _err_fatal_simple("vfprintf", strerror(saveErrno)); - } - return done; + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stream, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) _err_fatal_simple("vfprintf", strerror(saveErrno)); + return done; } int err_fputc(int c, FILE *stream) @@ -220,10 +222,8 @@ int err_fputs(const char *s, FILE *stream) int err_fflush(FILE *stream) { int ret = fflush(stream); - if (ret != 0) - { - _err_fatal_simple("fflush", strerror(errno)); - } + if (ret != 0) _err_fatal_simple("fflush", strerror(errno)); + #ifdef FSYNC_ON_FLUSH /* Calling fflush() ensures that all the data has made it to the kernel buffers, but this may not be sufficient for remote filesystems @@ -234,15 +234,12 @@ int err_fflush(FILE *stream) { struct stat sbuf; if (0 != fstat(fileno(stream), &sbuf)) - { _err_fatal_simple("fstat", strerror(errno)); - } + if (S_ISREG(sbuf.st_mode)) { if (0 != fsync(fileno(stream))) - { _err_fatal_simple("fsync", strerror(errno)); - } } } #endif @@ -251,12 +248,9 @@ int err_fflush(FILE *stream) int err_fclose(FILE *stream) { - int ret = fclose(stream); - if (ret != 0) - { - _err_fatal_simple("fclose", strerror(errno)); - } - return ret; + int ret = fclose(stream); + if (ret != 0) _err_fatal_simple("fclose", strerror(errno)); + return ret; } int err_gzclose(gzFile file) @@ -311,6 +305,10 @@ char *err_strdup(const char *s, const char *file, unsigned int line, const char return p; } +/********* + * Timer * + *********/ + double cputime() { struct rusage r; diff --git a/utils.h b/utils.h index b904701..8567d3f 100644 --- a/utils.h +++ b/utils.h @@ -28,6 +28,7 @@ #ifndef LH3_UTILS_H #define LH3_UTILS_H +#include #include #include @@ -38,10 +39,9 @@ #define ATTRIBUTE(list) #endif - - #define err_fatal_simple(msg) _err_fatal_simple(__func__, msg) 
#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg) + #define xopen(fn, mode) err_xopen_core(__func__, fn, mode) #define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) #define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) @@ -54,6 +54,13 @@ #define xstrdup(s) err_strdup( (s), __FILE__, __LINE__, __func__) +typedef struct { + uint64_t x, y; +} pair64_t; + +typedef struct { size_t n, m; uint64_t *a; } uint64_v; +typedef struct { size_t n, m; pair64_t *a; } pair64_v; + #ifdef __cplusplus extern "C" { #endif @@ -92,8 +99,24 @@ extern "C" { double cputime(); double realtime(); + void ks_introsort_64 (size_t n, uint64_t *a); + void ks_introsort_128(size_t n, pair64_t *a); + #ifdef __cplusplus } #endif +static inline uint64_t hash_64(uint64_t key) +{ + key += ~(key << 32); + key ^= (key >> 22); + key += ~(key << 13); + key ^= (key >> 8); + key += (key << 3); + key ^= (key >> 15); + key += ~(key << 27); + key ^= (key >> 31); + return key; +} + #endif