From 15e80aec3b9a3a379ff30ee5f1b78cdf9c7b9cca Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 4 Nov 2009 21:46:05 +0000 Subject: [PATCH] Temporarily adding changed bwa files to Sting repository until I can pull together the official bwa patch. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1975 348d0f76-0448-11de-a6fe-93d51630548a --- c/bwa/bntseq.c | 302 +++++++++++++++++++++++++ c/bwa/bntseq.h | 80 +++++++ c/bwa/bwase.c | 597 +++++++++++++++++++++++++++++++++++++++++++++++++ c/bwa/bwase.h | 52 +++++ c/bwa/bwtaln.c | 295 ++++++++++++++++++++++++ c/bwa/bwtaln.h | 113 ++++++++++ 6 files changed, 1439 insertions(+) create mode 100755 c/bwa/bntseq.c create mode 100755 c/bwa/bntseq.h create mode 100755 c/bwa/bwase.c create mode 100755 c/bwa/bwase.h create mode 100755 c/bwa/bwtaln.c create mode 100755 c/bwa/bwtaln.h diff --git a/c/bwa/bntseq.c b/c/bwa/bntseq.c new file mode 100755 index 000000000..9db4a5242 --- /dev/null +++ b/c/bwa/bntseq.c @@ -0,0 +1,302 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include "bntseq.h" +#include "main.h" +#include "utils.h" + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +unsigned char nst_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +void bns_dump(const bntseq_t *bns, const char *prefix) +{ + char str[1024]; + FILE *fp; + int i; + { // dump .ann + strcpy(str, prefix); strcat(str, ".ann"); + fp = xopen(str, "w"); + fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); + for (i = 0; i != bns->n_seqs; ++i) { + bntann1_t *p = bns->anns + i; + fprintf(fp, "%d %s", p->gi, p->name); + if (p->anno[0]) fprintf(fp, " %s\n", p->anno); + else fprintf(fp, "\n"); + fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); + } + fclose(fp); + } + { // dump .amb + strcpy(str, prefix); strcat(str, ".amb"); + fp = xopen(str, "w"); + fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); + for (i = 0; i != bns->n_holes; ++i) { + bntamb1_t *p = bns->ambs + i; + fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); + } + fclose(fp); + } +} + +bntseq_t *bns_restore(const char *prefix) +{ + char ann_filename[1024], amb_filename[1024], pac_filename[1024]; + strcpy(ann_filename,prefix); + strcat(ann_filename,".ann"); + strcpy(amb_filename,prefix); + strcat(amb_filename,".amb"); + strcpy(pac_filename,prefix); + strcat(pac_filename,".pac"); + + return bns_restore_core(ann_filename, amb_filename, pac_filename); +} + +bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) +{ + char buffer[1024]; + FILE *fp; + bntseq_t *bns; + int i; + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + { // read .ann + fp = xopen(ann_filename, "r"); + fscanf(fp, "%lld%d%u", &bns->l_pac, &bns->n_seqs, &bns->seed); + bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); + for (i = 0; i < bns->n_seqs; ++i) { + bntann1_t *p = bns->anns + i; + char *q = buffer; + int c; + // read gi and sequence name + fscanf(fp, "%u%s", &p->gi, buffer); + p->name = strdup(buffer); + // read fasta comments + while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; + *q = 0; + if (q - buffer > 1) p->anno = strdup(buffer + 1); // skip leading space + else p->anno = strdup(""); + // read the rest + fscanf(fp, "%lld%d%d", &p->offset, &p->len, &p->n_ambs); + } + fclose(fp); + } + { // read .amb + int64_t l_pac; + int32_t n_seqs; + fp = xopen(amb_filename, "r"); + fscanf(fp, "%lld%d%d", &l_pac, &n_seqs, &bns->n_holes); + xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); + bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); + for (i = 0; i < bns->n_holes; ++i) { + bntamb1_t *p = bns->ambs + i; + fscanf(fp, "%lld%d%s", &p->offset, &p->len, buffer); + p->amb = buffer[0]; + } + fclose(fp); + } + { // open .pac + bns->fp_pac = xopen(pac_filename, "rb"); + } + return bns; +} + +void bns_destroy(bntseq_t *bns) +{ + if (bns == 0) return; + else { + int i; + if (bns->fp_pac) fclose(bns->fp_pac); + free(bns->ambs); + for (i = 0; i < bns->n_seqs; ++i) { + free(bns->anns[i].name); + free(bns->anns[i].anno); + } + free(bns->anns); + free(bns); + } +} + +void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) +{ + kseq_t *seq; + char name[1024]; + bntseq_t *bns; + bntamb1_t *q; + int l_buf; + unsigned char buf[0x10000]; + int32_t m_seqs, m_holes, l, i; + FILE *fp; + + // initialization + seq = kseq_init(fp_fa); + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + bns->seed = 11; // fixed seed for random generator + srand48(bns->seed); + m_seqs = m_holes = 8; + bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); + bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); + q = bns->ambs; + l_buf = 0; + strcpy(name, prefix); strcat(name, ".pac"); + fp = xopen(name, "wb"); + memset(buf, 0, 0x10000); + // read sequences + while ((l = kseq_read(seq)) >= 0) { + bntann1_t *p; + int lasts; + if (bns->n_seqs == m_seqs) { + m_seqs <<= 1; + bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t)); + } + p = bns->anns + bns->n_seqs; + p->name = strdup((char*)seq->name.s); + p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); + p->gi = 0; p->len = l; + p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; + p->n_ambs = 0; + for (i = 0, lasts = 0; i < l; ++i) { + int c = nst_nt4_table[(int)seq->seq.s[i]]; + if (c >= 4) { // N + if (lasts == seq->seq.s[i]) { // contiguous N + ++q->len; + } else { + if (bns->n_holes == m_holes) { + m_holes <<= 1; + bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t)); + } + q = bns->ambs + bns->n_holes; + q->len = 1; + q->offset = p->offset + i; + q->amb = seq->seq.s[i]; + ++p->n_ambs; + ++bns->n_holes; + } + } + lasts = seq->seq.s[i]; + { // fill buffer + if (c >= 4) c = lrand48()&0x3; + if (l_buf == 0x40000) { + fwrite(buf, 1, 0x10000, fp); + memset(buf, 0, 0x10000); + l_buf = 0; + } + buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1); + ++l_buf; + } + } + ++bns->n_seqs; + bns->l_pac += seq->seq.l; + } + xassert(bns->l_pac, "zero length sequence."); + { // finalize .pac file + ubyte_t ct; + fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp); + // the following codes make the pac file size always (l_pac/4+1+1) + if (bns->l_pac % 4 == 0) { + ct = 0; + fwrite(&ct, 1, 1, fp); + } + ct = bns->l_pac % 4; + fwrite(&ct, 1, 1, fp); + // close .pac file + fclose(fp); + } + bns_dump(bns, prefix); + bns_destroy(bns); + kseq_destroy(seq); +} + +int bwa_fa2pac(int argc, char *argv[]) +{ + gzFile fp; + if (argc < 2) { + fprintf(stderr, "Usage: bwa fa2pac []\n"); + return 1; + } + fp = xzopen(argv[1], "r"); + bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]); + gzclose(fp); + return 0; +} + +int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq) +{ + int left, mid, right, nn; + if (pac_coor >= bns->l_pac) + err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac); + // binary search for the sequence ID. Note that this is a bit different from the following one... + left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { + mid = (left + right) >> 1; + if (pac_coor >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pac_coor < bns->anns[mid+1].offset) break; + left = mid + 1; + } else right = mid; + } + *real_seq = mid; + // binary search for holes + left = 0; right = bns->n_holes; nn = 0; + while (left < right) { + int64_t mid = (left + right) >> 1; + if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; + else if (pac_coor + len <= bns->ambs[mid].offset) right = mid; + else { // overlap + if (pac_coor >= bns->ambs[mid].offset) { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? + bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len; + } else { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? + bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor); + } + break; + } + } + return nn; +} diff --git a/c/bwa/bntseq.h b/c/bwa/bntseq.h new file mode 100755 index 000000000..21b831ee4 --- /dev/null +++ b/c/bwa/bntseq.h @@ -0,0 +1,80 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef BWT_BNTSEQ_H +#define BWT_BNTSEQ_H + +#include +#include + +#ifndef BWA_UBYTE +#define BWA_UBYTE +typedef uint8_t ubyte_t; +#endif + +typedef struct { + int64_t offset; + int32_t len; + int32_t n_ambs; + uint32_t gi; + char *name, *anno; +} bntann1_t; + +typedef struct { + int64_t offset; + int32_t len; + char amb; +} bntamb1_t; + +typedef struct { + int64_t l_pac; + int32_t n_seqs; + uint32_t seed; + bntann1_t *anns; // n_seqs elements + int32_t n_holes; + bntamb1_t *ambs; // n_holes elements + FILE *fp_pac; +} bntseq_t; + +extern unsigned char nst_nt4_table[256]; + +#ifdef __cplusplus +extern "C" { +#endif + + void bns_dump(const bntseq_t *bns, const char *prefix); + bntseq_t *bns_restore(const char *prefix); + bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); + void bns_destroy(bntseq_t *bns); + void bns_fasta2bntseq(gzFile fp_fa, const char *prefix); + int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/c/bwa/bwase.c b/c/bwa/bwase.c new file mode 100755 index 000000000..014e50bc4 --- /dev/null +++ b/c/bwa/bwase.c @@ -0,0 +1,597 @@ +#include +#include +#include +#include +#include +#include +#include "bwase.h" +#include "stdaln.h" +#include "bntseq.h" +#include "utils.h" +#include "kstring.h" + +static int g_log_n[256]; + +void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) +{ + int i, cnt, best; + if (n_aln == 0) { + s->type = BWA_TYPE_NO_MATCH; + s->c1 = s->c2 = 0; + return; + } + best = aln[0].score; + for (i = cnt = 0; i < n_aln; ++i) { + const bwt_aln1_t *p = aln + i; + if (p->score > best) break; + if (drand48() * (p->l - p->k + 1) > (double)cnt) { + s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a; + s->score = p->score; + s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); + } + cnt += p->l - p->k + 1; + } + s->c1 = cnt; + for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1; + s->c2 = cnt - s->c1; + s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; +} + +int bwa_approx_mapQ(const bwa_seq_t *p, int mm) +{ + int n; + if (p->c1 == 0) return 23; + if (p->c1 > 1) return 0; + if (p->n_mm == mm) return 25; + if (p->c2 == 0) return 37; + n = (p->c2 >= 255)? 255 : p->c2; + return (23 < g_log_n[n])? 0 : 23 - g_log_n[n]; +} + +void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) +{ + int i; + char str[1024]; + bwt_t *bwt; + // load forward SA + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + if(p->strand) // reverse strand only + bwa_cal_pac_pos_core(bwt, NULL, p, max_mm, fnr); + } + bwt_destroy(bwt); + // load reverse BWT and SA + strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt); + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + if(!p->strand) // forward strand only + bwa_cal_pac_pos_core(NULL, bwt, p, max_mm, fnr); + } + bwt_destroy(bwt); +} + +/** + * Derive the actual position in the read from the given suffix array coordinates. + * Note that the position will be approximate based on whether indels appear in the + * read and whether calculations are performed from the start or end of the read. + */ +void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr) { + if(seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) + return; + + int max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; + if(seq->strand) { // reverse strand only + seq->pos = bwt_sa(forward_bwt, seq->sa); + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + } + else { // forward strand only + /* NB: For gapped alignment, p->pos may not be correct, + * which will be fixed in refine_gapped_core(). This + * line also determines the way "x" is calculated in + * refine_gapped_core() when (ext < 0 && is_end == 0). */ + seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len); + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + } +} + +/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on + * forward strand. This happens when p->pos is calculated by + * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct + * coordinate. This happens only for color-converted alignment. */ +static uint16_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, + int ext, int *n_cigar, int is_end_correct) +{ + uint16_t *cigar = 0; + ubyte_t *ref_seq; + int l = 0, path_len, ref_len; + AlnParam ap = aln_param_bwa; + path_t *path; + int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos; + + ref_len = len + abs(ext); + if (ext > 0) { + ref_seq = (ubyte_t*)calloc(ref_len, 1); + for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) + ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + } else { + int64_t x = __pos + (is_end_correct? len : ref_len); + ref_seq = (ubyte_t*)calloc(ref_len, 1); + for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) + ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + } + path = (path_t*)calloc(l+len, sizeof(path_t)); + + aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); + cigar = aln_path2cigar(path, path_len, n_cigar); + + if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand + for (l = k = 0; k < *n_cigar; ++k) { + if (cigar[k]>>14 == FROM_D) l -= cigar[k]&0x3fff; + else if (cigar[k]>>14 == FROM_I) l += cigar[k]&0x3fff; + } + __pos += l; + } + + if (cigar[0]>>14 == FROM_D) { // deletion at the 5'-end + __pos += cigar[0]&0x3fff; + for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1]; + --(*n_cigar); + } + if (cigar[*n_cigar-1]>>14 == FROM_D) --(*n_cigar); // deletion at the 3'-end + + // change "I" at either end of the read to S. just in case. This should rarely happen... + if (cigar[*n_cigar-1]>>14 == FROM_I) cigar[*n_cigar-1] = 3<<14 | (cigar[*n_cigar-1]&0x3fff); + if (cigar[0]>>14 == FROM_I) cigar[0] = 3<<14 | (cigar[0]&0x3fff); + + *_pos = (bwtint_t)__pos; + free(ref_seq); free(path); + return cigar; +} + +char *bwa_cal_md1(int n_cigar, uint16_t *cigar, int len, bwtint_t pos, ubyte_t *seq, + bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm) +{ + bwtint_t x, y; + int z, u, c, nm = 0; + str->l = 0; // reset + x = pos; y = 0; + if (cigar) { + int k, l; + for (k = u = 0; k < n_cigar; ++k) { + l = cigar[k]&0x3fff; + if (cigar[k]>>14 == FROM_M) { + for (z = 0; z < l && x+z < l_pac; ++z) { + c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { + ksprintf(str, "%d", u); + kputc("ACGTN"[c], str); + ++nm; + u = 0; + } else ++u; + } + x += l; y += l; + } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { + y += l; nm += l; + } else if (cigar[k]>>14 == FROM_D) { + ksprintf(str, "%d", u); + kputc('^', str); + for (z = 0; z < l && x+z < l_pac; ++z) + kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str); + u = 0; + x += l; nm += l; + } + } + } else { // no gaps + for (z = u = 0; z < (bwtint_t)len; ++z) { + c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { + ksprintf(str, "%d", u); + kputc("ACGTN"[c], str); + ++nm; + u = 0; + } else ++u; + } + } + ksprintf(str, "%d", u); + *_nm = nm; + return strdup(str->s); +} + +void bwa_correct_trimmed(bwa_seq_t *s) +{ + if (s->len == s->full_len) return; + if (s->strand == 0) { // forward + if (s->cigar && s->cigar[s->n_cigar-1]>>14 == 3) { // the last is S + s->cigar[s->n_cigar-1] += s->full_len - s->len; + } else { + if (s->cigar == 0) { + s->n_cigar = 2; + s->cigar = calloc(s->n_cigar, 2); + s->cigar[0] = 0<<14 | s->len; + } else { + ++s->n_cigar; + s->cigar = realloc(s->cigar, s->n_cigar * 2); + } + s->cigar[s->n_cigar-1] = 3<<14 | (s->full_len - s->len); + } + } else { // reverse + if (s->cigar && s->cigar[0]>>14 == 3) { // the first is S + s->cigar[0] += s->full_len - s->len; + } else { + if (s->cigar == 0) { + s->n_cigar = 2; + s->cigar = calloc(s->n_cigar, 2); + s->cigar[1] = 0<<14 | s->len; + } else { + ++s->n_cigar; + s->cigar = realloc(s->cigar, s->n_cigar * 2); + memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * 2); + } + s->cigar[0] = 3<<14 | (s->full_len - s->len); + } + } + s->len = s->full_len; +} + +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns) +{ + ubyte_t *pacseq, *ntpac = 0; + int i; + kstring_t *str; + + if (ntbns) { // in color space + ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1); + rewind(ntbns->fp_pac); + fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac); + } + + if (!_pacseq) { + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + rewind(bns->fp_pac); + fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + } else pacseq = _pacseq; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! + if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; + s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, + (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); + } + + if (ntbns) { // in color space + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + bwa_cs2nt_core(s, bns->l_pac, ntpac); + if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again + free(s->cigar); + s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, + (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); + } + } + } + + // generate MD tag + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + if (s->type != BWA_TYPE_NO_MATCH) { + int nm; + s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, + bns->l_pac, ntbns? ntpac : pacseq, str, &nm); + s->nm = nm; + } + } + free(str->s); free(str); + + // correct for trimmed reads + for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); + + if (!_pacseq) free(pacseq); + free(ntpac); +} + +int64_t pos_end(const bwa_seq_t *p) +{ + if (p->cigar) { + int j; + int64_t x = p->pos; + for (j = 0; j != p->n_cigar; ++j) { + int op = p->cigar[j]>>14; + if (op == 0 || op == 2) x += p->cigar[j]&0x3fff; + } + return x; + } else return p->pos + p->len; +} + +static int64_t pos_5(const bwa_seq_t *p) +{ + if (p->type != BWA_TYPE_NO_MATCH) + return p->strand? pos_end(p) : p->pos; + return -1; +} + +void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) +{ + int j; + if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) { + int seqid, nn, am = 0, flag = p->extra_flag; + char XT; + + if (p->type == BWA_TYPE_NO_MATCH) { + p->pos = mate->pos; + p->strand = mate->strand; + flag |= SAM_FSU; + j = 1; + } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment + + // get seqid + nn = bns_coor_pac2real(bns, p->pos, j, &seqid); + + // update flag and print it + if (p->strand) flag |= SAM_FSR; + if (mate) { + if (mate->type != BWA_TYPE_NO_MATCH) { + if (mate->strand) flag |= SAM_FMR; + } else flag |= SAM_FMU; + } + printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name); + printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ); + + // print CIGAR + if (p->cigar) { + for (j = 0; j != p->n_cigar; ++j) + printf("%d%c", p->cigar[j]&0x3fff, "MIDS"[p->cigar[j]>>14]); + } else if (p->type == BWA_TYPE_NO_MATCH) printf("*"); + else printf("%dM", p->len); + + // print mate coordinate + if (mate && mate->type != BWA_TYPE_NO_MATCH) { + int m_seqid, m_is_N; + long long isize; + am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality + // redundant calculation here, but should not matter too much + m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid); + printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); + isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0; + if (p->type == BWA_TYPE_NO_MATCH) isize = 0; + printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize); + } else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1)); + else printf("\t*\t0\t0\t"); + + // print sequence and quality + if (p->strand == 0) + for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]); + else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]); + putchar('\t'); + if (p->qual) { + if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality + printf("%s", p->qual); + } else printf("*"); + + if (p->type != BWA_TYPE_NO_MATCH) { + // calculate XT tag + XT = "NURM"[p->type]; + if (nn > 10) XT = 'N'; + // print tags + printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm); + if (nn) printf("\tXN:i:%d", nn); + if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am); + if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment + printf("\tX0:i:%d", p->c1); + if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2); + } + printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape); + if (p->md) printf("\tMD:Z:%s", p->md); + } + putchar('\n'); + } else { // this read has no match + ubyte_t *s = p->strand? p->rseq : p->seq; + int flag = p->extra_flag | SAM_FSU; + if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU; + printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); + for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); + putchar('\t'); + if (p->qual) { + if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality + printf("%s", p->qual); + } else printf("*"); + putchar('\n'); + } +} + +bntseq_t *bwa_open_nt(const char *prefix) +{ + bntseq_t *ntbns; + char *str; + str = (char*)calloc(strlen(prefix) + 10, 1); + strcat(strcpy(str, prefix), ".nt"); + ntbns = bns_restore(str); + free(str); + return ntbns; +} + +void bwa_print_sam_SQ(const bntseq_t *bns) +{ + int i; + for (i = 0; i < bns->n_seqs; ++i) + printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); +} + +void bwase_initialize() +{ + int i; + for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); +} + +void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa) +{ + int i, n_seqs, tot_seqs = 0, m_aln; + bwt_aln1_t *aln = 0; + bwa_seq_t *seqs; + bwa_seqio_t *ks; + clock_t t; + bntseq_t *bns, *ntbns = 0; + FILE *fp_sa; + gap_opt_t opt; + + // initialization + bwase_initialize(); + for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); + bns = bns_restore(prefix); + srand48(bns->seed); + ks = bwa_seq_open(fn_fa); + fp_sa = xopen(fn_sa, "r"); + + // core loop + m_aln = 0; + fread(&opt, sizeof(gap_opt_t), 1, fp_sa); + if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac + ntbns = bwa_open_nt(prefix); + bwa_print_sam_SQ(bns); + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { + tot_seqs += n_seqs; + t = clock(); + + // read alignment + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + int n_aln; + fread(&n_aln, 4, 1, fp_sa); + if (n_aln > m_aln) { + m_aln = n_aln; + aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); + } + fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); + bwa_aln2seq(n_aln, aln, p); + } + + fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... "); + bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); + bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] print alignments... "); + for (i = 0; i < n_seqs; ++i) + bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + bwa_free_read_seq(n_seqs, seqs); + fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); + } + + // destroy + bwa_seq_close(ks); + if (ntbns) bns_destroy(ntbns); + bns_destroy(bns); + fclose(fp_sa); + free(aln); +} + +static void print_aln_simple(bwt_t *const bwt[2], const bntseq_t *bns, const bwt_aln1_t *q, int len, bwtint_t k) +{ + bwtint_t pos; + int seqid, is_N; + pos = q->a? bwt_sa(bwt[0], k) : bwt[1]->seq_len - (bwt_sa(bwt[1], k) + len); + if (pos > bwt[1]->seq_len) pos = 0; // negative + is_N = bns_coor_pac2real(bns, pos, len, &seqid); + printf("%s\t%c%d\t%d\n", bns->anns[seqid].name, "+-"[q->a], (int)(pos - bns->anns[seqid].offset + 1), + q->n_mm + q->n_gapo + q->n_gape); +} + +void bwa_print_all_hits(const char *prefix, const char *fn_sa, const char *fn_fa, int max_occ) +{ + int i, n_seqs, tot_seqs = 0, m_aln; + bwt_aln1_t *aln = 0; + bwa_seq_t *seqs; + bwa_seqio_t *ks; + bntseq_t *bns; + FILE *fp_sa; + gap_opt_t opt; + bwt_t *bwt[2]; + + bns = bns_restore(prefix); + srand48(bns->seed); + ks = bwa_seq_open(fn_fa); + fp_sa = xopen(fn_sa, "r"); + { // load BWT + char *str = (char*)calloc(strlen(prefix) + 10, 1); + strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]); + strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]); + free(str); + } + + m_aln = 0; + fread(&opt, sizeof(gap_opt_t), 1, fp_sa); + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { + tot_seqs += n_seqs; + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + int n_aln, n_occ, k, rest, len; + len = p->len; + fread(&n_aln, 4, 1, fp_sa); + if (n_aln > m_aln) { + m_aln = n_aln; + aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); + } + fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); + for (k = n_occ = 0; k < n_aln; ++k) { + const bwt_aln1_t *q = aln + k; + n_occ += q->l - q->k + 1; + } + rest = n_occ > max_occ? max_occ : n_occ; + printf(">%s %d %d\n", p->name, rest, n_occ); + for (k = 0; k < n_aln; ++k) { + const bwt_aln1_t *q = aln + k; + if (q->l - q->k + 1 <= rest) { + bwtint_t l; + for (l = q->k; l <= q->l; ++l) + print_aln_simple(bwt, bns, q, len, l); + rest -= q->l - q->k + 1; + } else { // See also: http://code.activestate.com/recipes/272884/ + int j, i, k; + for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) { + double p = 1.0, x = drand48(); + while (x < p) p -= p * j / (i--); + print_aln_simple(bwt, bns, q, len, q->l - i); + } + rest = 0; + break; + } + } + } + bwa_free_read_seq(n_seqs, seqs); + } + bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); + bwa_seq_close(ks); + bns_destroy(bns); + fclose(fp_sa); + free(aln); +} + +int bwa_sai2sam_se(int argc, char *argv[]) +{ + int c, n_occ = 0; + while ((c = getopt(argc, argv, "hn:")) >= 0) { + switch (c) { + case 'h': break; + case 'n': n_occ = atoi(optarg); break; + default: return 1; + } + } + + if (optind + 3 > argc) { + fprintf(stderr, "Usage: bwa samse [-n max_occ] \n"); + return 1; + } + if (n_occ > 0) bwa_print_all_hits(argv[optind], argv[optind+1], argv[optind+2], n_occ); + else bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2]); + return 0; +} diff --git a/c/bwa/bwase.h b/c/bwa/bwase.h new file mode 100755 index 000000000..54a10b3d7 --- /dev/null +++ b/c/bwa/bwase.h @@ -0,0 +1,52 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef BWASE_H +#define BWASE_H + +#include "bntseq.h" +#include "bwt.h" +#include "bwtaln.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Initialize mapping tables in the bwa single-end mapper. +void bwase_initialize(); +// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. +void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr); +// Refine the approximate position of the sequence to an actual placement for the sequence. +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); +// Backfill certain alignment properties mainly centering around number of matches. +void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); +// Calculate the end position of a read given a certain sequence. +int64_t pos_end(const bwa_seq_t *p); + +#ifdef __cplusplus +} +#endif + +#endif // BWASE_H diff --git a/c/bwa/bwtaln.c b/c/bwa/bwtaln.c new file mode 100755 index 000000000..8894338e0 --- /dev/null +++ b/c/bwa/bwtaln.c @@ -0,0 +1,295 @@ +#include +#include +#include +#include +#include +#include +#include +#include "bwtaln.h" +#include "bwtgap.h" +#include "utils.h" + +#ifdef HAVE_PTHREAD +#define THREAD_BLOCK_SIZE 1024 +#include +static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER; +#endif + +gap_opt_t *gap_init_opt() +{ + gap_opt_t *o; + o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t)); + /* IMPORTANT: s_mm*10 should be about the average base error + rate. Voilating this requirement will break pairing! */ + o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; + o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6; + o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000; + o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD; + o->seed_len = 0x7fffffff; o->max_seed_diff = 2; + o->fnr = 0.04; + o->n_threads = 1; + o->max_top2 = 30; + o->trim_qual = 0; + return o; +} + +int bwa_cal_maxdiff(int l, double err, double thres) +{ + double elambda = exp(-l * err); + double sum, y = 1.0; + int k, x = 1; + for (k = 1, sum = elambda; k < 1000; ++k) { + y *= l * err; + x *= k; + sum += elambda * y / x; + if (1.0 - sum < thres) return k; + } + return 2; +} + +// width must be filled as zero +static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width) +{ + bwtint_t k, l, ok, ol; + int i, bid; + bid = 0; + k = 0; l = rbwt->seq_len; + for (i = 0; i < len; ++i) { + ubyte_t c = str[i]; + if (c < 4) { + bwt_2occ(rbwt, k - 1, l, c, &ok, &ol); + k = rbwt->L2[c] + ok + 1; + l = rbwt->L2[c] + ol; + } + if (k > l || c > 3) { // then restart + k = 0; + l = rbwt->seq_len; + ++bid; + } + width[i].w = l - k + 1; + width[i].bid = bid; + } + width[len].w = 0; + width[len].bid = ++bid; + return bid; +} + +void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) +{ + int i, max_l = 0, max_len; + gap_stack_t *stack; + bwt_width_t *w[2], *seed_w[2]; + const ubyte_t *seq[2]; + gap_opt_t local_opt = *opt; + + // initiate priority stack + for (i = max_len = 0; i != n_seqs; ++i) + if (seqs[i].len > max_len) max_len = seqs[i].len; + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); + if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; + stack = gap_init_stack(opt->max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); + + seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); + seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); + w[0] = w[1] = 0; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; +#ifdef HAVE_PTHREAD + if (opt->n_threads > 1) { + pthread_mutex_lock(&g_seq_lock); + if (p->tid < 0) { // unassigned + int j; + for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j) + seqs[j].tid = tid; + } else if (p->tid != tid) { + pthread_mutex_unlock(&g_seq_lock); + continue; + } + pthread_mutex_unlock(&g_seq_lock); + } +#endif + p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; + seq[0] = p->seq; seq[1] = p->rseq; + if (max_l < p->len) { + max_l = p->len; + w[0] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t)); + w[1] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t)); + } + bwt_cal_width(bwt[0], p->len, seq[0], w[0]); + bwt_cal_width(bwt[1], p->len, seq[1], w[1]); + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); + local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; + if (p->len > opt->seed_len) { + bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]); + bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]); + } + // core function + p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); + // store the alignment + free(p->name); free(p->seq); free(p->rseq); free(p->qual); + p->name = 0; p->seq = p->rseq = p->qual = 0; + } + free(seed_w[0]); free(seed_w[1]); + free(w[0]); free(w[1]); + gap_destroy_stack(stack); +} + +#ifdef HAVE_PTHREAD +typedef struct { + int tid; + bwt_t *bwt[2]; + int n_seqs; + bwa_seq_t *seqs; + const gap_opt_t *opt; +} thread_aux_t; + +static void *worker(void *data) +{ + thread_aux_t *d = (thread_aux_t*)data; + bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); + return 0; +} +#endif + +void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) +{ + int i, n_seqs, tot_seqs = 0; + bwa_seq_t *seqs; + bwa_seqio_t *ks; + clock_t t; + bwt_t *bwt[2]; + + // initialization + ks = bwa_seq_open(fn_fa); + + { // load BWT + char *str = (char*)calloc(strlen(prefix) + 10, 1); + strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); + free(str); + } + + // core loop + fwrite(opt, sizeof(gap_opt_t), 1, stdout); + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode & BWA_MODE_COMPREAD, opt->trim_qual)) != 0) { + tot_seqs += n_seqs; + t = clock(); + + fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); + +#ifdef HAVE_PTHREAD + if (opt->n_threads <= 1) { // no multi-threading at all + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); + } else { + pthread_t *tid; + pthread_attr_t attr; + thread_aux_t *data; + int j; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (j = 0; j < opt->n_threads; ++j) { + data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1]; + data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; + pthread_create(&tid[j], &attr, worker, data + j); + } + for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + free(data); free(tid); + } +#else + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); +#endif + + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + t = clock(); + fprintf(stderr, "[bwa_aln_core] write to the disk... "); + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + fwrite(&p->n_aln, 4, 1, stdout); + if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); + } + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + bwa_free_read_seq(n_seqs, seqs); + fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); + } + + // destroy + bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); + bwa_seq_close(ks); +} + +int bwa_aln(int argc, char *argv[]) +{ + int c, opte = -1; + gap_opt_t *opt; + + opt = gap_init_opt(); + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:")) >= 0) { + switch (c) { + case 'n': + if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; + else opt->max_diff = atoi(optarg), opt->fnr = -1.0; + break; + case 'o': opt->max_gapo = atoi(optarg); break; + case 'e': opte = atoi(optarg); break; + case 'M': opt->s_mm = atoi(optarg); break; + case 'O': opt->s_gapo = atoi(optarg); break; + case 'E': opt->s_gape = atoi(optarg); break; + case 'd': opt->max_del_occ = atoi(optarg); break; + case 'i': opt->indel_end_skip = atoi(optarg); break; + case 'l': opt->seed_len = atoi(optarg); break; + case 'k': opt->max_seed_diff = atoi(optarg); break; + case 'm': opt->max_entries = atoi(optarg); break; + case 't': opt->n_threads = atoi(optarg); break; + case 'L': opt->mode |= BWA_MODE_LOGGAP; break; + case 'R': opt->max_top2 = atoi(optarg); break; + case 'q': opt->trim_qual = atoi(optarg); break; + case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; + case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; + default: return 1; + } + } + if (opte > 0) { + opt->max_gape = opte; + opt->mode &= ~BWA_MODE_GAPE; + } + + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa aln [options] \n\n"); + fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", + BWA_AVG_ERR, opt->fnr); + fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); + fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); + fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); + fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); + fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len); + fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); + fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); + fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); + fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); + fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); + fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); + fprintf(stderr, " -c input sequences are in the color space\n"); + fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); + fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); + fprintf(stderr, "\n"); + return 1; + } + if (opt->fnr > 0.0) { + int i, k; + for (i = 17, k = 0; i <= 250; ++i) { + int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); + if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); + k = l; + } + } + bwa_aln_core(argv[optind], argv[optind+1], opt); + free(opt); + return 0; +} diff --git a/c/bwa/bwtaln.h b/c/bwa/bwtaln.h new file mode 100755 index 000000000..68c1c4f1f --- /dev/null +++ b/c/bwa/bwtaln.h @@ -0,0 +1,113 @@ +#ifndef BWTALN_H +#define BWTALN_H + +#include +#include "bwt.h" + +#define BWA_TYPE_NO_MATCH 0 +#define BWA_TYPE_UNIQUE 1 +#define BWA_TYPE_REPEAT 2 +#define BWA_TYPE_MATESW 3 + +#define SAM_FPD 1 // paired +#define SAM_FPP 2 // properly paired +#define SAM_FSU 4 // self-unmapped +#define SAM_FMU 8 // mate-unmapped +#define SAM_FSR 16 // self on the reverse strand +#define SAM_FMR 32 // mate on the reverse strand +#define SAM_FR1 64 // this is read one +#define SAM_FR2 128 // this is read two +#define SAM_FSC 256 // secondary alignment + +#define BWA_AVG_ERR 0.02 +#define BWA_MIN_RDLEN 35 // for read trimming + +#ifndef bns_pac +#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) +#endif + +typedef struct { + bwtint_t w; + int bid; +} bwt_width_t; + +typedef struct { + uint32_t n_mm:8, n_gapo:8, n_gape:8, a:1; + bwtint_t k, l; + int score; +} bwt_aln1_t; + +typedef struct { + char *name; + ubyte_t *seq, *rseq, *qual; + uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8; + uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8; + int score; + // alignments in SA coordinates + int n_aln; + bwt_aln1_t *aln; + // alignment information + bwtint_t sa, pos; + uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ + int n_cigar; + uint16_t *cigar; + // for multi-threading only + int tid; + // NM and MD tags + uint32_t full_len:20, nm:12; + char *md; +} bwa_seq_t; + +#define BWA_MODE_GAPE 0x01 +#define BWA_MODE_COMPREAD 0x02 +#define BWA_MODE_LOGGAP 0x04 +#define BWA_MODE_NONSTOP 0x10 + +typedef struct { + int s_mm, s_gapo, s_gape; + int mode; + int indel_end_skip, max_del_occ, max_entries; + float fnr; + int max_diff, max_gapo, max_gape; + int max_seed_diff, seed_len; + int n_threads; + int max_top2; + int trim_qual; +} gap_opt_t; + +#define BWA_PET_STD 1 +#define BWA_PET_SOLID 2 + +typedef struct { + int max_isize; + int max_occ; + int type, is_sw; +} pe_opt_t; + +struct __bwa_seqio_t; +typedef struct __bwa_seqio_t bwa_seqio_t; + +#ifdef __cplusplus +extern "C" { +#endif + + gap_opt_t *gap_init_opt(); + void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt); + + bwa_seqio_t *bwa_seq_open(const char *fn); + void bwa_seq_close(bwa_seqio_t *bs); + void seq_reverse(int len, ubyte_t *seq, int is_comp); + bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int is_comp, int trim_qual); + void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs); + + int bwa_cal_maxdiff(int l, double err, double thres); + + void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac); + + void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt); + +#ifdef __cplusplus +} +#endif + +#endif