diff --git a/c/bwa/bntseq.c b/c/bwa/bntseq.c deleted file mode 100755 index 9db4a5242..000000000 --- a/c/bwa/bntseq.c +++ /dev/null @@ -1,302 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#include -#include -#include -#include -#include "bntseq.h" -#include "main.h" -#include "utils.h" - -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - -unsigned char nst_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -void bns_dump(const bntseq_t *bns, const char *prefix) -{ - char str[1024]; - FILE *fp; - int i; - { // dump .ann - strcpy(str, prefix); strcat(str, ".ann"); - fp = xopen(str, "w"); - fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); - for (i = 0; i != bns->n_seqs; ++i) { - bntann1_t *p = bns->anns + i; - fprintf(fp, "%d %s", p->gi, p->name); - if (p->anno[0]) fprintf(fp, " %s\n", p->anno); - else fprintf(fp, "\n"); - fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); - } - fclose(fp); - } - { // dump .amb - strcpy(str, prefix); strcat(str, ".amb"); - fp = xopen(str, "w"); - fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); - for (i = 0; i != bns->n_holes; ++i) { - bntamb1_t *p = bns->ambs + i; - fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); - } - fclose(fp); - } -} - -bntseq_t *bns_restore(const char *prefix) -{ - char ann_filename[1024], amb_filename[1024], pac_filename[1024]; - strcpy(ann_filename,prefix); - strcat(ann_filename,".ann"); - strcpy(amb_filename,prefix); - strcat(amb_filename,".amb"); - strcpy(pac_filename,prefix); - strcat(pac_filename,".pac"); - - return bns_restore_core(ann_filename, amb_filename, pac_filename); -} - -bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) -{ - char buffer[1024]; - FILE *fp; - bntseq_t *bns; - int i; - bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); - { // read .ann - fp = xopen(ann_filename, "r"); - fscanf(fp, "%lld%d%u", &bns->l_pac, &bns->n_seqs, &bns->seed); - bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); - for (i = 0; i < bns->n_seqs; ++i) { - bntann1_t *p = bns->anns + i; - char *q = buffer; - int c; - // read gi and sequence name - fscanf(fp, "%u%s", &p->gi, buffer); - p->name = strdup(buffer); - // read fasta comments - while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; - *q = 0; - if (q - buffer > 1) p->anno = strdup(buffer + 1); // skip leading space - else p->anno = strdup(""); - // read the rest - fscanf(fp, "%lld%d%d", &p->offset, &p->len, &p->n_ambs); - } - fclose(fp); - } - { // read .amb - int64_t l_pac; - int32_t n_seqs; - fp = xopen(amb_filename, "r"); - fscanf(fp, "%lld%d%d", &l_pac, &n_seqs, &bns->n_holes); - xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); - bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); - for (i = 0; i < bns->n_holes; ++i) { - bntamb1_t *p = bns->ambs + i; - fscanf(fp, "%lld%d%s", &p->offset, &p->len, buffer); - p->amb = buffer[0]; - } - fclose(fp); - } - { // open .pac - bns->fp_pac = xopen(pac_filename, "rb"); - } - return bns; -} - -void bns_destroy(bntseq_t *bns) -{ - if (bns == 0) return; - else { - int i; - if (bns->fp_pac) fclose(bns->fp_pac); - free(bns->ambs); - for (i = 0; i < bns->n_seqs; ++i) { - free(bns->anns[i].name); - free(bns->anns[i].anno); - } - free(bns->anns); - free(bns); - } -} - -void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) -{ - kseq_t *seq; - char name[1024]; - bntseq_t *bns; - bntamb1_t *q; - int l_buf; - unsigned char buf[0x10000]; - int32_t m_seqs, m_holes, l, i; - FILE *fp; - - // initialization - seq = kseq_init(fp_fa); - bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); - bns->seed = 11; // fixed seed for random generator - srand48(bns->seed); - m_seqs = m_holes = 8; - bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); - bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); - q = bns->ambs; - l_buf = 0; - strcpy(name, prefix); strcat(name, ".pac"); - fp = xopen(name, "wb"); - memset(buf, 0, 0x10000); - // read sequences - while ((l = kseq_read(seq)) >= 0) { - bntann1_t *p; - int lasts; - if (bns->n_seqs == m_seqs) { - m_seqs <<= 1; - bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t)); - } - p = bns->anns + bns->n_seqs; - p->name = strdup((char*)seq->name.s); - p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); - p->gi = 0; p->len = l; - p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; - p->n_ambs = 0; - for (i = 0, lasts = 0; i < l; ++i) { - int c = nst_nt4_table[(int)seq->seq.s[i]]; - if (c >= 4) { // N - if (lasts == seq->seq.s[i]) { // contiguous N - ++q->len; - } else { - if (bns->n_holes == m_holes) { - m_holes <<= 1; - bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t)); - } - q = bns->ambs + bns->n_holes; - q->len = 1; - q->offset = p->offset + i; - q->amb = seq->seq.s[i]; - ++p->n_ambs; - ++bns->n_holes; - } - } - lasts = seq->seq.s[i]; - { // fill buffer - if (c >= 4) c = lrand48()&0x3; - if (l_buf == 0x40000) { - fwrite(buf, 1, 0x10000, fp); - memset(buf, 0, 0x10000); - l_buf = 0; - } - buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1); - ++l_buf; - } - } - ++bns->n_seqs; - bns->l_pac += seq->seq.l; - } - xassert(bns->l_pac, "zero length sequence."); - { // finalize .pac file - ubyte_t ct; - fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp); - // the following codes make the pac file size always (l_pac/4+1+1) - if (bns->l_pac % 4 == 0) { - ct = 0; - fwrite(&ct, 1, 1, fp); - } - ct = bns->l_pac % 4; - fwrite(&ct, 1, 1, fp); - // close .pac file - fclose(fp); - } - bns_dump(bns, prefix); - bns_destroy(bns); - kseq_destroy(seq); -} - -int bwa_fa2pac(int argc, char *argv[]) -{ - gzFile fp; - if (argc < 2) { - fprintf(stderr, "Usage: bwa fa2pac []\n"); - return 1; - } - fp = xzopen(argv[1], "r"); - bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]); - gzclose(fp); - return 0; -} - -int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq) -{ - int left, mid, right, nn; - if (pac_coor >= bns->l_pac) - err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac); - // binary search for the sequence ID. Note that this is a bit different from the following one... - left = 0; mid = 0; right = bns->n_seqs; - while (left < right) { - mid = (left + right) >> 1; - if (pac_coor >= bns->anns[mid].offset) { - if (mid == bns->n_seqs - 1) break; - if (pac_coor < bns->anns[mid+1].offset) break; - left = mid + 1; - } else right = mid; - } - *real_seq = mid; - // binary search for holes - left = 0; right = bns->n_holes; nn = 0; - while (left < right) { - int64_t mid = (left + right) >> 1; - if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; - else if (pac_coor + len <= bns->ambs[mid].offset) right = mid; - else { // overlap - if (pac_coor >= bns->ambs[mid].offset) { - nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? - bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len; - } else { - nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? - bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor); - } - break; - } - } - return nn; -} diff --git a/c/bwa/bntseq.h b/c/bwa/bntseq.h deleted file mode 100755 index 21b831ee4..000000000 --- a/c/bwa/bntseq.h +++ /dev/null @@ -1,80 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#ifndef BWT_BNTSEQ_H -#define BWT_BNTSEQ_H - -#include -#include - -#ifndef BWA_UBYTE -#define BWA_UBYTE -typedef uint8_t ubyte_t; -#endif - -typedef struct { - int64_t offset; - int32_t len; - int32_t n_ambs; - uint32_t gi; - char *name, *anno; -} bntann1_t; - -typedef struct { - int64_t offset; - int32_t len; - char amb; -} bntamb1_t; - -typedef struct { - int64_t l_pac; - int32_t n_seqs; - uint32_t seed; - bntann1_t *anns; // n_seqs elements - int32_t n_holes; - bntamb1_t *ambs; // n_holes elements - FILE *fp_pac; -} bntseq_t; - -extern unsigned char nst_nt4_table[256]; - -#ifdef __cplusplus -extern "C" { -#endif - - void bns_dump(const bntseq_t *bns, const char *prefix); - bntseq_t *bns_restore(const char *prefix); - bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); - void bns_destroy(bntseq_t *bns); - void bns_fasta2bntseq(gzFile fp_fa, const char *prefix); - int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/c/bwa/build_mac.sh b/c/bwa/build_mac.sh index 4a0527288..bfed900bb 100644 --- a/c/bwa/build_mac.sh +++ b/c/bwa/build_mac.sh @@ -1,5 +1,5 @@ #!/bin/sh -export BWA_HOME="/Users/mhanna/src/bwa-0.5.3" +export BWA_HOME="/Users/mhanna/src/bwa" export JAVA_INCLUDE="/System/Library/Frameworks/JavaVM.framework/Headers" export TARGET_LIB="libbwa.dylib" export EXTRA_LIBS="-lc -lz -lsupc++" diff --git a/c/bwa/bwase.c b/c/bwa/bwase.c deleted file mode 100755 index 014e50bc4..000000000 --- a/c/bwa/bwase.c +++ /dev/null @@ -1,597 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "bwase.h" -#include "stdaln.h" -#include "bntseq.h" -#include "utils.h" -#include "kstring.h" - -static int g_log_n[256]; - -void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) -{ - int i, cnt, best; - if (n_aln == 0) { - s->type = BWA_TYPE_NO_MATCH; - s->c1 = s->c2 = 0; - return; - } - best = aln[0].score; - for (i = cnt = 0; i < n_aln; ++i) { - const bwt_aln1_t *p = aln + i; - if (p->score > best) break; - if (drand48() * (p->l - p->k + 1) > (double)cnt) { - s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a; - s->score = p->score; - s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); - } - cnt += p->l - p->k + 1; - } - s->c1 = cnt; - for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1; - s->c2 = cnt - s->c1; - s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; -} - -int bwa_approx_mapQ(const bwa_seq_t *p, int mm) -{ - int n; - if (p->c1 == 0) return 23; - if (p->c1 > 1) return 0; - if (p->n_mm == mm) return 25; - if (p->c2 == 0) return 37; - n = (p->c2 >= 255)? 255 : p->c2; - return (23 < g_log_n[n])? 0 : 23 - g_log_n[n]; -} - -void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) -{ - int i; - char str[1024]; - bwt_t *bwt; - // load forward SA - strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p = seqs + i; - if(p->strand) // reverse strand only - bwa_cal_pac_pos_core(bwt, NULL, p, max_mm, fnr); - } - bwt_destroy(bwt); - // load reverse BWT and SA - strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt); - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p = seqs + i; - if(!p->strand) // forward strand only - bwa_cal_pac_pos_core(NULL, bwt, p, max_mm, fnr); - } - bwt_destroy(bwt); -} - -/** - * Derive the actual position in the read from the given suffix array coordinates. - * Note that the position will be approximate based on whether indels appear in the - * read and whether calculations are performed from the start or end of the read. - */ -void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr) { - if(seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) - return; - - int max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; - if(seq->strand) { // reverse strand only - seq->pos = bwt_sa(forward_bwt, seq->sa); - seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); - } - else { // forward strand only - /* NB: For gapped alignment, p->pos may not be correct, - * which will be fixed in refine_gapped_core(). This - * line also determines the way "x" is calculated in - * refine_gapped_core() when (ext < 0 && is_end == 0). */ - seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len); - seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); - } -} - -/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on - * forward strand. This happens when p->pos is calculated by - * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct - * coordinate. This happens only for color-converted alignment. */ -static uint16_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, - int ext, int *n_cigar, int is_end_correct) -{ - uint16_t *cigar = 0; - ubyte_t *ref_seq; - int l = 0, path_len, ref_len; - AlnParam ap = aln_param_bwa; - path_t *path; - int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos; - - ref_len = len + abs(ext); - if (ext > 0) { - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - } else { - int64_t x = __pos + (is_end_correct? len : ref_len); - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - } - path = (path_t*)calloc(l+len, sizeof(path_t)); - - aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); - cigar = aln_path2cigar(path, path_len, n_cigar); - - if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand - for (l = k = 0; k < *n_cigar; ++k) { - if (cigar[k]>>14 == FROM_D) l -= cigar[k]&0x3fff; - else if (cigar[k]>>14 == FROM_I) l += cigar[k]&0x3fff; - } - __pos += l; - } - - if (cigar[0]>>14 == FROM_D) { // deletion at the 5'-end - __pos += cigar[0]&0x3fff; - for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1]; - --(*n_cigar); - } - if (cigar[*n_cigar-1]>>14 == FROM_D) --(*n_cigar); // deletion at the 3'-end - - // change "I" at either end of the read to S. just in case. This should rarely happen... - if (cigar[*n_cigar-1]>>14 == FROM_I) cigar[*n_cigar-1] = 3<<14 | (cigar[*n_cigar-1]&0x3fff); - if (cigar[0]>>14 == FROM_I) cigar[0] = 3<<14 | (cigar[0]&0x3fff); - - *_pos = (bwtint_t)__pos; - free(ref_seq); free(path); - return cigar; -} - -char *bwa_cal_md1(int n_cigar, uint16_t *cigar, int len, bwtint_t pos, ubyte_t *seq, - bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm) -{ - bwtint_t x, y; - int z, u, c, nm = 0; - str->l = 0; // reset - x = pos; y = 0; - if (cigar) { - int k, l; - for (k = u = 0; k < n_cigar; ++k) { - l = cigar[k]&0x3fff; - if (cigar[k]>>14 == FROM_M) { - for (z = 0; z < l && x+z < l_pac; ++z) { - c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; - if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { - ksprintf(str, "%d", u); - kputc("ACGTN"[c], str); - ++nm; - u = 0; - } else ++u; - } - x += l; y += l; - } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { - y += l; nm += l; - } else if (cigar[k]>>14 == FROM_D) { - ksprintf(str, "%d", u); - kputc('^', str); - for (z = 0; z < l && x+z < l_pac; ++z) - kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str); - u = 0; - x += l; nm += l; - } - } - } else { // no gaps - for (z = u = 0; z < (bwtint_t)len; ++z) { - c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; - if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { - ksprintf(str, "%d", u); - kputc("ACGTN"[c], str); - ++nm; - u = 0; - } else ++u; - } - } - ksprintf(str, "%d", u); - *_nm = nm; - return strdup(str->s); -} - -void bwa_correct_trimmed(bwa_seq_t *s) -{ - if (s->len == s->full_len) return; - if (s->strand == 0) { // forward - if (s->cigar && s->cigar[s->n_cigar-1]>>14 == 3) { // the last is S - s->cigar[s->n_cigar-1] += s->full_len - s->len; - } else { - if (s->cigar == 0) { - s->n_cigar = 2; - s->cigar = calloc(s->n_cigar, 2); - s->cigar[0] = 0<<14 | s->len; - } else { - ++s->n_cigar; - s->cigar = realloc(s->cigar, s->n_cigar * 2); - } - s->cigar[s->n_cigar-1] = 3<<14 | (s->full_len - s->len); - } - } else { // reverse - if (s->cigar && s->cigar[0]>>14 == 3) { // the first is S - s->cigar[0] += s->full_len - s->len; - } else { - if (s->cigar == 0) { - s->n_cigar = 2; - s->cigar = calloc(s->n_cigar, 2); - s->cigar[1] = 0<<14 | s->len; - } else { - ++s->n_cigar; - s->cigar = realloc(s->cigar, s->n_cigar * 2); - memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * 2); - } - s->cigar[0] = 3<<14 | (s->full_len - s->len); - } - } - s->len = s->full_len; -} - -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns) -{ - ubyte_t *pacseq, *ntpac = 0; - int i; - kstring_t *str; - - if (ntbns) { // in color space - ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1); - rewind(ntbns->fp_pac); - fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac); - } - - if (!_pacseq) { - pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - rewind(bns->fp_pac); - fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); - } else pacseq = _pacseq; - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! - if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); - } - - if (ntbns) { // in color space - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - bwa_cs2nt_core(s, bns->l_pac, ntpac); - if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again - free(s->cigar); - s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); - } - } - } - - // generate MD tag - str = (kstring_t*)calloc(1, sizeof(kstring_t)); - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - if (s->type != BWA_TYPE_NO_MATCH) { - int nm; - s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, - bns->l_pac, ntbns? ntpac : pacseq, str, &nm); - s->nm = nm; - } - } - free(str->s); free(str); - - // correct for trimmed reads - for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); - - if (!_pacseq) free(pacseq); - free(ntpac); -} - -int64_t pos_end(const bwa_seq_t *p) -{ - if (p->cigar) { - int j; - int64_t x = p->pos; - for (j = 0; j != p->n_cigar; ++j) { - int op = p->cigar[j]>>14; - if (op == 0 || op == 2) x += p->cigar[j]&0x3fff; - } - return x; - } else return p->pos + p->len; -} - -static int64_t pos_5(const bwa_seq_t *p) -{ - if (p->type != BWA_TYPE_NO_MATCH) - return p->strand? pos_end(p) : p->pos; - return -1; -} - -void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) -{ - int j; - if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) { - int seqid, nn, am = 0, flag = p->extra_flag; - char XT; - - if (p->type == BWA_TYPE_NO_MATCH) { - p->pos = mate->pos; - p->strand = mate->strand; - flag |= SAM_FSU; - j = 1; - } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment - - // get seqid - nn = bns_coor_pac2real(bns, p->pos, j, &seqid); - - // update flag and print it - if (p->strand) flag |= SAM_FSR; - if (mate) { - if (mate->type != BWA_TYPE_NO_MATCH) { - if (mate->strand) flag |= SAM_FMR; - } else flag |= SAM_FMU; - } - printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name); - printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ); - - // print CIGAR - if (p->cigar) { - for (j = 0; j != p->n_cigar; ++j) - printf("%d%c", p->cigar[j]&0x3fff, "MIDS"[p->cigar[j]>>14]); - } else if (p->type == BWA_TYPE_NO_MATCH) printf("*"); - else printf("%dM", p->len); - - // print mate coordinate - if (mate && mate->type != BWA_TYPE_NO_MATCH) { - int m_seqid, m_is_N; - long long isize; - am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality - // redundant calculation here, but should not matter too much - m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid); - printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); - isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0; - if (p->type == BWA_TYPE_NO_MATCH) isize = 0; - printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize); - } else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1)); - else printf("\t*\t0\t0\t"); - - // print sequence and quality - if (p->strand == 0) - for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]); - else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]); - putchar('\t'); - if (p->qual) { - if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality - printf("%s", p->qual); - } else printf("*"); - - if (p->type != BWA_TYPE_NO_MATCH) { - // calculate XT tag - XT = "NURM"[p->type]; - if (nn > 10) XT = 'N'; - // print tags - printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm); - if (nn) printf("\tXN:i:%d", nn); - if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am); - if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment - printf("\tX0:i:%d", p->c1); - if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2); - } - printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape); - if (p->md) printf("\tMD:Z:%s", p->md); - } - putchar('\n'); - } else { // this read has no match - ubyte_t *s = p->strand? p->rseq : p->seq; - int flag = p->extra_flag | SAM_FSU; - if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU; - printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); - for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); - putchar('\t'); - if (p->qual) { - if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality - printf("%s", p->qual); - } else printf("*"); - putchar('\n'); - } -} - -bntseq_t *bwa_open_nt(const char *prefix) -{ - bntseq_t *ntbns; - char *str; - str = (char*)calloc(strlen(prefix) + 10, 1); - strcat(strcpy(str, prefix), ".nt"); - ntbns = bns_restore(str); - free(str); - return ntbns; -} - -void bwa_print_sam_SQ(const bntseq_t *bns) -{ - int i; - for (i = 0; i < bns->n_seqs; ++i) - printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); -} - -void bwase_initialize() -{ - int i; - for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); -} - -void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa) -{ - int i, n_seqs, tot_seqs = 0, m_aln; - bwt_aln1_t *aln = 0; - bwa_seq_t *seqs; - bwa_seqio_t *ks; - clock_t t; - bntseq_t *bns, *ntbns = 0; - FILE *fp_sa; - gap_opt_t opt; - - // initialization - bwase_initialize(); - for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); - bns = bns_restore(prefix); - srand48(bns->seed); - ks = bwa_seq_open(fn_fa); - fp_sa = xopen(fn_sa, "r"); - - // core loop - m_aln = 0; - fread(&opt, sizeof(gap_opt_t), 1, fp_sa); - if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac - ntbns = bwa_open_nt(prefix); - bwa_print_sam_SQ(bns); - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { - tot_seqs += n_seqs; - t = clock(); - - // read alignment - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *p = seqs + i; - int n_aln; - fread(&n_aln, 4, 1, fp_sa); - if (n_aln > m_aln) { - m_aln = n_aln; - aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); - } - fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); - bwa_aln2seq(n_aln, aln, p); - } - - fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... "); - bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - - fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); - bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - - fprintf(stderr, "[bwa_aln_core] print alignments... "); - for (i = 0; i < n_seqs; ++i) - bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - - bwa_free_read_seq(n_seqs, seqs); - fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); - } - - // destroy - bwa_seq_close(ks); - if (ntbns) bns_destroy(ntbns); - bns_destroy(bns); - fclose(fp_sa); - free(aln); -} - -static void print_aln_simple(bwt_t *const bwt[2], const bntseq_t *bns, const bwt_aln1_t *q, int len, bwtint_t k) -{ - bwtint_t pos; - int seqid, is_N; - pos = q->a? bwt_sa(bwt[0], k) : bwt[1]->seq_len - (bwt_sa(bwt[1], k) + len); - if (pos > bwt[1]->seq_len) pos = 0; // negative - is_N = bns_coor_pac2real(bns, pos, len, &seqid); - printf("%s\t%c%d\t%d\n", bns->anns[seqid].name, "+-"[q->a], (int)(pos - bns->anns[seqid].offset + 1), - q->n_mm + q->n_gapo + q->n_gape); -} - -void bwa_print_all_hits(const char *prefix, const char *fn_sa, const char *fn_fa, int max_occ) -{ - int i, n_seqs, tot_seqs = 0, m_aln; - bwt_aln1_t *aln = 0; - bwa_seq_t *seqs; - bwa_seqio_t *ks; - bntseq_t *bns; - FILE *fp_sa; - gap_opt_t opt; - bwt_t *bwt[2]; - - bns = bns_restore(prefix); - srand48(bns->seed); - ks = bwa_seq_open(fn_fa); - fp_sa = xopen(fn_sa, "r"); - { // load BWT - char *str = (char*)calloc(strlen(prefix) + 10, 1); - strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]); - strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]); - free(str); - } - - m_aln = 0; - fread(&opt, sizeof(gap_opt_t), 1, fp_sa); - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { - tot_seqs += n_seqs; - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *p = seqs + i; - int n_aln, n_occ, k, rest, len; - len = p->len; - fread(&n_aln, 4, 1, fp_sa); - if (n_aln > m_aln) { - m_aln = n_aln; - aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); - } - fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); - for (k = n_occ = 0; k < n_aln; ++k) { - const bwt_aln1_t *q = aln + k; - n_occ += q->l - q->k + 1; - } - rest = n_occ > max_occ? max_occ : n_occ; - printf(">%s %d %d\n", p->name, rest, n_occ); - for (k = 0; k < n_aln; ++k) { - const bwt_aln1_t *q = aln + k; - if (q->l - q->k + 1 <= rest) { - bwtint_t l; - for (l = q->k; l <= q->l; ++l) - print_aln_simple(bwt, bns, q, len, l); - rest -= q->l - q->k + 1; - } else { // See also: http://code.activestate.com/recipes/272884/ - int j, i, k; - for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) { - double p = 1.0, x = drand48(); - while (x < p) p -= p * j / (i--); - print_aln_simple(bwt, bns, q, len, q->l - i); - } - rest = 0; - break; - } - } - } - bwa_free_read_seq(n_seqs, seqs); - } - bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); - bwa_seq_close(ks); - bns_destroy(bns); - fclose(fp_sa); - free(aln); -} - -int bwa_sai2sam_se(int argc, char *argv[]) -{ - int c, n_occ = 0; - while ((c = getopt(argc, argv, "hn:")) >= 0) { - switch (c) { - case 'h': break; - case 'n': n_occ = atoi(optarg); break; - default: return 1; - } - } - - if (optind + 3 > argc) { - fprintf(stderr, "Usage: bwa samse [-n max_occ] \n"); - return 1; - } - if (n_occ > 0) bwa_print_all_hits(argv[optind], argv[optind+1], argv[optind+2], n_occ); - else bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2]); - return 0; -} diff --git a/c/bwa/bwase.h b/c/bwa/bwase.h deleted file mode 100755 index 54a10b3d7..000000000 --- a/c/bwa/bwase.h +++ /dev/null @@ -1,52 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -#ifndef BWASE_H -#define BWASE_H - -#include "bntseq.h" -#include "bwt.h" -#include "bwtaln.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// Initialize mapping tables in the bwa single-end mapper. -void bwase_initialize(); -// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. -void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr); -// Refine the approximate position of the sequence to an actual placement for the sequence. -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); -// Backfill certain alignment properties mainly centering around number of matches. -void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); -// Calculate the end position of a read given a certain sequence. -int64_t pos_end(const bwa_seq_t *p); - -#ifdef __cplusplus -} -#endif - -#endif // BWASE_H diff --git a/c/bwa/bwtaln.c b/c/bwa/bwtaln.c deleted file mode 100755 index 8894338e0..000000000 --- a/c/bwa/bwtaln.c +++ /dev/null @@ -1,295 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "bwtaln.h" -#include "bwtgap.h" -#include "utils.h" - -#ifdef HAVE_PTHREAD -#define THREAD_BLOCK_SIZE 1024 -#include -static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER; -#endif - -gap_opt_t *gap_init_opt() -{ - gap_opt_t *o; - o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t)); - /* IMPORTANT: s_mm*10 should be about the average base error - rate. Voilating this requirement will break pairing! */ - o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; - o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6; - o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000; - o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD; - o->seed_len = 0x7fffffff; o->max_seed_diff = 2; - o->fnr = 0.04; - o->n_threads = 1; - o->max_top2 = 30; - o->trim_qual = 0; - return o; -} - -int bwa_cal_maxdiff(int l, double err, double thres) -{ - double elambda = exp(-l * err); - double sum, y = 1.0; - int k, x = 1; - for (k = 1, sum = elambda; k < 1000; ++k) { - y *= l * err; - x *= k; - sum += elambda * y / x; - if (1.0 - sum < thres) return k; - } - return 2; -} - -// width must be filled as zero -static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width) -{ - bwtint_t k, l, ok, ol; - int i, bid; - bid = 0; - k = 0; l = rbwt->seq_len; - for (i = 0; i < len; ++i) { - ubyte_t c = str[i]; - if (c < 4) { - bwt_2occ(rbwt, k - 1, l, c, &ok, &ol); - k = rbwt->L2[c] + ok + 1; - l = rbwt->L2[c] + ol; - } - if (k > l || c > 3) { // then restart - k = 0; - l = rbwt->seq_len; - ++bid; - } - width[i].w = l - k + 1; - width[i].bid = bid; - } - width[len].w = 0; - width[len].bid = ++bid; - return bid; -} - -void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) -{ - int i, max_l = 0, max_len; - gap_stack_t *stack; - bwt_width_t *w[2], *seed_w[2]; - const ubyte_t *seq[2]; - gap_opt_t local_opt = *opt; - - // initiate priority stack - for (i = max_len = 0; i != n_seqs; ++i) - if (seqs[i].len > max_len) max_len = seqs[i].len; - if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); - if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; - stack = gap_init_stack(opt->max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); - - seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); - seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); - w[0] = w[1] = 0; - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p = seqs + i; -#ifdef HAVE_PTHREAD - if (opt->n_threads > 1) { - pthread_mutex_lock(&g_seq_lock); - if (p->tid < 0) { // unassigned - int j; - for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j) - seqs[j].tid = tid; - } else if (p->tid != tid) { - pthread_mutex_unlock(&g_seq_lock); - continue; - } - pthread_mutex_unlock(&g_seq_lock); - } -#endif - p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; - seq[0] = p->seq; seq[1] = p->rseq; - if (max_l < p->len) { - max_l = p->len; - w[0] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t)); - w[1] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t)); - } - bwt_cal_width(bwt[0], p->len, seq[0], w[0]); - bwt_cal_width(bwt[1], p->len, seq[1], w[1]); - if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); - local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; - if (p->len > opt->seed_len) { - bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]); - bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]); - } - // core function - p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); - // store the alignment - free(p->name); free(p->seq); free(p->rseq); free(p->qual); - p->name = 0; p->seq = p->rseq = p->qual = 0; - } - free(seed_w[0]); free(seed_w[1]); - free(w[0]); free(w[1]); - gap_destroy_stack(stack); -} - -#ifdef HAVE_PTHREAD -typedef struct { - int tid; - bwt_t *bwt[2]; - int n_seqs; - bwa_seq_t *seqs; - const gap_opt_t *opt; -} thread_aux_t; - -static void *worker(void *data) -{ - thread_aux_t *d = (thread_aux_t*)data; - bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); - return 0; -} -#endif - -void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) -{ - int i, n_seqs, tot_seqs = 0; - bwa_seq_t *seqs; - bwa_seqio_t *ks; - clock_t t; - bwt_t *bwt[2]; - - // initialization - ks = bwa_seq_open(fn_fa); - - { // load BWT - char *str = (char*)calloc(strlen(prefix) + 10, 1); - strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); - free(str); - } - - // core loop - fwrite(opt, sizeof(gap_opt_t), 1, stdout); - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode & BWA_MODE_COMPREAD, opt->trim_qual)) != 0) { - tot_seqs += n_seqs; - t = clock(); - - fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); - -#ifdef HAVE_PTHREAD - if (opt->n_threads <= 1) { // no multi-threading at all - bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); - } else { - pthread_t *tid; - pthread_attr_t attr; - thread_aux_t *data; - int j; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); - tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); - for (j = 0; j < opt->n_threads; ++j) { - data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1]; - data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; - pthread_create(&tid[j], &attr, worker, data + j); - } - for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); - free(data); free(tid); - } -#else - bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); -#endif - - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - - t = clock(); - fprintf(stderr, "[bwa_aln_core] write to the disk... "); - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *p = seqs + i; - fwrite(&p->n_aln, 4, 1, stdout); - if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); - } - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - - bwa_free_read_seq(n_seqs, seqs); - fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); - } - - // destroy - bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); - bwa_seq_close(ks); -} - -int bwa_aln(int argc, char *argv[]) -{ - int c, opte = -1; - gap_opt_t *opt; - - opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:")) >= 0) { - switch (c) { - case 'n': - if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; - else opt->max_diff = atoi(optarg), opt->fnr = -1.0; - break; - case 'o': opt->max_gapo = atoi(optarg); break; - case 'e': opte = atoi(optarg); break; - case 'M': opt->s_mm = atoi(optarg); break; - case 'O': opt->s_gapo = atoi(optarg); break; - case 'E': opt->s_gape = atoi(optarg); break; - case 'd': opt->max_del_occ = atoi(optarg); break; - case 'i': opt->indel_end_skip = atoi(optarg); break; - case 'l': opt->seed_len = atoi(optarg); break; - case 'k': opt->max_seed_diff = atoi(optarg); break; - case 'm': opt->max_entries = atoi(optarg); break; - case 't': opt->n_threads = atoi(optarg); break; - case 'L': opt->mode |= BWA_MODE_LOGGAP; break; - case 'R': opt->max_top2 = atoi(optarg); break; - case 'q': opt->trim_qual = atoi(optarg); break; - case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; - case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; - default: return 1; - } - } - if (opte > 0) { - opt->max_gape = opte; - opt->mode &= ~BWA_MODE_GAPE; - } - - if (optind + 2 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa aln [options] \n\n"); - fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", - BWA_AVG_ERR, opt->fnr); - fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); - fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); - fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); - fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); - fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len); - fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); - fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); - fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); - fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); - fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); - fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); - fprintf(stderr, " -c input sequences are in the color space\n"); - fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); - fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); - fprintf(stderr, "\n"); - return 1; - } - if (opt->fnr > 0.0) { - int i, k; - for (i = 17, k = 0; i <= 250; ++i) { - int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); - if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); - k = l; - } - } - bwa_aln_core(argv[optind], argv[optind+1], opt); - free(opt); - return 0; -} diff --git a/c/bwa/bwtaln.h b/c/bwa/bwtaln.h deleted file mode 100755 index 68c1c4f1f..000000000 --- a/c/bwa/bwtaln.h +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef BWTALN_H -#define BWTALN_H - -#include -#include "bwt.h" - -#define BWA_TYPE_NO_MATCH 0 -#define BWA_TYPE_UNIQUE 1 -#define BWA_TYPE_REPEAT 2 -#define BWA_TYPE_MATESW 3 - -#define SAM_FPD 1 // paired -#define SAM_FPP 2 // properly paired -#define SAM_FSU 4 // self-unmapped -#define SAM_FMU 8 // mate-unmapped -#define SAM_FSR 16 // self on the reverse strand -#define SAM_FMR 32 // mate on the reverse strand -#define SAM_FR1 64 // this is read one -#define SAM_FR2 128 // this is read two -#define SAM_FSC 256 // secondary alignment - -#define BWA_AVG_ERR 0.02 -#define BWA_MIN_RDLEN 35 // for read trimming - -#ifndef bns_pac -#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) -#endif - -typedef struct { - bwtint_t w; - int bid; -} bwt_width_t; - -typedef struct { - uint32_t n_mm:8, n_gapo:8, n_gape:8, a:1; - bwtint_t k, l; - int score; -} bwt_aln1_t; - -typedef struct { - char *name; - ubyte_t *seq, *rseq, *qual; - uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8; - uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8; - int score; - // alignments in SA coordinates - int n_aln; - bwt_aln1_t *aln; - // alignment information - bwtint_t sa, pos; - uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ - int n_cigar; - uint16_t *cigar; - // for multi-threading only - int tid; - // NM and MD tags - uint32_t full_len:20, nm:12; - char *md; -} bwa_seq_t; - -#define BWA_MODE_GAPE 0x01 -#define BWA_MODE_COMPREAD 0x02 -#define BWA_MODE_LOGGAP 0x04 -#define BWA_MODE_NONSTOP 0x10 - -typedef struct { - int s_mm, s_gapo, s_gape; - int mode; - int indel_end_skip, max_del_occ, max_entries; - float fnr; - int max_diff, max_gapo, max_gape; - int max_seed_diff, seed_len; - int n_threads; - int max_top2; - int trim_qual; -} gap_opt_t; - -#define BWA_PET_STD 1 -#define BWA_PET_SOLID 2 - -typedef struct { - int max_isize; - int max_occ; - int type, is_sw; -} pe_opt_t; - -struct __bwa_seqio_t; -typedef struct __bwa_seqio_t bwa_seqio_t; - -#ifdef __cplusplus -extern "C" { -#endif - - gap_opt_t *gap_init_opt(); - void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt); - - bwa_seqio_t *bwa_seq_open(const char *fn); - void bwa_seq_close(bwa_seqio_t *bs); - void seq_reverse(int len, ubyte_t *seq, int is_comp); - bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int is_comp, int trim_qual); - void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs); - - int bwa_cal_maxdiff(int l, double err, double thres); - - void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac); - - void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt); - -#ifdef __cplusplus -} -#endif - -#endif