Temporarily adding changed bwa files to Sting repository until I can pull together the official bwa patch.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1975 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-11-04 21:46:05 +00:00
parent eca0942644
commit 15e80aec3b
6 changed files with 1439 additions and 0 deletions

302
c/bwa/bntseq.c 100755
View File

@ -0,0 +1,302 @@
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Contact: Heng Li <lh3@sanger.ac.uk> */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include "bntseq.h"
#include "main.h"
#include "utils.h"
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
void bns_dump(const bntseq_t *bns, const char *prefix)
{
char str[1024];
FILE *fp;
int i;
{ // dump .ann
strcpy(str, prefix); strcat(str, ".ann");
fp = xopen(str, "w");
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
for (i = 0; i != bns->n_seqs; ++i) {
bntann1_t *p = bns->anns + i;
fprintf(fp, "%d %s", p->gi, p->name);
if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
else fprintf(fp, "\n");
fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
}
fclose(fp);
}
{ // dump .amb
strcpy(str, prefix); strcat(str, ".amb");
fp = xopen(str, "w");
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
for (i = 0; i != bns->n_holes; ++i) {
bntamb1_t *p = bns->ambs + i;
fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
}
fclose(fp);
}
}
bntseq_t *bns_restore(const char *prefix)
{
char ann_filename[1024], amb_filename[1024], pac_filename[1024];
strcpy(ann_filename,prefix);
strcat(ann_filename,".ann");
strcpy(amb_filename,prefix);
strcat(amb_filename,".amb");
strcpy(pac_filename,prefix);
strcat(pac_filename,".pac");
return bns_restore_core(ann_filename, amb_filename, pac_filename);
}
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
{
char buffer[1024];
FILE *fp;
bntseq_t *bns;
int i;
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
{ // read .ann
fp = xopen(ann_filename, "r");
fscanf(fp, "%lld%d%u", &bns->l_pac, &bns->n_seqs, &bns->seed);
bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
for (i = 0; i < bns->n_seqs; ++i) {
bntann1_t *p = bns->anns + i;
char *q = buffer;
int c;
// read gi and sequence name
fscanf(fp, "%u%s", &p->gi, buffer);
p->name = strdup(buffer);
// read fasta comments
while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
*q = 0;
if (q - buffer > 1) p->anno = strdup(buffer + 1); // skip leading space
else p->anno = strdup("");
// read the rest
fscanf(fp, "%lld%d%d", &p->offset, &p->len, &p->n_ambs);
}
fclose(fp);
}
{ // read .amb
int64_t l_pac;
int32_t n_seqs;
fp = xopen(amb_filename, "r");
fscanf(fp, "%lld%d%d", &l_pac, &n_seqs, &bns->n_holes);
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
for (i = 0; i < bns->n_holes; ++i) {
bntamb1_t *p = bns->ambs + i;
fscanf(fp, "%lld%d%s", &p->offset, &p->len, buffer);
p->amb = buffer[0];
}
fclose(fp);
}
{ // open .pac
bns->fp_pac = xopen(pac_filename, "rb");
}
return bns;
}
void bns_destroy(bntseq_t *bns)
{
if (bns == 0) return;
else {
int i;
if (bns->fp_pac) fclose(bns->fp_pac);
free(bns->ambs);
for (i = 0; i < bns->n_seqs; ++i) {
free(bns->anns[i].name);
free(bns->anns[i].anno);
}
free(bns->anns);
free(bns);
}
}
void bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
{
kseq_t *seq;
char name[1024];
bntseq_t *bns;
bntamb1_t *q;
int l_buf;
unsigned char buf[0x10000];
int32_t m_seqs, m_holes, l, i;
FILE *fp;
// initialization
seq = kseq_init(fp_fa);
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
bns->seed = 11; // fixed seed for random generator
srand48(bns->seed);
m_seqs = m_holes = 8;
bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
q = bns->ambs;
l_buf = 0;
strcpy(name, prefix); strcat(name, ".pac");
fp = xopen(name, "wb");
memset(buf, 0, 0x10000);
// read sequences
while ((l = kseq_read(seq)) >= 0) {
bntann1_t *p;
int lasts;
if (bns->n_seqs == m_seqs) {
m_seqs <<= 1;
bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t));
}
p = bns->anns + bns->n_seqs;
p->name = strdup((char*)seq->name.s);
p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
p->gi = 0; p->len = l;
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
p->n_ambs = 0;
for (i = 0, lasts = 0; i < l; ++i) {
int c = nst_nt4_table[(int)seq->seq.s[i]];
if (c >= 4) { // N
if (lasts == seq->seq.s[i]) { // contiguous N
++q->len;
} else {
if (bns->n_holes == m_holes) {
m_holes <<= 1;
bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t));
}
q = bns->ambs + bns->n_holes;
q->len = 1;
q->offset = p->offset + i;
q->amb = seq->seq.s[i];
++p->n_ambs;
++bns->n_holes;
}
}
lasts = seq->seq.s[i];
{ // fill buffer
if (c >= 4) c = lrand48()&0x3;
if (l_buf == 0x40000) {
fwrite(buf, 1, 0x10000, fp);
memset(buf, 0, 0x10000);
l_buf = 0;
}
buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1);
++l_buf;
}
}
++bns->n_seqs;
bns->l_pac += seq->seq.l;
}
xassert(bns->l_pac, "zero length sequence.");
{ // finalize .pac file
ubyte_t ct;
fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp);
// the following codes make the pac file size always (l_pac/4+1+1)
if (bns->l_pac % 4 == 0) {
ct = 0;
fwrite(&ct, 1, 1, fp);
}
ct = bns->l_pac % 4;
fwrite(&ct, 1, 1, fp);
// close .pac file
fclose(fp);
}
bns_dump(bns, prefix);
bns_destroy(bns);
kseq_destroy(seq);
}
int bwa_fa2pac(int argc, char *argv[])
{
gzFile fp;
if (argc < 2) {
fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n");
return 1;
}
fp = xzopen(argv[1], "r");
bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]);
gzclose(fp);
return 0;
}
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq)
{
int left, mid, right, nn;
if (pac_coor >= bns->l_pac)
err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac);
// binary search for the sequence ID. Note that this is a bit different from the following one...
left = 0; mid = 0; right = bns->n_seqs;
while (left < right) {
mid = (left + right) >> 1;
if (pac_coor >= bns->anns[mid].offset) {
if (mid == bns->n_seqs - 1) break;
if (pac_coor < bns->anns[mid+1].offset) break;
left = mid + 1;
} else right = mid;
}
*real_seq = mid;
// binary search for holes
left = 0; right = bns->n_holes; nn = 0;
while (left < right) {
int64_t mid = (left + right) >> 1;
if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
else if (pac_coor + len <= bns->ambs[mid].offset) right = mid;
else { // overlap
if (pac_coor >= bns->ambs[mid].offset) {
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len;
} else {
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor);
}
break;
}
}
return nn;
}

80
c/bwa/bntseq.h 100755
View File

@ -0,0 +1,80 @@
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Contact: Heng Li <lh3@sanger.ac.uk> */
#ifndef BWT_BNTSEQ_H
#define BWT_BNTSEQ_H
#include <stdint.h>
#include <zlib.h>
#ifndef BWA_UBYTE
#define BWA_UBYTE
typedef uint8_t ubyte_t;
#endif
typedef struct {
int64_t offset;
int32_t len;
int32_t n_ambs;
uint32_t gi;
char *name, *anno;
} bntann1_t;
typedef struct {
int64_t offset;
int32_t len;
char amb;
} bntamb1_t;
typedef struct {
int64_t l_pac;
int32_t n_seqs;
uint32_t seed;
bntann1_t *anns; // n_seqs elements
int32_t n_holes;
bntamb1_t *ambs; // n_holes elements
FILE *fp_pac;
} bntseq_t;
extern unsigned char nst_nt4_table[256];
#ifdef __cplusplus
extern "C" {
#endif
void bns_dump(const bntseq_t *bns, const char *prefix);
bntseq_t *bns_restore(const char *prefix);
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
void bns_destroy(bntseq_t *bns);
void bns_fasta2bntseq(gzFile fp_fa, const char *prefix);
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq);
#ifdef __cplusplus
}
#endif
#endif

597
c/bwa/bwase.c 100755
View File

@ -0,0 +1,597 @@
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "bwase.h"
#include "stdaln.h"
#include "bntseq.h"
#include "utils.h"
#include "kstring.h"
static int g_log_n[256];
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
{
int i, cnt, best;
if (n_aln == 0) {
s->type = BWA_TYPE_NO_MATCH;
s->c1 = s->c2 = 0;
return;
}
best = aln[0].score;
for (i = cnt = 0; i < n_aln; ++i) {
const bwt_aln1_t *p = aln + i;
if (p->score > best) break;
if (drand48() * (p->l - p->k + 1) > (double)cnt) {
s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a;
s->score = p->score;
s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
}
cnt += p->l - p->k + 1;
}
s->c1 = cnt;
for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1;
s->c2 = cnt - s->c1;
s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
}
int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
{
int n;
if (p->c1 == 0) return 23;
if (p->c1 > 1) return 0;
if (p->n_mm == mm) return 25;
if (p->c2 == 0) return 37;
n = (p->c2 >= 255)? 255 : p->c2;
return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
}
void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
{
int i;
char str[1024];
bwt_t *bwt;
// load forward SA
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *p = seqs + i;
if(p->strand) // reverse strand only
bwa_cal_pac_pos_core(bwt, NULL, p, max_mm, fnr);
}
bwt_destroy(bwt);
// load reverse BWT and SA
strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt);
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *p = seqs + i;
if(!p->strand) // forward strand only
bwa_cal_pac_pos_core(NULL, bwt, p, max_mm, fnr);
}
bwt_destroy(bwt);
}
/**
* Derive the actual position in the read from the given suffix array coordinates.
* Note that the position will be approximate based on whether indels appear in the
* read and whether calculations are performed from the start or end of the read.
*/
void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr) {
if(seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT)
return;
int max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
if(seq->strand) { // reverse strand only
seq->pos = bwt_sa(forward_bwt, seq->sa);
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
}
else { // forward strand only
/* NB: For gapped alignment, p->pos may not be correct,
* which will be fixed in refine_gapped_core(). This
* line also determines the way "x" is calculated in
* refine_gapped_core() when (ext < 0 && is_end == 0). */
seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len);
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
}
}
/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on
* forward strand. This happens when p->pos is calculated by
* bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
* coordinate. This happens only for color-converted alignment. */
static uint16_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
int ext, int *n_cigar, int is_end_correct)
{
uint16_t *cigar = 0;
ubyte_t *ref_seq;
int l = 0, path_len, ref_len;
AlnParam ap = aln_param_bwa;
path_t *path;
int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos;
ref_len = len + abs(ext);
if (ext > 0) {
ref_seq = (ubyte_t*)calloc(ref_len, 1);
for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
} else {
int64_t x = __pos + (is_end_correct? len : ref_len);
ref_seq = (ubyte_t*)calloc(ref_len, 1);
for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
}
path = (path_t*)calloc(l+len, sizeof(path_t));
aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
cigar = aln_path2cigar(path, path_len, n_cigar);
if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand
for (l = k = 0; k < *n_cigar; ++k) {
if (cigar[k]>>14 == FROM_D) l -= cigar[k]&0x3fff;
else if (cigar[k]>>14 == FROM_I) l += cigar[k]&0x3fff;
}
__pos += l;
}
if (cigar[0]>>14 == FROM_D) { // deletion at the 5'-end
__pos += cigar[0]&0x3fff;
for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
--(*n_cigar);
}
if (cigar[*n_cigar-1]>>14 == FROM_D) --(*n_cigar); // deletion at the 3'-end
// change "I" at either end of the read to S. just in case. This should rarely happen...
if (cigar[*n_cigar-1]>>14 == FROM_I) cigar[*n_cigar-1] = 3<<14 | (cigar[*n_cigar-1]&0x3fff);
if (cigar[0]>>14 == FROM_I) cigar[0] = 3<<14 | (cigar[0]&0x3fff);
*_pos = (bwtint_t)__pos;
free(ref_seq); free(path);
return cigar;
}
char *bwa_cal_md1(int n_cigar, uint16_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
{
bwtint_t x, y;
int z, u, c, nm = 0;
str->l = 0; // reset
x = pos; y = 0;
if (cigar) {
int k, l;
for (k = u = 0; k < n_cigar; ++k) {
l = cigar[k]&0x3fff;
if (cigar[k]>>14 == FROM_M) {
for (z = 0; z < l && x+z < l_pac; ++z) {
c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
ksprintf(str, "%d", u);
kputc("ACGTN"[c], str);
++nm;
u = 0;
} else ++u;
}
x += l; y += l;
} else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) {
y += l; nm += l;
} else if (cigar[k]>>14 == FROM_D) {
ksprintf(str, "%d", u);
kputc('^', str);
for (z = 0; z < l && x+z < l_pac; ++z)
kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
u = 0;
x += l; nm += l;
}
}
} else { // no gaps
for (z = u = 0; z < (bwtint_t)len; ++z) {
c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
ksprintf(str, "%d", u);
kputc("ACGTN"[c], str);
++nm;
u = 0;
} else ++u;
}
}
ksprintf(str, "%d", u);
*_nm = nm;
return strdup(str->s);
}
void bwa_correct_trimmed(bwa_seq_t *s)
{
if (s->len == s->full_len) return;
if (s->strand == 0) { // forward
if (s->cigar && s->cigar[s->n_cigar-1]>>14 == 3) { // the last is S
s->cigar[s->n_cigar-1] += s->full_len - s->len;
} else {
if (s->cigar == 0) {
s->n_cigar = 2;
s->cigar = calloc(s->n_cigar, 2);
s->cigar[0] = 0<<14 | s->len;
} else {
++s->n_cigar;
s->cigar = realloc(s->cigar, s->n_cigar * 2);
}
s->cigar[s->n_cigar-1] = 3<<14 | (s->full_len - s->len);
}
} else { // reverse
if (s->cigar && s->cigar[0]>>14 == 3) { // the first is S
s->cigar[0] += s->full_len - s->len;
} else {
if (s->cigar == 0) {
s->n_cigar = 2;
s->cigar = calloc(s->n_cigar, 2);
s->cigar[1] = 0<<14 | s->len;
} else {
++s->n_cigar;
s->cigar = realloc(s->cigar, s->n_cigar * 2);
memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * 2);
}
s->cigar[0] = 3<<14 | (s->full_len - s->len);
}
}
s->len = s->full_len;
}
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
{
ubyte_t *pacseq, *ntpac = 0;
int i;
kstring_t *str;
if (ntbns) { // in color space
ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
rewind(ntbns->fp_pac);
fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
}
if (!_pacseq) {
pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
rewind(bns->fp_pac);
fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
} else pacseq = _pacseq;
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *s = seqs + i;
seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
}
if (ntbns) { // in color space
for (i = 0; i < n_seqs; ++i) {
bwa_seq_t *s = seqs + i;
bwa_cs2nt_core(s, bns->l_pac, ntpac);
if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
free(s->cigar);
s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
}
}
}
// generate MD tag
str = (kstring_t*)calloc(1, sizeof(kstring_t));
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *s = seqs + i;
if (s->type != BWA_TYPE_NO_MATCH) {
int nm;
s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
s->nm = nm;
}
}
free(str->s); free(str);
// correct for trimmed reads
for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
if (!_pacseq) free(pacseq);
free(ntpac);
}
int64_t pos_end(const bwa_seq_t *p)
{
if (p->cigar) {
int j;
int64_t x = p->pos;
for (j = 0; j != p->n_cigar; ++j) {
int op = p->cigar[j]>>14;
if (op == 0 || op == 2) x += p->cigar[j]&0x3fff;
}
return x;
} else return p->pos + p->len;
}
static int64_t pos_5(const bwa_seq_t *p)
{
if (p->type != BWA_TYPE_NO_MATCH)
return p->strand? pos_end(p) : p->pos;
return -1;
}
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
{
int j;
if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
int seqid, nn, am = 0, flag = p->extra_flag;
char XT;
if (p->type == BWA_TYPE_NO_MATCH) {
p->pos = mate->pos;
p->strand = mate->strand;
flag |= SAM_FSU;
j = 1;
} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
// get seqid
nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
// update flag and print it
if (p->strand) flag |= SAM_FSR;
if (mate) {
if (mate->type != BWA_TYPE_NO_MATCH) {
if (mate->strand) flag |= SAM_FMR;
} else flag |= SAM_FMU;
}
printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);
// print CIGAR
if (p->cigar) {
for (j = 0; j != p->n_cigar; ++j)
printf("%d%c", p->cigar[j]&0x3fff, "MIDS"[p->cigar[j]>>14]);
} else if (p->type == BWA_TYPE_NO_MATCH) printf("*");
else printf("%dM", p->len);
// print mate coordinate
if (mate && mate->type != BWA_TYPE_NO_MATCH) {
int m_seqid, m_is_N;
long long isize;
am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
// redundant calculation here, but should not matter too much
m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
} else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
else printf("\t*\t0\t0\t");
// print sequence and quality
if (p->strand == 0)
for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]);
else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
putchar('\t');
if (p->qual) {
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
printf("%s", p->qual);
} else printf("*");
if (p->type != BWA_TYPE_NO_MATCH) {
// calculate XT tag
XT = "NURM"[p->type];
if (nn > 10) XT = 'N';
// print tags
printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
if (nn) printf("\tXN:i:%d", nn);
if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
printf("\tX0:i:%d", p->c1);
if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2);
}
printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
if (p->md) printf("\tMD:Z:%s", p->md);
}
putchar('\n');
} else { // this read has no match
ubyte_t *s = p->strand? p->rseq : p->seq;
int flag = p->extra_flag | SAM_FSU;
if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
putchar('\t');
if (p->qual) {
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
printf("%s", p->qual);
} else printf("*");
putchar('\n');
}
}
bntseq_t *bwa_open_nt(const char *prefix)
{
bntseq_t *ntbns;
char *str;
str = (char*)calloc(strlen(prefix) + 10, 1);
strcat(strcpy(str, prefix), ".nt");
ntbns = bns_restore(str);
free(str);
return ntbns;
}
void bwa_print_sam_SQ(const bntseq_t *bns)
{
int i;
for (i = 0; i < bns->n_seqs; ++i)
printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
}
void bwase_initialize()
{
int i;
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
}
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa)
{
int i, n_seqs, tot_seqs = 0, m_aln;
bwt_aln1_t *aln = 0;
bwa_seq_t *seqs;
bwa_seqio_t *ks;
clock_t t;
bntseq_t *bns, *ntbns = 0;
FILE *fp_sa;
gap_opt_t opt;
// initialization
bwase_initialize();
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
bns = bns_restore(prefix);
srand48(bns->seed);
ks = bwa_seq_open(fn_fa);
fp_sa = xopen(fn_sa, "r");
// core loop
m_aln = 0;
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
ntbns = bwa_open_nt(prefix);
bwa_print_sam_SQ(bns);
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) {
tot_seqs += n_seqs;
t = clock();
// read alignment
for (i = 0; i < n_seqs; ++i) {
bwa_seq_t *p = seqs + i;
int n_aln;
fread(&n_aln, 4, 1, fp_sa);
if (n_aln > m_aln) {
m_aln = n_aln;
aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
}
fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
bwa_aln2seq(n_aln, aln, p);
}
fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_aln_core] print alignments... ");
for (i = 0; i < n_seqs; ++i)
bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
bwa_free_read_seq(n_seqs, seqs);
fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
}
// destroy
bwa_seq_close(ks);
if (ntbns) bns_destroy(ntbns);
bns_destroy(bns);
fclose(fp_sa);
free(aln);
}
static void print_aln_simple(bwt_t *const bwt[2], const bntseq_t *bns, const bwt_aln1_t *q, int len, bwtint_t k)
{
bwtint_t pos;
int seqid, is_N;
pos = q->a? bwt_sa(bwt[0], k) : bwt[1]->seq_len - (bwt_sa(bwt[1], k) + len);
if (pos > bwt[1]->seq_len) pos = 0; // negative
is_N = bns_coor_pac2real(bns, pos, len, &seqid);
printf("%s\t%c%d\t%d\n", bns->anns[seqid].name, "+-"[q->a], (int)(pos - bns->anns[seqid].offset + 1),
q->n_mm + q->n_gapo + q->n_gape);
}
void bwa_print_all_hits(const char *prefix, const char *fn_sa, const char *fn_fa, int max_occ)
{
int i, n_seqs, tot_seqs = 0, m_aln;
bwt_aln1_t *aln = 0;
bwa_seq_t *seqs;
bwa_seqio_t *ks;
bntseq_t *bns;
FILE *fp_sa;
gap_opt_t opt;
bwt_t *bwt[2];
bns = bns_restore(prefix);
srand48(bns->seed);
ks = bwa_seq_open(fn_fa);
fp_sa = xopen(fn_sa, "r");
{ // load BWT
char *str = (char*)calloc(strlen(prefix) + 10, 1);
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
free(str);
}
m_aln = 0;
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) {
tot_seqs += n_seqs;
for (i = 0; i < n_seqs; ++i) {
bwa_seq_t *p = seqs + i;
int n_aln, n_occ, k, rest, len;
len = p->len;
fread(&n_aln, 4, 1, fp_sa);
if (n_aln > m_aln) {
m_aln = n_aln;
aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
}
fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
for (k = n_occ = 0; k < n_aln; ++k) {
const bwt_aln1_t *q = aln + k;
n_occ += q->l - q->k + 1;
}
rest = n_occ > max_occ? max_occ : n_occ;
printf(">%s %d %d\n", p->name, rest, n_occ);
for (k = 0; k < n_aln; ++k) {
const bwt_aln1_t *q = aln + k;
if (q->l - q->k + 1 <= rest) {
bwtint_t l;
for (l = q->k; l <= q->l; ++l)
print_aln_simple(bwt, bns, q, len, l);
rest -= q->l - q->k + 1;
} else { // See also: http://code.activestate.com/recipes/272884/
int j, i, k;
for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
double p = 1.0, x = drand48();
while (x < p) p -= p * j / (i--);
print_aln_simple(bwt, bns, q, len, q->l - i);
}
rest = 0;
break;
}
}
}
bwa_free_read_seq(n_seqs, seqs);
}
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
bwa_seq_close(ks);
bns_destroy(bns);
fclose(fp_sa);
free(aln);
}
int bwa_sai2sam_se(int argc, char *argv[])
{
int c, n_occ = 0;
while ((c = getopt(argc, argv, "hn:")) >= 0) {
switch (c) {
case 'h': break;
case 'n': n_occ = atoi(optarg); break;
default: return 1;
}
}
if (optind + 3 > argc) {
fprintf(stderr, "Usage: bwa samse [-n max_occ] <prefix> <in.sai> <in.fq>\n");
return 1;
}
if (n_occ > 0) bwa_print_all_hits(argv[optind], argv[optind+1], argv[optind+2], n_occ);
else bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2]);
return 0;
}

52
c/bwa/bwase.h 100755
View File

@ -0,0 +1,52 @@
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef BWASE_H
#define BWASE_H
#include "bntseq.h"
#include "bwt.h"
#include "bwtaln.h"
#ifdef __cplusplus
extern "C" {
#endif
// Initialize mapping tables in the bwa single-end mapper.
void bwase_initialize();
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
// Refine the approximate position of the sequence to an actual placement for the sequence.
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
// Backfill certain alignment properties mainly centering around number of matches.
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
// Calculate the end position of a read given a certain sequence.
int64_t pos_end(const bwa_seq_t *p);
#ifdef __cplusplus
}
#endif
#endif // BWASE_H

295
c/bwa/bwtaln.c 100755
View File

@ -0,0 +1,295 @@
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdint.h>
#include "bwtaln.h"
#include "bwtgap.h"
#include "utils.h"
#ifdef HAVE_PTHREAD
#define THREAD_BLOCK_SIZE 1024
#include <pthread.h>
static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER;
#endif
gap_opt_t *gap_init_opt()
{
gap_opt_t *o;
o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
/* IMPORTANT: s_mm*10 should be about the average base error
rate. Voilating this requirement will break pairing! */
o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
o->seed_len = 0x7fffffff; o->max_seed_diff = 2;
o->fnr = 0.04;
o->n_threads = 1;
o->max_top2 = 30;
o->trim_qual = 0;
return o;
}
int bwa_cal_maxdiff(int l, double err, double thres)
{
double elambda = exp(-l * err);
double sum, y = 1.0;
int k, x = 1;
for (k = 1, sum = elambda; k < 1000; ++k) {
y *= l * err;
x *= k;
sum += elambda * y / x;
if (1.0 - sum < thres) return k;
}
return 2;
}
// width must be filled as zero
static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width)
{
bwtint_t k, l, ok, ol;
int i, bid;
bid = 0;
k = 0; l = rbwt->seq_len;
for (i = 0; i < len; ++i) {
ubyte_t c = str[i];
if (c < 4) {
bwt_2occ(rbwt, k - 1, l, c, &ok, &ol);
k = rbwt->L2[c] + ok + 1;
l = rbwt->L2[c] + ol;
}
if (k > l || c > 3) { // then restart
k = 0;
l = rbwt->seq_len;
++bid;
}
width[i].w = l - k + 1;
width[i].bid = bid;
}
width[len].w = 0;
width[len].bid = ++bid;
return bid;
}
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
{
int i, max_l = 0, max_len;
gap_stack_t *stack;
bwt_width_t *w[2], *seed_w[2];
const ubyte_t *seq[2];
gap_opt_t local_opt = *opt;
// initiate priority stack
for (i = max_len = 0; i != n_seqs; ++i)
if (seqs[i].len > max_len) max_len = seqs[i].len;
if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr);
if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
stack = gap_init_stack(opt->max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
w[0] = w[1] = 0;
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *p = seqs + i;
#ifdef HAVE_PTHREAD
if (opt->n_threads > 1) {
pthread_mutex_lock(&g_seq_lock);
if (p->tid < 0) { // unassigned
int j;
for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j)
seqs[j].tid = tid;
} else if (p->tid != tid) {
pthread_mutex_unlock(&g_seq_lock);
continue;
}
pthread_mutex_unlock(&g_seq_lock);
}
#endif
p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
seq[0] = p->seq; seq[1] = p->rseq;
if (max_l < p->len) {
max_l = p->len;
w[0] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t));
w[1] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t));
}
bwt_cal_width(bwt[0], p->len, seq[0], w[0]);
bwt_cal_width(bwt[1], p->len, seq[1], w[1]);
if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
if (p->len > opt->seed_len) {
bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]);
bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]);
}
// core function
p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
// store the alignment
free(p->name); free(p->seq); free(p->rseq); free(p->qual);
p->name = 0; p->seq = p->rseq = p->qual = 0;
}
free(seed_w[0]); free(seed_w[1]);
free(w[0]); free(w[1]);
gap_destroy_stack(stack);
}
#ifdef HAVE_PTHREAD
typedef struct {
int tid;
bwt_t *bwt[2];
int n_seqs;
bwa_seq_t *seqs;
const gap_opt_t *opt;
} thread_aux_t;
static void *worker(void *data)
{
thread_aux_t *d = (thread_aux_t*)data;
bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt);
return 0;
}
#endif
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
{
int i, n_seqs, tot_seqs = 0;
bwa_seq_t *seqs;
bwa_seqio_t *ks;
clock_t t;
bwt_t *bwt[2];
// initialization
ks = bwa_seq_open(fn_fa);
{ // load BWT
char *str = (char*)calloc(strlen(prefix) + 10, 1);
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
free(str);
}
// core loop
fwrite(opt, sizeof(gap_opt_t), 1, stdout);
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode & BWA_MODE_COMPREAD, opt->trim_qual)) != 0) {
tot_seqs += n_seqs;
t = clock();
fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
#ifdef HAVE_PTHREAD
if (opt->n_threads <= 1) { // no multi-threading at all
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
} else {
pthread_t *tid;
pthread_attr_t attr;
thread_aux_t *data;
int j;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
for (j = 0; j < opt->n_threads; ++j) {
data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1];
data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
pthread_create(&tid[j], &attr, worker, data + j);
}
for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
free(data); free(tid);
}
#else
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
#endif
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
t = clock();
fprintf(stderr, "[bwa_aln_core] write to the disk... ");
for (i = 0; i < n_seqs; ++i) {
bwa_seq_t *p = seqs + i;
fwrite(&p->n_aln, 4, 1, stdout);
if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
}
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
bwa_free_read_seq(n_seqs, seqs);
fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
}
// destroy
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
bwa_seq_close(ks);
}
int bwa_aln(int argc, char *argv[])
{
int c, opte = -1;
gap_opt_t *opt;
opt = gap_init_opt();
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:")) >= 0) {
switch (c) {
case 'n':
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
break;
case 'o': opt->max_gapo = atoi(optarg); break;
case 'e': opte = atoi(optarg); break;
case 'M': opt->s_mm = atoi(optarg); break;
case 'O': opt->s_gapo = atoi(optarg); break;
case 'E': opt->s_gape = atoi(optarg); break;
case 'd': opt->max_del_occ = atoi(optarg); break;
case 'i': opt->indel_end_skip = atoi(optarg); break;
case 'l': opt->seed_len = atoi(optarg); break;
case 'k': opt->max_seed_diff = atoi(optarg); break;
case 'm': opt->max_entries = atoi(optarg); break;
case 't': opt->n_threads = atoi(optarg); break;
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
case 'R': opt->max_top2 = atoi(optarg); break;
case 'q': opt->trim_qual = atoi(optarg); break;
case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
default: return 1;
}
}
if (opte > 0) {
opt->max_gape = opte;
opt->mode &= ~BWA_MODE_GAPE;
}
if (optind + 2 > argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
BWA_AVG_ERR, opt->fnr);
fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
fprintf(stderr, " -c input sequences are in the color space\n");
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
fprintf(stderr, "\n");
return 1;
}
if (opt->fnr > 0.0) {
int i, k;
for (i = 17, k = 0; i <= 250; ++i) {
int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
k = l;
}
}
bwa_aln_core(argv[optind], argv[optind+1], opt);
free(opt);
return 0;
}

113
c/bwa/bwtaln.h 100755
View File

@ -0,0 +1,113 @@
#ifndef BWTALN_H
#define BWTALN_H
#include <stdint.h>
#include "bwt.h"
#define BWA_TYPE_NO_MATCH 0
#define BWA_TYPE_UNIQUE 1
#define BWA_TYPE_REPEAT 2
#define BWA_TYPE_MATESW 3
#define SAM_FPD 1 // paired
#define SAM_FPP 2 // properly paired
#define SAM_FSU 4 // self-unmapped
#define SAM_FMU 8 // mate-unmapped
#define SAM_FSR 16 // self on the reverse strand
#define SAM_FMR 32 // mate on the reverse strand
#define SAM_FR1 64 // this is read one
#define SAM_FR2 128 // this is read two
#define SAM_FSC 256 // secondary alignment
#define BWA_AVG_ERR 0.02
#define BWA_MIN_RDLEN 35 // for read trimming
#ifndef bns_pac
#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3)
#endif
typedef struct {
bwtint_t w;
int bid;
} bwt_width_t;
typedef struct {
uint32_t n_mm:8, n_gapo:8, n_gape:8, a:1;
bwtint_t k, l;
int score;
} bwt_aln1_t;
typedef struct {
char *name;
ubyte_t *seq, *rseq, *qual;
uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8;
uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8;
int score;
// alignments in SA coordinates
int n_aln;
bwt_aln1_t *aln;
// alignment information
bwtint_t sa, pos;
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
int n_cigar;
uint16_t *cigar;
// for multi-threading only
int tid;
// NM and MD tags
uint32_t full_len:20, nm:12;
char *md;
} bwa_seq_t;
#define BWA_MODE_GAPE 0x01
#define BWA_MODE_COMPREAD 0x02
#define BWA_MODE_LOGGAP 0x04
#define BWA_MODE_NONSTOP 0x10
typedef struct {
int s_mm, s_gapo, s_gape;
int mode;
int indel_end_skip, max_del_occ, max_entries;
float fnr;
int max_diff, max_gapo, max_gape;
int max_seed_diff, seed_len;
int n_threads;
int max_top2;
int trim_qual;
} gap_opt_t;
#define BWA_PET_STD 1
#define BWA_PET_SOLID 2
typedef struct {
int max_isize;
int max_occ;
int type, is_sw;
} pe_opt_t;
struct __bwa_seqio_t;
typedef struct __bwa_seqio_t bwa_seqio_t;
#ifdef __cplusplus
extern "C" {
#endif
gap_opt_t *gap_init_opt();
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt);
bwa_seqio_t *bwa_seq_open(const char *fn);
void bwa_seq_close(bwa_seqio_t *bs);
void seq_reverse(int len, ubyte_t *seq, int is_comp);
bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int is_comp, int trim_qual);
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
int bwa_cal_maxdiff(int l, double err, double thres);
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
#ifdef __cplusplus
}
#endif
#endif