fast-bwa/bwa.c

240 lines
6.4 KiB
C
Raw Normal View History

2013-02-24 05:41:44 +08:00
#include <string.h>
2013-02-24 04:30:46 +08:00
#include <stdio.h>
#include <zlib.h>
#include "bntseq.h"
#include "bwa.h"
#include "ksw.h"
2013-02-24 05:41:44 +08:00
#include "utils.h"
2013-02-24 04:30:46 +08:00
2013-02-24 05:10:48 +08:00
int bwa_verbose = 3;
2013-02-24 05:41:44 +08:00
char bwa_rg_id[256];
2013-02-24 05:10:48 +08:00
2013-02-24 04:30:46 +08:00
/************************
* Batch FASTA/Q reader *
************************/
#include "kseq.h"
KSEQ_DECLARE(gzFile)
static inline void trim_readno(kstring_t *s)
{
if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
s->l -= 2, s->s[s->l] = 0;
}
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
s->name = strdup(ks->name.s);
2013-02-24 05:57:34 +08:00
s->comment = ks->comment.l? strdup(ks->comment.s) : 0;
2013-02-24 04:30:46 +08:00
s->seq = strdup(ks->seq.s);
s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
s->l_seq = strlen(s->seq);
}
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
{
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
int size = 0, m, n;
bseq1_t *seqs;
m = n = 0; seqs = 0;
while (kseq_read(ks) >= 0) {
if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
break;
}
if (n >= m) {
m = m? m<<1 : 256;
seqs = realloc(seqs, m * sizeof(bseq1_t));
}
trim_readno(&ks->name);
kseq2bseq1(ks, &seqs[n]);
size += seqs[n++].l_seq;
if (ks2) {
trim_readno(&ks2->name);
kseq2bseq1(ks2, &seqs[n]);
size += seqs[n++].l_seq;
}
if (size >= chunk_size) break;
}
if (size == 0) { // test if the 2nd file is finished
if (ks2 && kseq_read(ks2) >= 0)
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
}
*n_ = n;
return seqs;
}
// Generate CIGAR when the alignment end points are known
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar)
{
uint32_t *cigar = 0;
uint8_t tmp, *rseq;
int i, w;
int64_t rlen;
*n_cigar = 0;
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
for (i = 0; i < l_query>>1; ++i)
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
for (i = 0; i < rlen>>1; ++i)
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
}
//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
// set the band-width
w = (int)((double)(l_query * mat[0] - q) / r + 1.);
w = w < 1? w : 1;
w = w < w_? w : w_;
w += abs(rlen - l_query);
// NW alignment
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
if (rb >= l_pac) // reverse back query
for (i = 0; i < l_query>>1; ++i)
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
ret_gen_cigar:
free(rseq);
return cigar;
}
2013-02-24 04:55:55 +08:00
/*********************
* Full index reader *
*********************/
2013-02-24 05:10:48 +08:00
char *bwa_idx_infer_prefix(const char *hint)
{
char *prefix;
int l_hint;
FILE *fp;
l_hint = strlen(hint);
prefix = malloc(l_hint + 3 + 4 + 1);
strcpy(prefix, hint);
strcpy(prefix + l_hint, ".64.bwt");
if ((fp = fopen(prefix, "rb")) != 0) {
fclose(fp);
prefix[l_hint + 3] = 0;
return prefix;
} else {
strcpy(prefix + l_hint, ".bwt");
if ((fp = fopen(prefix, "rb")) == 0) {
free(prefix);
return 0;
} else {
fclose(fp);
prefix[l_hint] = 0;
return prefix;
}
}
}
bwt_t *bwa_idx_load_bwt(const char *hint)
{
char *tmp, *prefix;
bwt_t *bwt;
prefix = bwa_idx_infer_prefix(hint);
if (prefix == 0) {
2013-02-24 05:41:44 +08:00
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
2013-02-24 05:10:48 +08:00
return 0;
}
tmp = calloc(strlen(prefix) + 5, 1);
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
bwt_restore_sa(tmp, bwt);
free(tmp); free(prefix);
return bwt;
}
bwaidx_t *bwa_idx_load(const char *hint, int which)
2013-02-24 04:55:55 +08:00
{
bwaidx_t *idx;
2013-02-24 05:10:48 +08:00
char *prefix;
prefix = bwa_idx_infer_prefix(hint);
2013-02-24 05:41:44 +08:00
if (prefix == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
return 0;
}
2013-02-24 04:55:55 +08:00
idx = calloc(1, sizeof(bwaidx_t));
2013-02-24 05:10:48 +08:00
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
2013-02-24 04:55:55 +08:00
if (which & BWA_IDX_BNS) {
idx->bns = bns_restore(prefix);
if (which & BWA_IDX_PAC) {
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
2013-02-24 05:10:48 +08:00
fclose(idx->bns->fp_pac);
idx->bns->fp_pac = 0;
2013-02-24 04:55:55 +08:00
}
}
2013-02-24 05:10:48 +08:00
free(prefix);
2013-02-24 04:55:55 +08:00
return idx;
}
void bwa_idx_destroy(bwaidx_t *idx)
{
if (idx == 0) return;
if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
free(idx);
}
2013-02-24 05:41:44 +08:00
/***********************
* SAM header routines *
***********************/
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
{
int i;
for (i = 0; i < bns->n_seqs; ++i)
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
if (rg_line) err_printf("%s\n", rg_line);
}
static char *bwa_escape(char *s)
{
char *p, *q;
for (p = q = s; *p; ++p) {
if (*p == '\\') {
++p;
if (*p == 't') *q++ = '\t';
else if (*p == 'n') *q++ = '\n';
else if (*p == 'r') *q++ = '\r';
else if (*p == '\\') *q++ = '\\';
} else *q++ = *p;
}
*q = '\0';
return s;
}
char *bwa_set_rg(const char *s)
{
char *p, *q, *r, *rg_line = 0;
memset(bwa_rg_id, 0, 256);
2013-02-24 05:44:02 +08:00
if (strstr(s, "@RG") != s) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
goto err_set_rg;
}
2013-02-24 05:41:44 +08:00
rg_line = strdup(s);
bwa_escape(rg_line);
if ((p = strstr(rg_line, "\tID:")) == 0) {
2013-02-24 05:44:02 +08:00
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__);
2013-02-24 05:41:44 +08:00
goto err_set_rg;
}
p += 4;
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
if (q - p + 1 > 256) {
2013-02-24 05:44:02 +08:00
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
2013-02-24 05:41:44 +08:00
goto err_set_rg;
}
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
*r++ = *q;
return rg_line;
err_set_rg:
free(rg_line);
return 0;
}