Merge branch 'master' into master_fixes

Merged to master version b621d3a

Conflicts:
	Makefile
	bntseq.c
	bwa.c
	bwase.c
	bwaseqio.c
	bwtaln.c
	bwtindex.c
	bwtio.c
	bwtmisc.c
	bwtsw2_aux.c
	cs2nt.c
	fastmap.c
	khash.h
	kseq.h
	ksw.c
	kvec.h
	simple_dp.c
	utils.c
	utils.h
This commit is contained in:
Rob Davies 2013-03-01 09:37:46 +00:00
commit 3d33ab063e
44 changed files with 3666 additions and 1942 deletions

1
.gitignore vendored
View File

@ -1,4 +1,5 @@
*.[oa]
bwa
test
test64
.*.swp

View File

@ -1,14 +1,11 @@
CC= gcc
CXX= g++
CFLAGS= -g -Wall -O2
CFLAGS= -g -Wall -O2 -msse2
CXXFLAGS= $(CFLAGS)
AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \
bwaseqio.o bwase.o kstring.o
AOBJS= QSufSort.o bwt_gen.o \
is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \
bwape.o cs2nt.o \
LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
PROG= bwa
@ -26,7 +23,7 @@ SUBDIRS= .
all:$(PROG)
bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)
@ -34,35 +31,40 @@ libbwa.a:$(LOBJS)
clean:
rm -f gmon.out *.o a.out $(PROG) *~ *.a
depend:
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) -- *.c )
# DO NOT DELETE THIS LINE -- make depend depends on it.
QSufSort.o: QSufSort.h
bamlite.o: bamlite.h utils.h
bntseq.o: bntseq.h kseq.h main.h utils.h
bwa.o: bntseq.h bwa.h bwt.h bwtaln.h bwtgap.h stdaln.h utils.h
bwape.o: bntseq.h bwase.h bwt.h bwtaln.h khash.h ksort.h kvec.h stdaln.h
bwape.o: utils.h
bwase.o: bntseq.h bwase.h bwt.h bwtaln.h kstring.h stdaln.h utils.h
bwaseqio.o: bamlite.h bwt.h bwtaln.h kseq.h stdaln.h utils.h
bwt.o: bwt.h kvec.h utils.h
bamlite.o: utils.h bamlite.h
bntseq.o: bntseq.h main.h utils.h kseq.h
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kseq.h
bwamem.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
bwamem.o: ksort.h kbtree.h
bwamem_pair.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h kvec.h ksw.h
bwape.o: bwtaln.h bwt.h stdaln.h kvec.h bntseq.h utils.h bwase.h bwa.h
bwape.o: khash.h
bwase.o: stdaln.h bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h
bwaseqio.o: bwtaln.h bwt.h stdaln.h utils.h bamlite.h kseq.h
bwt.o: utils.h bwt.h kvec.h
bwt_gen.o: QSufSort.h utils.h
bwt_lite.o: bwt_lite.h utils.h
bwtaln.o: bwt.h bwtaln.h bwtgap.h stdaln.h utils.h
bwtgap.o: bwt.h bwtaln.h bwtgap.h stdaln.h utils.h
bwtaln.o: bwtaln.h bwt.h stdaln.h bwtgap.h utils.h bwa.h bntseq.h
bwtgap.o: bwtgap.h bwt.h bwtaln.h stdaln.h utils.h
bwtindex.o: bntseq.h bwt.h main.h utils.h
bwtio.o: bwt.h utils.h
bwtmisc.o: bntseq.h bwt.h main.h utils.h
bwtsw2_aux.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h kseq.h ksort.h kstring.h
bwtsw2_aux.o: stdaln.h utils.h
bwtsw2_chain.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h ksort.h utils.h
bwtsw2_core.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h khash.h ksort.h kvec.h
bwtsw2_core.o: utils.h
bwtsw2_main.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h utils.h
bwtsw2_pair.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h kstring.h ksw.h utils.h
cs2nt.o: bwt.h bwtaln.h stdaln.h utils.h
fastmap.o: bntseq.h bwt.h kseq.h kvec.h utils.h
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h stdaln.h kstring.h
bwtsw2_aux.o: bwa.h kseq.h ksort.h
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h utils.h ksort.h
bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h utils.h khash.h
bwtsw2_core.o: ksort.h
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h ksw.h
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h utils.h kseq.h
is.o: utils.h
kopen.o: utils.h
kstring.o: kstring.h utils.h
ksw.o: ksw.h utils.h
main.o: main.h utils.h
simple_dp.o: kseq.h stdaln.h utils.h
stdaln.o: stdaln.h utils.h
utils.o: utils.h
utils.o: utils.h ksort.h kseq.h

View File

@ -59,12 +59,9 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin
qsint_t i, j;
qsint_t s, negatedSortedGroupLength;
qsint_t numSymbolAggregated;
qsint_t maxNumInputSymbol;
qsint_t numSortedPos = 1;
qsint_t newAlphabetSize;
maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
if (!skipTransform) {
/* bucketing possible*/
newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,

View File

@ -36,7 +36,7 @@
#include "utils.h"
#include "kseq.h"
KSEQ_INIT(gzFile, err_gzread)
KSEQ_DECLARE(gzFile)
unsigned char nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@ -310,21 +310,26 @@ int bwa_fa2pac(int argc, char *argv[])
return 0;
}
/*
 * Map a forward-strand coordinate to the index of the reference sequence
 * that contains it.
 *
 * Returns -1 when pos_f is at or beyond bns->l_pac; otherwise the index found
 * by a binary search over bns->anns[].offset (0 when there are no sequences).
 */
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
{
	int lo = 0, hi, idx = 0;
	if (pos_f >= bns->l_pac) return -1;
	hi = bns->n_seqs;
	while (lo < hi) {
		idx = (lo + hi) >> 1;
		if (pos_f < bns->anns[idx].offset) { // target lies in the lower half
			hi = idx;
			continue;
		}
		// pos_f starts at or after anns[idx]; stop if idx is the last sequence
		// or the next sequence starts beyond pos_f (i.e. pos_f is bracketed)
		if (idx == bns->n_seqs - 1 || pos_f < bns->anns[idx+1].offset) break;
		lo = idx + 1;
	}
	return idx;
}
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
{
int left, mid, right, nn;
if (ref_id) {
left = 0; mid = 0; right = bns->n_seqs;
while (left < right) {
mid = (left + right) >> 1;
if (pos_f >= bns->anns[mid].offset) {
if (mid == bns->n_seqs - 1) break;
if (pos_f < bns->anns[mid+1].offset) break; // bracketed
left = mid + 1;
} else right = mid;
}
*ref_id = mid;
}
if (ref_id) *ref_id = bns_pos2rid(bns, pos_f);
left = 0; right = bns->n_holes; nn = 0;
while (left < right) {
mid = (left + right) >> 1;
@ -343,3 +348,26 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
}
return nn;
}
/*
 * Extract the bases in [beg,end) from the packed reference.
 *
 * Coordinates at or above l_pac are treated as the reverse strand: the bases
 * are read from the mirrored forward position and complemented (3 - base).
 * beg/end are swapped if given in the wrong order, end is clamped to 2*l_pac
 * and beg to 0.
 *
 * On return *len holds end - beg (after clamping) and the result is a buffer
 * obtained from xmalloc() that the caller must free. If the interval bridges
 * the forward/reverse boundary, *len is 0 and NULL is returned.
 */
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
{
	uint8_t *out = 0;
	int64_t i, n = 0;
	if (end < beg) { // make sure beg <= end
		int64_t t = beg;
		beg = end;
		end = t;
	}
	if (end > l_pac<<1) end = l_pac<<1;
	if (beg < 0) beg = 0;
	if (beg < l_pac && end > l_pac) { // bridging the forward-reverse boundary: return nothing
		*len = 0;
		return out;
	}
	*len = end - beg;
	out = xmalloc(end - beg);
	if (beg >= l_pac) { // reverse strand
		int64_t lo = (l_pac<<1) - 1 - end;
		int64_t hi = (l_pac<<1) - 1 - beg;
		for (i = hi; i > lo; --i)
			out[n++] = 3 - _get_pac(pac, i);
	} else { // forward strand
		for (i = beg; i < end; ++i)
			out[n++] = _get_pac(pac, i);
	}
	return out;
}

View File

@ -29,6 +29,7 @@
#define BWT_BNTSEQ_H
#include <stdint.h>
#include <stdio.h>
#include <zlib.h>
#ifndef BWA_UBYTE
@ -71,7 +72,9 @@ extern "C" {
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
void bns_destroy(bntseq_t *bns);
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
#ifdef __cplusplus
}

537
bwa.c
View File

@ -1,274 +1,313 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "utils.h"
#include "bwa.h"
#include "bwt.h"
#include "bwtgap.h"
#include <zlib.h>
#include <assert.h>
#include "bntseq.h"
#include "bwa.h"
#include "ksw.h"
#include "utils.h"
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
int bwa_verbose = 3;
char bwa_rg_id[256];
extern unsigned char nst_nt4_table[256];
extern void seq_reverse(int len, uint8_t *seq, int is_comp);
/************************
* Batch FASTA/Q reader *
************************/
bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 };
#include "kseq.h"
KSEQ_DECLARE(gzFile)
struct bwa_idx_t {
/*
 * Strip a trailing "/<digit>" read-number suffix (e.g. "/1", "/2") from a
 * read name in place. Names of two characters or fewer are left unchanged.
 */
static inline void trim_readno(kstring_t *s)
{
	// isdigit() is only defined for values representable as unsigned char (or EOF);
	// cast so that bytes >= 0x80 in a name do not become negative arguments.
	if (s->l > 2 && s->s[s->l-2] == '/' && isdigit((unsigned char)s->s[s->l-1]))
		s->l -= 2, s->s[s->l] = 0;
}
/*
 * Deep-copy one kseq record into a bseq1_t. Name and sequence are always
 * duplicated; comment and quality are duplicated only when non-empty and are
 * set to NULL otherwise. l_seq is set to the length of the copied sequence.
 */
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
	s->name = xstrdup(ks->name.s);
	if (ks->comment.l) s->comment = xstrdup(ks->comment.s);
	else s->comment = 0;
	s->seq = xstrdup(ks->seq.s);
	if (ks->qual.l) s->qual = xstrdup(ks->qual.s);
	else s->qual = 0;
	s->l_seq = strlen(s->seq);
}
/*
 * Read a batch of sequences from one or two kseq streams.
 *
 * ks1_ and ks2_ are kseq_t pointers passed as void*; ks2_ may be NULL. When
 * ks2_ is given, records are stored interleaved (one from ks1_, then one from
 * ks2_). Reading stops at end of input, when the second stream runs out, or
 * once the summed sequence length reaches chunk_size.
 *
 * The number of records is stored in *n_; the returned array (NULL if nothing
 * was read) comes from xrealloc().
 */
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
{
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
int size = 0, m, n;
bseq1_t *seqs;
m = n = 0; seqs = 0;
while (kseq_read(ks) >= 0) {
if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
break;
}
// grow by doubling from 256; capacity is checked once per iteration, which is
// enough for the paired case because n and m are both even there
if (n >= m) {
m = m? m<<1 : 256;
seqs = xrealloc(seqs, m * sizeof(bseq1_t));
}
trim_readno(&ks->name);
kseq2bseq1(ks, &seqs[n]);
size += seqs[n++].l_seq;
if (ks2) {
trim_readno(&ks2->name);
kseq2bseq1(ks2, &seqs[n]);
size += seqs[n++].l_seq;
}
if (size >= chunk_size) break;
}
if (size == 0) { // test if the 2nd file is finished
// nothing was accumulated in this call: if the 2nd stream still yields a record, warn
if (ks2 && kseq_read(ks2) >= 0)
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
}
*n_ = n;
return seqs;
}
/*****************
* CIGAR related *
*****************/
// Generate CIGAR when the alignment end points are known
/*
 * Generate a CIGAR by banded global alignment of query against the reference
 * interval [rb,re) when both alignment end points are already known.
 *
 * mat      5x5 scoring matrix; mat[0] is used as the match score
 * q, r     gap penalties passed through to ksw_global()
 * w_       upper bound on the band width (before adding the length difference)
 * l_pac    length of the forward reference; coordinates >= l_pac are reverse strand
 * pac      packed reference
 * l_query  query length; query is temporarily reversed in place for
 *          reverse-strand hits and restored before a successful return
 * score    receives the alignment score (left untouched on early failure)
 * n_cigar  receives the number of CIGAR operations (0 on failure)
 * NM       receives mismatches + gap bases (-1 on failure)
 *
 * Returns the CIGAR array produced by ksw_global(), or NULL if the query is
 * empty, the interval is empty, the interval bridges the forward/reverse
 * boundary, or the reference could not be fetched in full.
 */
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{
	uint32_t *cigar = 0;
	uint8_t tmp, *rseq;
	int i, w;
	int64_t rlen, l_diff;
	*n_cigar = 0; *NM = -1;
	if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
	rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
	if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
	if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
		for (i = 0; i < l_query>>1; ++i)
			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
		for (i = 0; i < rlen>>1; ++i)
			tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
	}
	// set the band-width: start from the longest gap the score could pay for,
	// keep it at least 1 (the previous code capped it at 1, which discarded the
	// estimate), bound it by w_, then widen by the query/reference length difference
	w = (int)((double)(l_query * mat[0] - q) / r + 1.);
	w = w < 1? 1 : w;
	w = w < w_? w : w_;
	l_diff = rlen - l_query; // 64-bit difference; avoid abs(), which takes an int
	w += (int)(l_diff < 0? -l_diff : l_diff);
	// NW alignment
	*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
	{// compute NM
		int k, x, y, n_mm = 0, n_gap = 0;
		for (k = 0, x = y = 0; k < *n_cigar; ++k) {
			int op = cigar[k]&0xf;
			int len = cigar[k]>>4;
			if (op == 0) { // match
				for (i = 0; i < len; ++i)
					if (query[x + i] != rseq[y + i]) ++n_mm;
				x += len; y += len;
			} else if (op == 1) x += len, n_gap += len; // insertion: consumes query only
			else if (op == 2) y += len, n_gap += len; // deletion: consumes reference only
		}
		*NM = n_mm + n_gap;
	}
	if (rb >= l_pac) // reverse back query
		for (i = 0; i < l_query>>1; ++i)
			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
ret_gen_cigar:
	free(rseq);
	return cigar;
}
/*
 * Fix an alignment whose reference interval [*rb,*re) spans two adjacent
 * reference sequences by trimming it to the longer side of the junction.
 *
 * Returns:
 *    0  the interval lies within one reference sequence; nothing changed
 *    1  the interval bridged two sequences; *qb, *qe, *rb, *re may have been trimmed
 *   -1  the interval crosses the forward/reverse boundary; all four set to -1
 *   -2  the interval bridges three or more sequences; all four set to -1
 */
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
{
int ib, ie, is_rev;
int64_t fb, fe, mid = -1;
if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary
*qb = *qe = *rb = *re = -1;
return -1; // unable to fix
} else {
// fb/fe: forward-strand coordinates of the first and last base of the hit
fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev);
ib = bns_pos2rid(bns, fb);
if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix
fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev);
ie = bns_pos2rid(bns, fe);
if (ie - ib > 1) { // bridge three or more references
*qb = *qe = *rb = *re = -1;
return -2; // unable to fix
} else {
// l: number of hit bases that fall inside sequence ib; mid is the junction
// expressed in the same coordinate system as *rb / *re
int l = bns->anns[ib].offset + bns->anns[ib].len - fb;
mid = is_rev? *re - l : *rb + l;
}
}
if (mid >= 0) {
int i, score, n_cigar, y, NM;
uint32_t *cigar;
int64_t x;
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
// walk the CIGAR with x on the reference and y on the query until the
// operation containing mid is found, then keep the longer side
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
int op = cigar[i]&0xf, len = cigar[i]>>4;
if (op == 0) {
if (x <= mid && mid < x + len) {
if (mid - *rb > *re - mid) { // the first part is longer
if (x == mid) { // need to check the previous operation
assert(i); // mid != *rb should always stand
if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x;
else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4);
else abort(); // should not be here
} else *qe = y + (mid - x), *re = mid;
} else *qb = y + (mid - x), *rb = mid;
break;
} else x += len, y += len;
} else if (op == 1) { // insertion
y += len;
} else if (op == 2) { // deletion
if (x <= mid && mid < x + len) {
if (mid - *rb > *re - mid) *qe = y, *re = x;
else *qb = y, *rb = x + len;
break;
} else x += len;
} else abort(); // should not be here
}
free(cigar);
}
return 1;
}
/*********************
* Full index reader *
*********************/
/*
 * Work out the index prefix from a user-supplied hint by probing for index
 * files on disk.
 *
 * If "<hint>.64.bwt" can be opened, "<hint>.64" is returned; otherwise, if
 * "<hint>.bwt" can be opened, a copy of hint is returned; otherwise NULL.
 * A non-NULL result is heap-allocated and must be freed by the caller.
 */
char *bwa_idx_infer_prefix(const char *hint)
{
	int n = strlen(hint);
	char *path = xmalloc(n + 3 + 4 + 1); // room for ".64" + ".bwt" + NUL
	FILE *probe;
	strcpy(path, hint);
	strcpy(path + n, ".64.bwt");
	probe = fopen(path, "rb");
	if (probe != 0) {
		fclose(probe);
		path[n + 3] = 0; // cut back to "<hint>.64"
		return path;
	}
	strcpy(path + n, ".bwt");
	probe = fopen(path, "rb");
	if (probe == 0) {
		free(path);
		return 0;
	}
	fclose(probe);
	path[n] = 0; // cut back to "<hint>"
	return path;
}
bwt_t *bwa_idx_load_bwt(const char *hint)
{
char *tmp, *prefix;
bwt_t *bwt;
bntseq_t *bns;
uint8_t *pac;
};
struct bwa_buf_t {
int max_buf;
bwa_pestat_t pes;
gap_stack_t *stack;
gap_opt_t *opt;
int *diff_tab;
uint8_t *buf;
int *logn;
};
bwa_idx_t *bwa_idx_load(const char *prefix)
{
bwa_idx_t *p;
int l;
char *str;
l = strlen(prefix);
p = xcalloc(1, sizeof(bwa_idx_t));
str = xmalloc(l + 10);
strcpy(str, prefix);
p->bns = bns_restore(str);
strcpy(str + l, ".bwt");
p->bwt = bwt_restore_bwt(str);
str[l] = 0;
strcpy(str + l, ".sa");
bwt_restore_sa(str, p->bwt);
free(str);
p->pac = xcalloc(p->bns->l_pac/4+1, 1);
err_fread_noeof(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac);
err_fclose(p->bns->fp_pac);
p->bns->fp_pac = 0;
return p;
prefix = bwa_idx_infer_prefix(hint);
if (prefix == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
return 0;
}
tmp = xcalloc(strlen(prefix) + 5, 1);
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
bwt_restore_sa(tmp, bwt);
free(tmp); free(prefix);
return bwt;
}
void bwa_idx_destroy(bwa_idx_t *p)
bwaidx_t *bwa_idx_load(const char *hint, int which)
{
bns_destroy(p->bns);
bwt_destroy(p->bwt);
free(p->pac);
free(p);
bwaidx_t *idx;
char *prefix;
prefix = bwa_idx_infer_prefix(hint);
if (prefix == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
return 0;
}
idx = xcalloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
if (which & BWA_IDX_BNS) {
idx->bns = bns_restore(prefix);
if (which & BWA_IDX_PAC) {
idx->pac = xcalloc(idx->bns->l_pac/4+1, 1);
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
err_fclose(idx->bns->fp_pac);
idx->bns->fp_pac = 0;
}
}
free(prefix);
return idx;
}
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score)
/*
 * Release an index returned by bwa_idx_load(). Accepts NULL, and tolerates
 * an index in which any of bwt/bns/pac was not loaded.
 */
void bwa_idx_destroy(bwaidx_t *idx)
{
	if (idx != 0) {
		if (idx->bwt != 0) bwt_destroy(idx->bwt);
		if (idx->bns != 0) bns_destroy(idx->bns);
		if (idx->pac != 0) free(idx->pac);
		free(idx);
	}
}
/***********************
* SAM header routines *
***********************/
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
{
extern gap_opt_t *gap_init_opt(void);
extern int bwa_cal_maxdiff(int l, double err, double thres);
int i;
bwa_buf_t *p;
p = xmalloc(sizeof(bwa_buf_t));
p->stack = gap_init_stack2(max_score);
p->opt = gap_init_opt();
p->opt->s_gapo = opt->s_gapo;
p->opt->s_gape = opt->s_gape;
p->opt->max_diff = opt->max_diff;
p->opt->max_gapo = opt->max_gapo;
p->opt->max_gape = opt->max_gape;
p->opt->seed_len = opt->seed_len;
p->opt->max_seed_diff = opt->max_seed_diff;
p->opt->fnr = opt->fnr;
p->diff_tab = xcalloc(BWA_MAX_QUERY_LEN, sizeof(int));
for (i = 1; i < BWA_MAX_QUERY_LEN; ++i)
p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
p->logn = xcalloc(256, sizeof(int));
for (i = 1; i != 256; ++i)
p->logn[i] = (int)(4.343 * log(i) + 0.499);
return p;
for (i = 0; i < bns->n_seqs; ++i)
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
if (rg_line) err_printf("%s\n", rg_line);
}
void bwa_buf_destroy(bwa_buf_t *p)
static char *bwa_escape(char *s)
{
gap_destroy_stack(p->stack);
free(p->diff_tab); free(p->logn); free(p->opt);
free(p);
}
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq)
{
extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width);
int i, seq_len, buf_len;
bwt_width_t *w, *seed_w;
uint8_t *s;
gap_opt_t opt2 = *buf->opt;
bwa_sai_t sai;
seq_len = strlen(seq);
// estimate the buffer length
buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len;
if (buf_len > buf->max_buf) {
buf->max_buf = buf_len;
kroundup32(buf->max_buf);
buf->buf = xrealloc(buf->buf, buf->max_buf);
char *p, *q;
for (p = q = s; *p; ++p) {
if (*p == '\\') {
++p;
if (*p == 't') *q++ = '\t';
else if (*p == 'n') *q++ = '\n';
else if (*p == 'r') *q++ = '\r';
else if (*p == '\\') *q++ = '\\';
} else *q++ = *p;
}
memset(buf->buf, 0, buf_len);
seed_w = (bwt_width_t*)buf->buf;
w = seed_w + buf->opt->seed_len;
s = (uint8_t*)(w + seq_len + 1);
if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len];
// copy the sequence
for (i = 0; i < seq_len; ++i)
s[i] = nst_nt4_table[(int)seq[i]];
seq_reverse(seq_len, s, 0);
// mapping
bwt_cal_width(idx->bwt, seq_len, s, w);
if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff;
if (seq_len > buf->opt->seed_len)
bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w);
for (i = 0; i < seq_len; ++i) // complement; I forgot why...
s[i] = s[i] > 3? 4 : 3 - s[i];
sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack);
return sai;
*q = '\0';
return s;
}
static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps)
char *bwa_set_rg(const char *s)
{
uint64_t x = pos, z;
int k, y = 0;
*n_mm = *n_gaps = 0;
for (k = 0; k < n_cigar; ++k) {
int l = cigar[k]>>4;
int op = cigar[k]&0xf;
if (op == 0) { // match/mismatch
for (z = 0; z < l && x + z < l_pac; ++z) {
int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm);
}
}
if (op == 1 || op == 2) (*n_gaps) += l;
if (op == 0 || op == 2) x += l;
if (op == 0 || op == 1 || op == 4) y += l;
char *p, *q, *r, *rg_line = 0;
memset(bwa_rg_id, 0, 256);
if (strstr(s, "@RG") != s) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
goto err_set_rg;
}
rg_line = xstrdup(s);
bwa_escape(rg_line);
if ((p = strstr(rg_line, "\tID:")) == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__);
goto err_set_rg;
}
p += 4;
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
if (q - p + 1 > 256) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
goto err_set_rg;
}
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
*r++ = *q;
return rg_line;
err_set_rg:
free(rg_line);
return 0;
}
void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln)
{
extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct);
int strand, seq_len, i, n_gap, n_mm;
uint64_t pos3, pac_pos;
uint8_t *s[2];
memset(aln, 0, sizeof(bwa_aln_t));
seq_len = strlen(seq);
if (seq_len<<1 > buf->max_buf) {
buf->max_buf = seq_len<<1;
kroundup32(buf->max_buf);
buf->buf = xrealloc(buf->buf, buf->max_buf);
}
s[0] = buf->buf;
s[1] = s[0] + seq_len;
for (i = 0; i < seq_len; ++i)
s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]];
seq_reverse(seq_len, s[1], 1);
pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand);
if (strand) aln->flag |= 16;
if (n_gaps) { // only for gapped alignment
int n_cigar;
bwa_cigar_t *cigar16;
cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1);
aln->n_cigar = n_cigar;
aln->cigar = xmalloc(n_cigar * 4);
for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) {
int op = cigar16[i]>>14;
int len = cigar16[i]&0x3fff;
if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR
aln->cigar[i] = len<<4 | op;
if (op == 0 || op == 2) pos3 += len;
}
free(cigar16);
} else { // ungapped
aln->n_cigar = 1;
aln->cigar = xmalloc(4);
aln->cigar[0] = seq_len<<4 | 0;
pos3 = pac_pos + seq_len;
}
aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id);
aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset;
if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence
aln->flag |= 4; // read unmapped
compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap);
aln->n_mm = n_mm;
aln->n_gap = n_gap;
}
/************************
* Single-end alignment *
************************/
bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar)
{
bwa_one_t *one;
int best, cnt, i, seq_len;
seq_len = strlen(seq);
one = xcalloc(1, sizeof(bwa_one_t));
one->sai = bwa_sai(idx, buf, seq);
if (one->sai.n == 0) return one;
// count number of hits; randomly select one alignment
best = one->sai.sai[0].score;
for (i = cnt = 0; i < one->sai.n; ++i) {
bwa_sai1_t *p = &one->sai.sai[i];
if (p->score > best) break;
if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
one->which = p;
one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
}
cnt += p->l - p->k + 1;
}
one->c1 = cnt;
for (; i < one->sai.n; ++i)
cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1;
one->c2 = cnt - one->c1;
// estimate single-end mapping quality
one->mapQs = -1;
if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible?
else if (one->c1 > 1) one->mapQs = 0;
else {
int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape;
if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25;
else if (one->c2 == 0) one->mapQs = 37;
}
if (one->mapQs < 0) {
cnt = (one->c2 >= 255)? 255 : one->c2;
one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt];
}
one->mapQ = one->mapQs;
// compute CIGAR on request
one->one.ref_id = -1;
if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one);
return one;
}
void bwa_one_destroy(bwa_one_t *one)
{
free(one->sai.sai);
free(one->one.cigar);
free(one);
}
/************************
* Paired-end alignment *
************************/
void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2])
{
}
void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2])
{
}

108
bwa.h
View File

@ -2,103 +2,45 @@
#define BWA_H_
#include <stdint.h>
#include "bntseq.h"
#include "bwt.h"
#define BWA_DEF_MAX_SCORE 2048
#define BWA_MAX_QUERY_LEN 1024
// BWA index
struct bwa_idx_t;
typedef struct bwa_idx_t bwa_idx_t;
// Buffer for BWA alignment
struct bwa_buf_t;
typedef struct bwa_buf_t bwa_buf_t;
// BWA alignment options
typedef struct {
int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3
int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions
int seed_len, max_seed_diff; // seed length and max differences allowed in the seed
float fnr; // parameter for automatic length-adjusted max differences
} bwa_opt_t;
// default BWA alignment options
extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 }
// an interval hit in the SA coordinate; basic unit in .sai files
typedef struct {
uint32_t n_mm:16, n_gapo:8, n_gape:8;
int score;
uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits
} bwa_sai1_t;
// all interval hits in the SA coordinate
typedef struct {
int n; // number of interval hits
bwa_sai1_t *sai;
} bwa_sai_t;
// an alignment
typedef struct {
uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment
int32_t ref_id; // referece sequence index (the first seq is indexed by 0)
uint32_t offset; // coordinate on the reference; zero-based
uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag
uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations
} bwa_aln_t;
#define BWA_IDX_BWT 0x1
#define BWA_IDX_BNS 0x2
#define BWA_IDX_PAC 0x4
#define BWA_IDX_ALL 0x7
typedef struct {
int mapQs, mapQ, c1, c2;
uint64_t sa;
bwa_sai1_t *which;
bwa_sai_t sai;
bwa_aln_t one;
} bwa_one_t;
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
} bwaidx_t;
typedef struct {
double avg, std, ap_prior;
uint64_t low, high, high_bayesian;
} bwa_pestat_t;
int l_seq;
char *name, *comment, *seq, *qual, *sam;
} bseq1_t;
extern int bwa_verbose;
extern char bwa_rg_id[256];
#ifdef __cplusplus
extern "C" {
#endif
// load a BWA index
bwa_idx_t *bwa_idx_load(const char *prefix);
void bwa_idx_destroy(bwa_idx_t *p);
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
// allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score);
void bwa_buf_destroy(bwa_buf_t *p);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
/**
* Find all the SA intervals
*
* @param idx BWA index; multiple threads can share the same index
* @param buf BWA alignment buffer; each thread should have its own buffer
* @param seq NULL terminated C string, consisting of A/C/G/T/N only
*
* @return SA intervals seq is matched to
*/
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq);
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
/**
* Construct an alignment in the base-pair coordinate
*
* @param idx BWA index
* @param buf BWA alignment buffer
* @param seq NULL terminated C string
* @param sa Suffix array value
* @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape)
*
* @return An alignment
*/
void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln);
bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);
bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar);
void bwa_one_destroy(bwa_one_t *one);
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
char *bwa_set_rg(const char *s);
#ifdef __cplusplus
}

791
bwamem.c 100644
View File

@ -0,0 +1,791 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
#ifdef HAVE_PTHREAD
#include <pthread.h>
#endif
#include "kstring.h"
#include "bwamem.h"
#include "bntseq.h"
#include "ksw.h"
#include "kvec.h"
#include "ksort.h"
#include "utils.h"
/* Theory on probability and scoring *ungapped* alignment
*
* s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution
* s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate
*
* Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x
*
* If the matching score is x and mismatch penalty is -y, we can compute error rate e:
* e = .75 * exp[-log(4) * y/x]
*
* log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)}
* = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l)
*
* where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale:
* Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x)
*
*
* Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1)
* Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4)
*
* When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR)
*/
/*
 * Allocate a mem_opt_t (zero-initialised by xcalloc) and fill in the default
 * settings, including the scoring matrix derived from a and b.
 * The caller owns the returned structure.
 */
mem_opt_t *mem_opt_init()
{
	mem_opt_t *opt = xcalloc(1, sizeof(mem_opt_t));

	// scoring; a and b feed mem_fill_scmat() below as match score and mismatch penalty
	opt->a = 1;
	opt->b = 4;
	opt->q = 6;
	opt->r = 1;
	opt->w = 100;

	opt->flag = 0;
	opt->min_seed_len = 19;
	opt->split_width = 10;
	opt->max_occ = 10000;
	opt->max_chain_gap = 10000;
	opt->max_ins = 10000;
	opt->mask_level = 0.50;
	opt->chain_drop_ratio = 0.50;
	opt->split_factor = 1.5;
	opt->chunk_size = 10000000;
	opt->n_threads = 1;
	opt->pen_unpaired = 9;
	opt->max_matesw = 100;

	mem_fill_scmat(opt->a, opt->b, opt->mat);
	return opt;
}
/*
 * Fill a 5x5 row-major scoring matrix: a on the diagonal of the 4x4 upper-left
 * part, -b elsewhere in that part, and 0 for the whole fifth row and fifth
 * column (the ambiguous base).
 */
void mem_fill_scmat(int a, int b, int8_t mat[25])
{
	int row, col;
	for (row = 0; row < 5; ++row) {
		for (col = 0; col < 5; ++col) {
			int8_t *cell = &mat[row * 5 + col];
			if (row == 4 || col == 4) *cell = 0; // ambiguous base
			else if (row == col) *cell = a;
			else *cell = -b;
		}
	}
}
/***************************
* SMEM iterator interface *
***************************/
// State of an SMEM iterator: the index and query being scanned, the current
// scan position, and the vectors reused across calls to smem_next().
struct __smem_i {
const bwt_t *bwt; // index searched by bwt_smem1(); set by smem_itr_init(), not freed by smem_itr_destroy()
const uint8_t *query; // query set by smem_set_query(); values > 3 are skipped as ambiguous bases
int start, len; // next query position to search from; query length
bwtintv_v *matches; // matches; to be returned by smem_next()
bwtintv_v *sub; // sub-matches inside the longest match; temporary
bwtintv_v *tmpvec[2]; // temporary arrays
};
/*
 * Create an SMEM iterator over the given index. All working vectors are
 * allocated empty; the index pointer is borrowed. Release the iterator with
 * smem_itr_destroy().
 */
smem_i *smem_itr_init(const bwt_t *bwt)
{
	int k;
	smem_i *it = xcalloc(1, sizeof(smem_i));
	it->bwt = bwt;
	for (k = 0; k < 2; ++k)
		it->tmpvec[k] = xcalloc(1, sizeof(bwtintv_v));
	it->matches = xcalloc(1, sizeof(bwtintv_v));
	it->sub = xcalloc(1, sizeof(bwtintv_v));
	return it;
}
/*
 * Free an SMEM iterator together with its four working vectors and their
 * element arrays. The index and the query are not touched.
 */
void smem_itr_destroy(smem_i *itr)
{
	bwtintv_v *vecs[4];
	int k;
	vecs[0] = itr->tmpvec[0];
	vecs[1] = itr->tmpvec[1];
	vecs[2] = itr->matches;
	vecs[3] = itr->sub;
	for (k = 0; k < 4; ++k) {
		free(vecs[k]->a);
		free(vecs[k]);
	}
	free(itr);
}
/*
 * Point the iterator at a new query of the given length and rewind it to the
 * first position. The query buffer is borrowed, not copied.
 */
void smem_set_query(smem_i *itr, int len, const uint8_t *query)
{
	itr->start = 0; // restart the scan from the beginning
	itr->len = len;
	itr->query = query;
}
/*
 * Return the next batch of matches from the query, or NULL once the query is
 * exhausted (or only ambiguous bases remain).
 *
 * When split_len > 0 and the longest match is at least split_len long with
 * x[2] <= split_width, a second search is started from the middle of that
 * match and qualifying sub-matches are merged into the result.
 *
 * The returned vector is owned by the iterator and is reset on the next call.
 */
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
{
int i, max, max_i, ori_start;
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
if (itr->start >= itr->len || itr->start < 0) return 0;
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
ori_start = itr->start;
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
// NOTE(review): info appears to pack the match start in the high 32 bits and
// the match end in the low 32 bits, so (low - high) is the length — confirm in bwt.h
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
bwtintv_t *p = &itr->matches->a[i];
int len = (uint32_t)p->info - (p->info>>32);
if (max < len) max = len, max_i = i;
}
// NOTE(review): x[2] is presumably the number of occurrences of the match — confirm in bwt.h
if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long
int j;
bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging
bwtintv_t *p = &itr->matches->a[max_i];
bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM
i = j = 0; a->n = 0;
while (i < itr->matches->n && j < itr->sub->n) { // ordered merge
// sort key: high 32 bits of info ascending, then (len - low 32 bits) ascending
int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info);
int64_t xj = itr->sub->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->sub->a[j].info);
if (xi < xj) {
kv_push(bwtintv_t, *a, itr->matches->a[i]);
++i;
// keep a sub-match only if it is at least half as long as the longest match
// and its low 32 bits of info exceed ori_start
} else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) {
kv_push(bwtintv_t, *a, itr->sub->a[j]);
++j;
} else ++j;
}
// append whatever is left on either side, applying the same filter to sub-matches
for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]);
for (; j < itr->sub->n; ++j)
if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start)
kv_push(bwtintv_t, *a, itr->sub->a[j]);
kv_copy(bwtintv_t, *itr->matches, *a);
}
return itr->matches;
}
/********************************
* Chaining while finding SMEMs *
********************************/
// One exact-match seed: query interval [qbeg, qbeg+len) matching the reference starting at rbeg.
typedef struct {
int64_t rbeg; // seed start on the reference, in the forward-reverse coordinate returned by bwt_sa()
int32_t qbeg, len; // seed start on the query and seed length
} mem_seed_t;
// A chain: a growable list of seeds considered to belong together.
typedef struct {
int n, m; // number of seeds stored / allocated capacity of $seeds
int64_t pos; // B-tree key; set to the reference start of the seed that opened the chain
mem_seed_t *seeds; // heap array of seeds in insertion order
} mem_chain_t;
// Growable array of chains (kvec layout: used size, capacity, data).
typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
#include "kbtree.h"
// Three-way comparison of two chains by $pos: -1, 0 or 1 as a sorts before, equal to or after b.
#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
// Instantiate the "chn" B-tree of chains used via kb_init/kb_putp/kb_intervalp below.
KBTREE_INIT(chn, mem_chain_t, chain_cmp)
/*
 * Try to absorb seed $p into chain $c.
 *
 * Returns 1 if the seed is already contained in the span of the chain
 * (nothing is stored) or if it was appended to the chain; returns 0 if the
 * seed does not fit, in which case the caller is expected to open a new chain.
 *
 * A seed is appended when it lies at or after the last seed on the reference,
 * stays within opt->w of the last seed's diagonal, and starts less than
 * opt->max_chain_gap beyond the end of the last seed on both sequences.
 */
static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p)
{
	const mem_seed_t *head = &c->seeds[0];
	const mem_seed_t *tail = &c->seeds[c->n-1];
	int64_t chain_qend = tail->qbeg + tail->len;
	int64_t chain_rend = tail->rbeg + tail->len;
	int64_t dq, dr;

	// contained seed: report success without storing anything
	if (p->qbeg >= head->qbeg && p->qbeg + p->len <= chain_qend
		&& p->rbeg >= head->rbeg && p->rbeg + p->len <= chain_rend)
		return 1;

	dq = p->qbeg - tail->qbeg; // always non-negative
	dr = p->rbeg - tail->rbeg;
	if (dr < 0) return 0; // seed lies before the last seed on the reference
	if (dq - dr > opt->w || dr - dq > opt->w) return 0; // too far off the diagonal
	if (dq - tail->len >= opt->max_chain_gap || dr - tail->len >= opt->max_chain_gap) return 0; // gap too large

	if (c->n == c->m) { // grow the seed array; $head and $tail are not used past this point
		c->m <<= 1;
		c->seeds = xrealloc(c->seeds, c->m * sizeof(mem_seed_t));
	}
	c->seeds[c->n++] = *p;
	return 1;
}
/*
 * Enumerate the matches of the query held by $itr and file every occurrence
 * as a seed into the chain B-tree $tree.
 *
 * Matches shorter than opt->min_seed_len or occurring more than opt->max_occ
 * times are ignored. Each remaining occurrence is offered to the closest
 * chain at or below its reference position; if there is none, or
 * test_and_merge() rejects it, the seed starts a new chain of its own.
 */
static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
{
	const bwtintv_v *hits;
	int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
	if (split_len >= itr->len) split_len = itr->len; // never ask for a split longer than the query
	for (;;) { // to find all SMEM and some internal MEM
		int i;
		hits = smem_next(itr, split_len, opt->split_width);
		if (hits == 0) break;
		for (i = 0; i < hits->n; ++i) { // go through each SMEM/MEM up to itr->start
			bwtintv_t *iv = &hits->a[i];
			int seed_len = (uint32_t)iv->info - (iv->info>>32); // seed length
			int64_t occ;
			if (seed_len < opt->min_seed_len) continue; // too short
			if (iv->x[2] > opt->max_occ) continue; // too repetitive
			for (occ = 0; occ < iv->x[2]; ++occ) {
				mem_chain_t key, *lower, *upper;
				mem_seed_t seed;
				int merged = 0;
				seed.rbeg = key.pos = bwt_sa(itr->bwt, iv->x[0] + occ); // this is the base coordinate in the forward-reverse reference
				seed.qbeg = iv->info>>32;
				seed.len = seed_len;
				if (kb_size(tree)) {
					kb_intervalp(chn, tree, &key, &lower, &upper); // find the closest chain
					if (lower && test_and_merge(opt, lower, &seed)) merged = 1;
				}
				if (!merged) { // add the seed as a new chain
					key.n = 1; key.m = 4;
					key.seeds = xcalloc(key.m, sizeof(mem_seed_t));
					key.seeds[0] = seed;
					kb_putp(chn, tree, &key);
				}
			}
		}
	}
}
/*
 * Debugging aid: print one line per chain, starting with the number of seeds
 * and followed by one tab-separated "len,qbeg,rbeg(name:strand pos)" entry
 * per seed, where pos is the 1-based position on the named reference sequence.
 */
void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
{
	int ci;
	for (ci = 0; ci < chn->n; ++ci) {
		const mem_chain_t *chain = &chn->a[ci];
		int si;
		err_printf("%d", chain->n);
		for (si = 0; si < chain->n; ++si) {
			const mem_seed_t *seed = &chain->seeds[si];
			int is_rev, ref_id;
			bwtint_t pos = bns_depos(bns, seed->rbeg, &is_rev);
			if (is_rev) pos -= seed->len - 1; // report the leftmost base for reverse-strand seeds
			bns_cnt_ambi(bns, pos, seed->len, &ref_id);
			err_printf("\t%d,%d,%ld(%s:%c%ld)", seed->len, seed->qbeg, (long)seed->rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
		}
		err_putchar('\n');
	}
}
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
{
mem_chain_v chain;
smem_i *itr;
kbtree_t(chn) *tree;
kv_init(chain);
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
tree = kb_init(chn, KB_DEFAULT_SIZE);
itr = smem_itr_init(bwt);
smem_set_query(itr, len, seq);
mem_insert_seed(opt, tree, itr);
kv_resize(mem_chain_t, chain, kb_size(tree));
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
__kb_traverse(mem_chain_t, tree, traverse_func);
#undef traverse_func
smem_itr_destroy(itr);
kb_destroy(chn, tree);
return chain;
}
/********************
* Filtering chains *
********************/
// Per-chain bookkeeping record used by mem_chain_flt().
typedef struct {
int beg, end, w; // query interval [beg,end) spanned by the chain, and the chain's seed-coverage weight
void *p, *p2; // p: the chain itself; p2: first lower-ranked chain found to overlap it significantly (0 if none)
} flt_aux_t;
// "Less than" for sorting: a record with a larger weight sorts first.
#define flt_lt(a, b) ((a).w > (b).w)
KSORT_INIT(mem_flt, flt_aux_t, flt_lt)
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
{
flt_aux_t *a;
int i, j, n;
if (n_chn <= 1) return n_chn; // no need to filter
a = xmalloc(sizeof(flt_aux_t) * n_chn);
for (i = 0; i < n_chn; ++i) {
mem_chain_t *c = &chains[i];
int64_t end;
int w = 0, tmp;
for (j = 0, end = 0; j < c->n; ++j) {
const mem_seed_t *s = &c->seeds[j];
if (s->qbeg >= end) w += s->len;
else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end;
end = end > s->qbeg + s->len? end : s->qbeg + s->len;
}
tmp = w;
for (j = 0, end = 0; j < c->n; ++j) {
const mem_seed_t *s = &c->seeds[j];
if (s->rbeg >= end) w += s->len;
else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
end = end > s->qbeg + s->len? end : s->qbeg + s->len;
}
w = w < tmp? w : tmp;
a[i].beg = c->seeds[0].qbeg;
a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len;
a[i].w = w; a[i].p = c; a[i].p2 = 0;
}
ks_introsort(mem_flt, n_chn, a);
{ // reorder chains such that the best chain appears first
mem_chain_t *swap;
swap = xmalloc(sizeof(mem_chain_t) * n_chn);
for (i = 0; i < n_chn; ++i) {
swap[i] = *((mem_chain_t*)a[i].p);
a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
}
memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
free(swap);
}
for (i = 1, n = 1; i < n_chn; ++i) {
for (j = 0; j < n; ++j) {
int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg;
int e_min = a[j].end < a[i].end? a[j].end : a[i].end;
if (e_min > b_max) { // have overlap
int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg;
if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
if (a[j].p2 == 0) a[j].p2 = a[i].p;
if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
break;
}
}
}
if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
}
for (i = 0; i < n; ++i) { // mark chains to be kept
mem_chain_t *c = (mem_chain_t*)a[i].p;
if (c->n > 0) c->n = -c->n;
c = (mem_chain_t*)a[i].p2;
if (c && c->n > 0) c->n = -c->n;
}
free(a);
for (i = 0; i < n_chn; ++i) { // free discarded chains
mem_chain_t *c = &chains[i];
if (c->n >= 0) {
free(c->seeds);
c->n = c->m = 0;
} else c->n = -c->n;
}
for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains
if (chains[i].n > 0) {
if (n != i) chains[n++] = chains[i];
else ++n;
}
}
return n;
}
/******************************
* De-overlap single-end hits *
******************************/
// Sort order for alignment regions: higher score first, then smaller reference start, then smaller query start.
#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
/*
 * Sort $n alignment regions with alnreg_slt and remove duplicates in place.
 *
 * A region equal to its predecessor in score, reference start and query
 * start is marked by collapsing its query interval (qe = qb); every region
 * after the first whose query interval is empty is then squeezed out.
 *
 * @return the number of regions remaining at the front of $a
 */
int mem_sort_and_dedup(int n, mem_alnreg_t *a)
{
	int kept, i;
	if (n <= 1) return n;
	ks_introsort(mem_ars, n, a);
	// mark identical hits; only qe is modified, so the scan direction does not matter
	for (i = n - 1; i >= 1; --i) {
		const mem_alnreg_t *prev = &a[i-1];
		if (a[i].score != prev->score) continue;
		if (a[i].rb != prev->rb || a[i].qb != prev->qb) continue;
		a[i].qe = a[i].qb;
	}
	// exclude identical hits; a[0] is always kept
	kept = 1;
	for (i = 1; i < n; ++i) {
		if (a[i].qe <= a[i].qb) continue;
		if (kept != i) a[kept] = a[i];
		++kept;
	}
	return kept;
}
/*
 * Classify sorted, de-duplicated regions as primary or secondary.
 * IMPORTANT: must run mem_sort_and_dedup() before calling this function.
 *
 * Regions are visited in order. A region that significantly overlaps, on
 * the query, an earlier primary region becomes secondary to it
 * ($secondary = index of that primary); otherwise it becomes a new primary
 * ($secondary = -1). For each primary, $sub receives the score of the first
 * region it shadows and $sub_n counts shadowed regions whose score is within
 * max(a+b, q+r) of the primary's score. The logic mirrors the filtering loop
 * in mem_chain_flt().
 */
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a)
{
	int i, k, min_diff;
	kvec_t(int) prim; // indices of the primary regions found so far
	if (n == 0) return;
	kv_init(prim);
	for (i = 0; i < n; ++i) {
		a[i].sub = 0;
		a[i].secondary = -1;
	}
	min_diff = opt->a + opt->b;
	if (min_diff <= opt->q + opt->r) min_diff = opt->q + opt->r;
	kv_push(int, prim, 0);
	for (i = 1; i < n; ++i) {
		int len_i = a[i].qe - a[i].qb;
		for (k = 0; k < prim.n; ++k) {
			int j = prim.a[k];
			int ov_beg = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
			int ov_end = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
			int len_j, shorter;
			if (ov_end <= ov_beg) continue; // no overlap
			len_j = a[j].qe - a[j].qb;
			shorter = len_i < len_j? len_i : len_j;
			if (!(ov_end - ov_beg >= shorter * opt->mask_level)) continue; // overlap not significant
			if (a[j].sub == 0) a[j].sub = a[i].score;
			if (a[j].score - a[i].score <= min_diff) ++a[j].sub_n;
			break;
		}
		if (k == prim.n) kv_push(int, prim, i);
		else a[i].secondary = prim.a[k];
	}
	free(prim.a);
}
/****************************************
* Construct the alignment from a chain *
****************************************/
/*
 * Longest gap worth considering when $qlen query bases remain: the gap length
 * at which a perfect match of $qlen bases (qlen * a) is cancelled by the gap
 * cost, clamped to the range [1, 2 * opt->w].
 */
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
{
	const int cap = opt->w<<1;
	int gap = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
	if (gap < 1) gap = 1;
	return gap < cap? gap : cap;
}
/*
 * Turn the seeds of chain $c into alignment regions appended to $av.
 *
 * A reference window large enough for any extension of any seed is fetched
 * once. Seeds are then visited from longest to shortest; a seed lying inside
 * a region produced earlier, close to that region's diagonal, is skipped.
 * Every other seed is extended to the left and to the right with
 * ksw_extend() and the resulting region (coordinates, score, seed coverage)
 * is pushed to $av.
 *
 * @param opt      scoring and band-width options
 * @param l_pac    length of the forward reference; coordinates >= l_pac are on the reverse strand
 * @param pac      packed reference passed to bns_get_seq()
 * @param l_query  query length
 * @param query    query bases
 * @param c        chain to extend; nothing is done if it has no seeds
 * @param av       output vector of alignment regions (appended to)
 */
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds
	int i, k;
	int64_t rlen, rmax[2], tmp;
	const mem_seed_t *s;
	uint8_t *rseq = 0;
	uint64_t *srt;
	if (c->n == 0) return;
	// get the max possible span
	rmax[0] = l_pac<<1; rmax[1] = 0;
	for (i = 0; i < c->n; ++i) {
		int64_t b, e;
		const mem_seed_t *t = &c->seeds[i];
		b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg));
		e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len));
		rmax[0] = rmax[0] < b? rmax[0] : b;
		rmax[1] = rmax[1] > e? rmax[1] : e;
	}
	rmax[0] = rmax[0] > 0? rmax[0] : 0;
	rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
	if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
		if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac;
		else rmax[0] = l_pac;
	}
	// retrieve the reference sequence
	rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
	if (rlen != rmax[1] - rmax[0]) { // the whole window could not be retrieved
		free(rseq); // fix: the buffer used to leak on this early return; free(0) is a no-op
		return;
	}
	// sort seed indices by seed length; the loop below walks them from longest to shortest
	srt = xmalloc(c->n * 8);
	for (i = 0; i < c->n; ++i)
		srt[i] = (uint64_t)c->seeds[i].len<<32 | i;
	ks_introsort_64(c->n, srt);
	for (k = c->n - 1; k >= 0; --k) {
		mem_alnreg_t *a;
		s = &c->seeds[(uint32_t)srt[k]];
		for (i = 0; i < av->n; ++i) { // test whether extension has been made before
			mem_alnreg_t *p = &av->a[i];
			int64_t rd;
			int qd, w, max_gap;
			if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained
			// qd: distance ahead of the seed on query; rd: on reference
			qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
			max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
			w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width
			if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
			// similar to the previous four lines, but this time we look at the region behind
			qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
			max_gap = cal_max_gap(opt, qd < rd? qd : rd);
			w = max_gap < opt->w? max_gap : opt->w;
			if (qd - rd < w && rd - qd < w) break;
		}
		if (i < av->n) continue; // already covered by an earlier region
		a = kv_pushp(mem_alnreg_t, *av);
		memset(a, 0, sizeof(mem_alnreg_t));
		if (s->qbeg) { // left extension
			uint8_t *rs, *qs;
			int qle, tle;
			// both sequences are reversed so that the extension runs away from the seed
			qs = xmalloc(s->qbeg);
			for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
			tmp = s->rbeg - rmax[0];
			rs = xmalloc(tmp);
			for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
			a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle);
			a->qb = s->qbeg - qle; a->rb = s->rbeg - tle;
			free(qs); free(rs);
		} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
		if (s->qbeg + s->len != l_query) { // right extension
			int qle, tle, qe, re;
			qe = s->qbeg + s->len;
			re = s->rbeg + s->len - rmax[0];
			a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle);
			a->qe = qe + qle; a->re = rmax[0] + re + tle;
		} else a->qe = l_query, a->re = s->rbeg + s->len;
		if (bwa_verbose >= 4) err_printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
		// compute seedcov
		for (i = 0, a->seedcov = 0; i < c->n; ++i) {
			const mem_seed_t *t = &c->seeds[i];
			if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
				a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
		}
	}
	free(srt); free(rseq);
}
/*****************************
* Basic hit->SAM conversion *
*****************************/
/*
 * Append one SAM line describing hit $p_ of read $s to $str.
 *
 * str      output buffer; the line (ending in '\n') is appended
 * mat,q,r,w  scoring matrix, gap open/extension penalties and band width, forwarded to bwa_gen_cigar()
 * bns,pac  reference annotations and packed reference
 * s        the read: name, sequence (numeric codes indexing "ACGTN"), optional qualities and comment
 * p_       the hit to print, or 0 to print an unmapped record
 * is_hard  non-zero: clip with 'H' and print only the aligned part of SEQ/QUAL; zero: clip with 'S'
 * m        the mate's hit for paired reads, or 0 for single-end reads
 *
 * The hit is copied first, so $p_ itself is never modified. An unmapped read
 * with a mapped mate takes over the mate's coordinates and prints '*' as CIGAR.
 */
void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
{
// a hit counts as mapped when its reference interval is non-empty and lies inside the forward-reverse reference
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1;
uint32_t *cigar = 0;
int64_t pos;
bwahit_t ptmp, *p = &ptmp;
if (!p_) { // in this case, generate an unmapped alignment
memset(&ptmp, 0, sizeof(bwahit_t));
ptmp.rb = ptmp.re = -1;
} else ptmp = *p_;
p->flag |= m? 1 : 0; // is paired in sequencing
p->flag |= !is_mapped(p)? 4 : 0; // the read is unmapped
p->flag |= m && !is_mapped(m)? 8 : 0; // the mate is unmapped
if (m && !is_mapped(p) && is_mapped(m)) { // unmapped read with a mapped mate: borrow the mate's coordinates
p->rb = m->rb; p->re = m->re; p->qb = 0; p->qe = s->l_seq;
copy_mate = 1;
}
p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand
p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand
kputs(s->name, str); kputc('\t', str);
if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate
int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
if (p->flag&0x10000) sam_flag |= 0x100; // internal bit 0x10000: report as secondary, but SEQ/QUAL are still printed below
if (!copy_mate) {
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened)
} else n_cigar = 0, cigar = 0;
// for a reverse-strand hit the position is derived from the last base of the interval
pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev);
bns_cnt_ambi(bns, pos, p->re - p->rb, &rid);
kputw(sam_flag, str); kputc('\t', str);
kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); // 1-based position within the reference sequence
kputw(p->qual, str); kputc('\t', str);
if (n_cigar) {
int i, clip5, clip3;
// clipping lengths are swapped for reverse-strand hits
clip5 = is_rev? s->l_seq - p->qe : p->qb;
clip3 = is_rev? p->qb : s->l_seq - p->qe;
if (clip5) { kputw(clip5, str); kputc("SH"[(is_hard!=0)], str); }
for (i = 0; i < n_cigar; ++i) { // each entry: length in the high bits, operation in the low 4 bits
kputw(cigar[i]>>4, str); kputc("MIDSH"[cigar[i]&0xf], str);
}
if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); }
} else kputc('*', str);
} else { // no coordinate
kputw(p->flag, str);
kputs("\t*\t0\t0\t*", str);
rid = -1;
}
if (m && is_mapped(m)) { // then print mate pos and isize
pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev);
bns_cnt_ambi(bns, pos, m->re - m->rb, &mid);
kputc('\t', str);
if (mid == rid) kputc('=', str); // mate on the same reference sequence
else kputs(bns->anns[mid].name, str);
kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str);
kputc('\t', str);
if (mid == rid) {
// project both starts onto the forward strand before taking the signed distance
int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb;
int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb;
kputw(p0 - p1 + (p0 > p1? 1 : -1), str);
} else kputw(0, str);
kputc('\t', str);
} else kputsn("\t*\t0\t0\t", 7, str);
if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL
kputsn("*\t*", 3, str);
} else if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand
int i, qb = 0, qe = s->l_seq;
if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; // hard clipping: print the aligned part only
ks_resize(str, str->l + (qe - qb) + 1);
for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
kputc('\t', str);
if (s->qual) { // printf qual
ks_resize(str, str->l + (qe - qb) + 1);
for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
str->s[str->l] = 0;
} else kputc('*', str);
} else { // the reverse strand
int i, qb = 0, qe = s->l_seq;
if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe;
ks_resize(str, str->l + (qe - qb) + 1);
for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; // reverse complement
kputc('\t', str);
if (s->qual) { // printf qual
ks_resize(str, str->l + (qe - qb) + 1);
for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; // qualities are reversed to match
str->s[str->l] = 0;
} else kputc('*', str);
}
// optional tags; NM stays -1 (and is omitted) unless bwa_gen_cigar() was called and set it
if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); }
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
kputc('\n', str);
free(cigar);
#undef is_mapped
}
/************************
* Integrated interface *
************************/
/*
 * Approximate single-end mapping quality of region $a, in the range [0, 60].
 *
 * The competing score is the larger of a->csub and a->sub (or, when a->sub is
 * zero, min_seed_len * match score). If it reaches the best score the result
 * is 0. Otherwise the quality grows with the relative score gap and with
 * log(seed coverage), is scaled down by the squared identity when identity is
 * below 0.95, and is reduced further according to the number of suboptimal hits.
 */
int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
{
	int mapq = 0, l, sub;
	double identity;
	sub = a->sub? a->sub : opt->min_seed_len * opt->a;
	if (a->csub > sub) sub = a->csub;
	if (sub >= a->score) return 0;
	// the longer of the query span and the reference span
	l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb;
	if (a->score)
		mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499);
	identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l;
	if (identity < 0.95)
		mapq = (int)(mapq * identity * identity + .499);
	if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
	return mapq > 60? 60 : mapq < 0? 0 : mapq;
}
/*
 * Convert alignment region $a into the hit record $h used for SAM output.
 *
 * Coordinates and score are copied. For a secondary region the flag is 0x100
 * and the suboptimal score is -1; for a primary region the flag is 0 and the
 * suboptimal score is the larger of a->sub and a->csub. The mapping quality
 * is left at 0 for the caller to fill in.
 */
void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h)
{
	const int is_secondary = a->secondary >= 0;
	h->rb = a->rb;
	h->re = a->re;
	h->qb = a->qb;
	h->qe = a->qe;
	h->score = a->score;
	if (is_secondary) {
		h->sub = -1;
		h->flag = 0x100; // only the "secondary" bit is set
	} else {
		h->sub = a->sub > a->csub? a->sub : a->csub;
		h->flag = 0;
	}
	h->qual = 0; // quality unset
}
/*
 * Build the SAM text for one read from its alignment regions and store it in s->sam.
 *
 * With no regions a single unmapped record is produced. Otherwise one line is
 * written per region, except that secondary regions are skipped unless
 * MEM_F_ALL is set, and also when they score less than half of their primary.
 * Mapping qualities never exceed that of the first region. With
 * MEM_F_NO_MULTI, additional primary regions carry the internal 0x10000 flag.
 *
 * @param extra_flag  bits OR-ed into the flag of every hit
 * @param m           mate hit forwarded to bwa_hit2sam(), or 0
 */
void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m)
{
	kstring_t str;
	int k, mapq0 = -1;
	str.l = str.m = 0; str.s = 0;
	if (a->n == 0) { // nothing aligned: emit an unmapped record
		bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m);
		s->sam = str.s;
		return;
	}
	for (k = 0; k < a->n; ++k) {
		mem_alnreg_t *reg = &a->a[k];
		bwahit_t hit;
		if (reg->secondary >= 0) {
			if (!(opt->flag&MEM_F_ALL)) continue; // secondary hits not requested
			if (reg->score < a->a[reg->secondary].score * .5) continue; // far worse than its primary
		}
		mem_alnreg2hit(reg, &hit);
		bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &hit.qb, &hit.qe, &hit.rb, &hit.re);
		hit.flag |= extra_flag;
		if ((opt->flag&MEM_F_NO_MULTI) && k && reg->secondary < 0) hit.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard)
		hit.qual = reg->secondary >= 0? 0 : mem_approx_mapq_se(opt, reg);
		if (k == 0) mapq0 = hit.qual;
		else if (hit.qual > mapq0) hit.qual = mapq0; // cap at the first hit's quality
		bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &hit, opt->flag&MEM_F_HARDCLIP, m);
	}
	s->sam = str.s;
}
/*
 * Find the alignment regions of one query: seed and chain, filter the
 * chains, extend every surviving chain, then sort and de-duplicate the
 * resulting regions.
 *
 * @param l_seq  query length
 * @param seq    query; converted in place to numeric codes (values < 4 are kept as they are)
 * @return       sorted, de-duplicated regions; the caller frees the returned array
 */
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
{
	int i;
	mem_chain_v chn;
	mem_alnreg_v regs;
	for (i = 0; i < l_seq; ++i) { // convert to 2-bit encoding if we have not done so
		// Fix: go through an unsigned byte. Where plain char is signed, bytes >= 0x80
		// used to pass the "< 4" test as negative values, were left unconverted and
		// later served as negative array indices when SEQ is printed.
		// NOTE(review): assumes nst_nt4_table has 256 entries -- confirm in bntseq.c
		uint8_t c = (uint8_t)seq[i];
		seq[i] = c < 4? c : nst_nt4_table[c];
	}
	chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq);
	chn.n = mem_chain_flt(opt, chn.n, chn.a);
	if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
	kv_init(regs);
	for (i = 0; i < chn.n; ++i) {
		mem_chain_t *p = &chn.a[i];
		mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
		free(chn.a[i].seeds); // the chain is no longer needed once extended
	}
	free(chn.a);
	regs.n = mem_sort_and_dedup(regs.n, regs.a);
	return regs;
}
// Work description handed to one worker: it handles items start, start+step, start+2*step, ...
typedef struct {
int start, step, n; // first item index, stride (the number of workers), and the total number of sequences
const mem_opt_t *opt; // alignment options shared by all workers
const bwt_t *bwt; // index used for seeding
const bntseq_t *bns; // reference annotations
const uint8_t *pac; // packed reference
const mem_pestat_t *pes; // insert-size statistics; only read by worker2() in paired-end mode
bseq1_t *seqs; // the n input sequences; worker2() stores the SAM text in them
mem_alnreg_v *regs; // per-sequence regions: written by worker1(), consumed and freed by worker2()
} worker_t;
/*
 * First pass of a worker: compute the alignment regions of every sequence in
 * this worker's slice and store them in w->regs.
 *
 * In paired-end mode the unit of work is a read pair, so both ends are aligned
 * in the same thread in case the 2nd read is of worse quality, in which case
 * some threads may be faster/slower.
 */
static void *worker1(void *data)
{
	worker_t *job = (worker_t*)data;
	const int paired = (job->opt->flag&MEM_F_PE) != 0;
	const int n_units = paired? job->n>>1 : job->n; // pairs or single reads
	int u;
	for (u = job->start; u < n_units; u += job->step) {
		if (paired) {
			const int r1 = u<<1|0, r2 = u<<1|1;
			job->regs[r1] = mem_align1(job->opt, job->bwt, job->bns, job->pac, job->seqs[r1].l_seq, job->seqs[r1].seq);
			job->regs[r2] = mem_align1(job->opt, job->bwt, job->bns, job->pac, job->seqs[r2].l_seq, job->seqs[r2].seq);
		} else {
			job->regs[u] = mem_align1(job->opt, job->bwt, job->bns, job->pac, job->seqs[u].l_seq, job->seqs[u].seq);
		}
	}
	return 0;
}
static void *worker2(void *data)
{
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
worker_t *w = (worker_t*)data;
int i;
if (!(w->opt->flag&MEM_F_PE)) {
for (i = w->start; i < w->n; i += w->step) {
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a);
mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
free(w->regs[i].a);
}
} else {
int n = 0;
for (i = w->start; i < w->n>>1; i += w->step) { // not implemented yet
n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]);
free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
}
fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n);
}
return 0;
}
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
{
int i;
worker_t *w;
mem_alnreg_v *regs;
mem_pestat_t pes[4];
w = xcalloc(opt->n_threads, sizeof(worker_t));
regs = xmalloc(n * sizeof(mem_alnreg_v));
for (i = 0; i < opt->n_threads; ++i) {
worker_t *p = &w[i];
p->start = i; p->step = opt->n_threads; p->n = n;
p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac;
p->seqs = seqs; p->regs = regs;
p->pes = &pes[0];
}
#ifdef HAVE_PTHREAD
if (opt->n_threads == 1) {
worker1(w);
if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
worker2(w);
} else {
pthread_t *tid;
tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t));
for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]);
for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]);
for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
free(tid);
}
#else
worker1(w);
if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
worker2(w);
#endif
for (i = 0; i < n; ++i) {
err_fputs(seqs[i].sam, stdout);
free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam);
}
free(regs); free(w);
}

133
bwamem.h 100644
View File

@ -0,0 +1,133 @@
#ifndef BWAMEM_H_
#define BWAMEM_H_
#include "bwt.h"
#include "bntseq.h"
#include "bwa.h"
// Scaling coefficient used by mem_approx_mapq_se() when estimating mapping quality.
#define MEM_MAPQ_COEF 30.0
// Largest mapping quality reported.
#define MEM_MAPQ_MAX 60
// Opaque SMEM iterator; see smem_itr_init().
struct __smem_i;
typedef struct __smem_i smem_i;
// Bits of mem_opt_t::flag
#define MEM_F_HARDCLIP 0x1 // use hard clipping ('H') in SAM output instead of soft clipping
#define MEM_F_PE 0x2 // paired-end mode: input sequences are interleaved pairs
#define MEM_F_NOPAIRING 0x4 // NOTE(review): presumably disables pairing of the two ends -- not used in the code shown here
#define MEM_F_ALL 0x8 // also output secondary alignments
#define MEM_F_NO_MULTI 0x10 // mark additional primary hits of a read as secondary in the SAM flag
// Alignment options.
typedef struct {
int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
int w; // band width
int flag; // see MEM_F_* macros
int min_seed_len; // minimum seed length
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
int split_width; // split into a seed if its occurrence is smaller than this value
int max_occ; // skip a seed if its occurrence is larger than this value
int max_chain_gap; // do not chain a seed if it is max_chain_gap-bp away from the closest seed
int n_threads; // number of threads
int chunk_size; // process chunk_size-bp sequences in a batch
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
int pen_unpaired; // phred-scaled penalty for unpaired reads
int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
} mem_opt_t;
// One alignment region: a candidate alignment of part of the query to part of the reference.
typedef struct {
int64_t rb, re; // [rb,re): reference sequence in the alignment
int qb, qe; // [qb,qe): query sequence in the alignment
int score; // best SW score
int sub; // 2nd best SW score
int csub; // SW score of a tandem hit
int sub_n; // approximate number of suboptimal hits
int seedcov; // length of regions covered by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
} mem_alnreg_t;
// Insert-size statistics for one pair orientation, filled in by mem_pestat().
typedef struct {
int low, high, failed; // [low,high]: insert-size range accepted for proper pairs; failed != 0 if the orientation is not usable
double avg, std; // mean and standard deviation of the insert size, computed after removing outliers
} mem_pestat_t;
// One hit in the form consumed by bwa_hit2sam().
typedef struct {
int64_t rb, re; // [rb,re): reference interval; rb = re = -1 is used for an unmapped record
int qb, qe, flag, qual; // [qb,qe): query interval; SAM flag bits; mapping quality
// optional info
int score, sub; // printed as AS:i and XS:i respectively when non-negative
} bwahit_t;
// Growable array of alignment regions (kvec layout: used size, capacity, data).
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
#ifdef __cplusplus
extern "C" {
#endif
// Create an SMEM iterator over $bwt; release it with smem_itr_destroy().
smem_i *smem_itr_init(const bwt_t *bwt);
// Free an iterator and its internal buffers (the index and the query are not freed).
void smem_itr_destroy(smem_i *itr);
// Bind a query of $len bases to the iterator and restart from offset 0; $query is referenced, not copied.
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
// Return the next batch of matches, or 0 when the query is exhausted. The result is owned by the iterator
// and overwritten by the next call. $split_len and $split_width control re-seeding inside a long, rare match.
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width);
// NOTE(review): presumably returns a newly allocated option set with default values -- definition not shown here.
mem_opt_t *mem_opt_init(void);
// Fill the 5x5 scoring matrix: $a for a match, -$b for a mismatch, 0 whenever an ambiguous base is involved.
void mem_fill_scmat(int a, int b, int8_t mat[25]);
/**
* Align a batch of sequences and generate the alignments in the SAM format
*
* This routine requires $seqs[i].{l_seq,seq,name} and writes $seqs[i].sam.
* Note that $seqs[i].sam may consist of several SAM lines if the
* corresponding sequence has multiple primary hits.
*
* In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
* sequences must be interleaved: $n must be an even number and the 2i-th
* sequence and the (2i+1)-th sequence constitute a read pair. In this
* mode, there should be enough (typically >50) unique pairs for the
* routine to infer the orientation and insert size.
*
* @param opt alignment parameters
* @param bwt FM-index of the reference sequence
* @param bns Information of the reference
* @param pac 2-bit encoded reference
* @param n number of query sequences
* @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
*/
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
/**
* Find the aligned regions for one query sequence
*
* Note that this routine does not generate CIGAR. CIGAR should be
* generated later by bwa_gen_cigar() defined in bwa.c.
*
* @param opt alignment parameters
* @param bwt FM-index of the reference sequence
* @param bns Information of the reference
* @param pac 2-bit encoded reference
* @param l_seq length of query sequence
* @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied
*
* @return list of aligned regions.
*/
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
/**
* Infer the insert size distribution from interleaved alignment regions
*
* This function can be called after mem_align1(), as long as paired-end
* reads are properly interleaved.
*
* @param opt alignment parameters
* @param l_pac length of concatenated reference sequence
* @param n number of query sequences; must be an even number
* @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
* @param pes inferred insert size distribution (output)
*/
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
#ifdef __cplusplus
}
#endif
#endif

314
bwamem_pair.c 100644
View File

@ -0,0 +1,314 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "kstring.h"
#include "bwamem.h"
#include "kvec.h"
#include "utils.h"
#include "ksw.h"
// Tuning constants for the insert-size estimation in mem_pestat().
#define MIN_RATIO 0.8 // a pair is used only if, for each end, the competing score is at most MIN_RATIO times the best score
#define MIN_DIR_CNT 10 // minimum number of pairs needed to analyze an orientation
#define MIN_DIR_RATIO 0.05 // an orientation with fewer pairs than this fraction of the most frequent one is skipped
#define OUTLIER_BOUND 2.0 // multiple of the interquartile range beyond which an insert size is excluded from mean/std.dev
#define MAPPING_BOUND 3.0 // multiple of the interquartile range defining the proper-pair boundaries
#define MAX_STDDEV 4.0 // the proper-pair boundaries extend at least this many standard deviations from the mean
/*
 * Infer the relative orientation of two ends from their reference starts.
 *
 * Coordinates >= l_pac lie on the reverse strand. $b2 is first projected onto
 * the strand of $b1; *dist receives the absolute distance between $b1 and
 * that projection.
 *
 * @return a 2-bit code: bit 0 is set if the ends are on different strands;
 *         both bits are additionally flipped if the projected second end does
 *         not lie after the first
 */
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
{
	const int same_strand = (b1 >= l_pac) == (b2 >= l_pac);
	const int64_t proj2 = same_strand? b2 : (l_pac<<1) - 1 - b2; // read 2 on the read 1 strand
	int code = same_strand? 0 : 1;
	if (proj2 > b1) {
		*dist = proj2 - b1;
	} else {
		*dist = b1 - proj2;
		code ^= 3;
	}
	return code;
}
/*
 * Score of the best competitor of the top region r->a[0]: the score of the
 * first later region whose query interval significantly overlaps it (overlap
 * of at least mask_level times the shorter interval). If there is none,
 * min_seed_len * match score is returned.
 */
static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
{
	int j;
	for (j = 1; j < r->n; ++j) { // choose unique alignment
		const mem_alnreg_t *top = &r->a[0], *cur = &r->a[j];
		int ov_beg = cur->qb > top->qb? cur->qb : top->qb;
		int ov_end = cur->qe < top->qe? cur->qe : top->qe;
		int len_cur, len_top, shorter;
		if (ov_end <= ov_beg) continue; // no overlap
		len_cur = cur->qe - cur->qb;
		len_top = top->qe - top->qb;
		shorter = len_cur < len_top? len_cur : len_top;
		if (ov_end - ov_beg >= shorter * opt->mask_level) return cur->score; // significant overlap
	}
	return opt->min_seed_len * opt->a;
}
/*
 * Infer the insert size distribution from interleaved alignment regions.
 *
 * For every pair whose two ends both have a sufficiently unique best region,
 * the orientation and distance of the best regions are recorded. For each of
 * the four orientations with at least MIN_DIR_CNT pairs, quartiles are used to
 * discard outliers, mean and standard deviation are computed from the rest,
 * and the [low, high] range for proper pairs is derived. Orientations with too
 * few pairs, in absolute terms or relative to the most frequent orientation,
 * are marked as failed.
 *
 * Fixes over the previous version: the upper boundary is now widened to
 * avg + MAX_STDDEV*std (the test and the assignment disagreed, so it never
 * fired); the distance array of a skipped orientation is freed; the scratch
 * vectors are cleared with their own size; counts are printed through an
 * explicit cast matching the format.
 *
 * @param opt   alignment parameters (mask_level, min_seed_len, a, max_ins are read)
 * @param l_pac length of the concatenated reference sequence
 * @param n     number of query sequences; must be an even number
 * @param regs  region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
 * @param pes   inferred insert size distribution (output)
 */
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
{
	int i, d, max;
	uint64_v isize[4];
	memset(pes, 0, 4 * sizeof(mem_pestat_t));
	memset(isize, 0, sizeof(isize));
	for (i = 0; i < n>>1; ++i) {
		int dir;
		int64_t is;
		mem_alnreg_v *r[2];
		r[0] = (mem_alnreg_v*)&regs[i<<1|0];
		r[1] = (mem_alnreg_v*)&regs[i<<1|1];
		if (r[0]->n == 0 || r[1]->n == 0) continue; // both ends must have a hit
		if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; // end 1 not unique enough
		if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; // end 2 not unique enough
		dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
		if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
	}
	if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, (long)isize[0].n, (long)isize[1].n, (long)isize[2].n, (long)isize[3].n);
	for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
		mem_pestat_t *r = &pes[d];
		uint64_v *q = &isize[d];
		int p25, p50, p75, x;
		if (q->n < MIN_DIR_CNT) {
			fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
			r->failed = 1;
			free(q->a); // only q->n is consulted after this loop
			continue;
		} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
		ks_introsort_64(q->n, q->a);
		p25 = q->a[(int)(.25 * q->n + .499)];
		p50 = q->a[(int)(.50 * q->n + .499)];
		p75 = q->a[(int)(.75 * q->n + .499)];
		// first boundaries: used only to exclude outliers from the mean and std.dev
		r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
		if (r->low < 1) r->low = 1;
		r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
		fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
		fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
		for (i = x = 0, r->avg = 0; i < q->n; ++i)
			if (q->a[i] >= r->low && q->a[i] <= r->high)
				r->avg += q->a[i], ++x;
		r->avg /= x;
		for (i = 0, r->std = 0; i < q->n; ++i)
			if (q->a[i] >= r->low && q->a[i] <= r->high)
				r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
		r->std = sqrt(r->std / x);
		fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
		// final boundaries: the wider of the quartile-based range and avg +/- MAX_STDDEV * std
		r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
		r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
		if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499);
		if (r->high < r->avg + MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
		if (r->low < 1) r->low = 1;
		fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
		free(q->a);
	}
	for (d = 0, max = 0; d < 4; ++d)
		max = max > isize[d].n? max : isize[d].n;
	for (d = 0; d < 4; ++d) // drop orientations that are rare compared with the dominant one
		if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
			pes[d].failed = 1;
			fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
		}
}
/*
 * Mate rescue by Smith-Waterman.
 *
 * Given one alignment region $a of a read, try to align the mate sequence
 * $ms (length $l_ms) inside the reference window implied by the insert-size
 * statistics $pes, once for every orientation that has not failed and is not
 * already represented by an existing mate hit in $ma.  Every accepted hit is
 * inserted into $ma so that $ma stays ordered by decreasing score.
 *
 * Returns the number of windows on which ksw_align() was actually run.
 */
int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
{
	int i, r, n_sw = 0;
	int skip[4];
	const int64_t l_both = l_pac << 1; // length of forward + reverse reference

	// an orientation whose statistics failed is never tried
	for (r = 0; r < 4; ++r)
		skip[r] = !!pes[r].failed;
	// neither is an orientation for which the mate already has a consistent hit
	for (i = 0; i < ma->n; ++i) {
		int64_t dist;
		int dir = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
		if (dist >= pes[dir].low && dist <= pes[dir].high)
			skip[dir] = 1;
	}
	if (skip[0] && skip[1] && skip[2] && skip[3]) return 0; // nothing left to try; no need to perform SW

	for (r = 0; r < 4; ++r) {
		int is_rev, is_larger;
		uint8_t *seq, *rev = 0, *ref;
		int64_t rb, re, len;

		if (skip[r]) continue;
		is_rev = ((r >> 1) ^ (r & 1));  // whether to reverse complement the mate
		is_larger = (r >> 1) == 0;      // whether the mate has the larger coordinate

		if (is_rev) { // build the reverse complement of $ms
			int j;
			rev = xmalloc(l_ms);
			for (i = 0, j = l_ms - 1; i < l_ms; ++i, --j)
				rev[j] = ms[i] < 4? 3 - ms[i] : 4;
			seq = rev;
		} else {
			seq = (uint8_t*)ms;
		}

		// window of plausible mate positions relative to a->rb
		if (is_larger) {
			rb = a->rb + pes[r].low;
			re = a->rb + pes[r].high;
		} else {
			rb = a->rb - pes[r].high;
			re = a->rb - pes[r].low;
		}
		// widen the window by the mate length: at the end when on the same
		// strand, at the start when on opposite strands
		if (is_rev) rb -= l_ms;
		else re += l_ms;
		// clip the window to [0, 2*l_pac]
		if (rb < 0) rb = 0;
		if (re > l_both) re = l_both;

		ref = bns_get_seq(l_pac, pac, rb, re, &len);
		if (len == re - rb) { // the whole window was retrieved
			kswr_t aln;
			mem_alnreg_t b;
			int xtra = KSW_XSUBO | KSW_XSTART | opt->min_seed_len;
			if (l_ms * opt->a < 250) xtra |= KSW_XBYTE;
			aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0);
			memset(&b, 0, sizeof(mem_alnreg_t));
			if (aln.score >= opt->min_seed_len) {
				int pos;
				// convert window-relative, possibly reverse-complemented
				// coordinates back to read and reference coordinates
				if (is_rev) {
					b.qb = l_ms - (aln.qe + 1);
					b.qe = l_ms - aln.qb;
					b.rb = l_both - (rb + aln.te + 1);
					b.re = l_both - (rb + aln.tb);
				} else {
					b.qb = aln.qb;
					b.qe = aln.qe + 1;
					b.rb = rb + aln.tb;
					b.re = rb + aln.te + 1;
				}
				b.score = aln.score;
				b.csub = aln.score2;
				b.secondary = -1;
				b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1;
				kv_push(mem_alnreg_t, *ma, b); // make room for a new element
				// locate the first existing hit with a lower score ...
				for (pos = 0; pos < ma->n - 1; ++pos)
					if (ma->a[pos].score < b.score) break;
				// ... shift the tail one slot to the right and drop $b in place
				for (i = ma->n - 1; i > pos; --i)
					ma->a[i] = ma->a[i-1];
				ma->a[i] = b;
			}
			++n_sw;
		}
		free(rev);
		free(ref);
	}
	return n_sw;
}
/*
 * Find the best-scoring pair of hits among the hits a[0] and a[1] of the two reads.
 *
 * All hits of both reads are collected into $v, keyed by their coordinate on the
 * forward strand, and sorted. For each hit, earlier hits of the other read are
 * scanned and every combination whose distance falls within
 * [pes[dir].low, pes[dir].high] is scored and pushed to $u.
 *
 * On success z[0]/z[1] receive the indices (into a[0]/a[1]) of the hits forming the
 * best pair, *sub the score of the second best pair (0 if there is none) and *n_sub
 * the number of non-best pairs scoring within max(a+b, q+r) of *sub.
 * Returns the score of the best pair, or 0 (with *sub = *n_sub = 0; z[] untouched)
 * when no pair was found. $opt is only read for a, b, q and r; $pac and $s are not
 * used in this function.
 */
int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2])
{
extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h);
pair64_v v, u;
int r, i, k, y[4], ret; // y[] keeps the last hit
kv_init(v); kv_init(u);
for (r = 0; r < 2; ++r) { // loop through read number
for (i = 0; i < a[r].n; ++i) {
pair64_t key;
mem_alnreg_t *e = &a[r].a[i];
key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
// key.y layout: score in the high 32 bits; in the low 32 bits the hit index i (bits 2 and up),
// a strand bit (bit 1, set when rb >= l_pac) and the read number (bit 0)
key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r;
kv_push(pair64_t, v, key);
}
}
// NOTE(review): the backward scan below breaks as soon as dist > high, so this sort
// presumably orders v by ascending .x -- confirm against the ks_introsort_128 definition
ks_introsort_128(v.n, v.a);
// y[strand<<1|read] is the index in v of the most recent hit with that strand/read combination; -1 if none yet
y[0] = y[1] = y[2] = y[3] = -1;
//for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x);
for (i = 0; i < v.n; ++i) {
for (r = 0; r < 2; ++r) { // loop through direction
// here r is the strand bit required of the earlier hit of the other read;
// dir = (strand of earlier hit)<<1 | (strand of current hit) selects the entry of pes[]
int dir = r<<1 | (v.a[i].y>>1&1), which;
if (pes[dir].failed) continue; // invalid orientation
// strand<<1|read code that the earlier hit must carry: strand r, the other read
which = r<<1 | ((v.a[i].y&1)^1);
if (y[which] < 0) continue; // no previous hits
for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt)
int64_t dist;
int q;
double ns;
pair64_t *p;
if ((v.a[k].y&3) != which) continue;
dist = (int64_t)v.a[i].x - v.a[k].x;
//printf("%d: %lld\n", k, dist);
if (dist > pes[dir].high) break;
if (dist < pes[dir].low) continue;
// ns: deviation of the distance from the mean, in units of the standard deviation
ns = (dist - pes[dir].avg) / pes[dir].std;
// pair score: sum of the two hit scores plus a log term in erfc(|ns|/sqrt(2)) that is never positive
q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4)
if (q < 0) q = 0;
p = kv_pushp(pair64_t, u);
// p->y remembers which two entries of v form the pair
p->y = (uint64_t)k<<32 | i;
// p->x: pair score in the high 32 bits, low 32 bits filled with a hash of p->y and id
// (presumably a pseudo-random tie-breaker between equally scored pairs -- confirm)
p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU);
//printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist);
}
}
y[v.a[i].y&3] = i;
}
if (u.n) { // found at least one proper pair
int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
ks_introsort_128(u.n, u.a);
// the last element of the sorted u is taken as the best pair; unpack the two indices into v
i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
// low 32 bits of .y shifted right by 2 recover the hit index within a[read]; bit 0 is the read number
z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
z[v.a[k].y&1] = v.a[k].y<<32>>34;
ret = u.a[u.n-1].x >> 32;
*sub = u.n > 1? u.a[u.n-2].x>>32 : 0;
// count the non-best pairs whose score is no more than tmp below *sub
for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i)
if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub;
} else ret = 0, *sub = 0, *n_sub = 0;
free(u.a); free(v.a);
return ret;
}
/*
 * Finalize the alignment of one read pair and generate its SAM text.
 *
 * Steps: (1) mate rescue with mem_matesw() for the near-best hits of each end;
 * (2) re-mark primary hits; (3) unless pairing is disabled, call mem_pair() and,
 * if a pair is found and neither end has multiple primary hits, write both
 * records to s[0].sam and s[1].sam with paired mapping qualities; (4) otherwise
 * fall back to mem_sam_se() for each end (label no_pairing).
 *
 * Returns the sum of the mem_matesw() return values.
 */
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
{
extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a);
extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m);
extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h);
extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m);
int n = 0, i, j, z[2], o, subo, n_sub;
kstring_t str;
mem_alnreg_v b[2];
bwahit_t h[2];
str.l = str.m = 0; str.s = 0;
// perform SW for the best alignment
kv_init(b[0]); kv_init(b[1]);
// b[i] holds copies of the hits of read i scoring within pen_unpaired of a[i].a[0]
// (assumes a[i].a[0] is the highest-scoring hit -- TODO confirm with the caller).
// Copies are needed because mem_matesw() below inserts into a[] while we iterate.
for (i = 0; i < 2; ++i)
for (j = 0; j < a[i].n; ++j)
if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
kv_push(mem_alnreg_t, b[i], a[i].a[j]);
// for at most max_matesw of those hits, try to rescue the mate (read !i); new hits go to a[!i]
for (i = 0; i < 2; ++i)
for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a);
mem_mark_primary_se(opt, a[0].n, a[0].a);
mem_mark_primary_se(opt, a[1].n, a[1].a);
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
// pairing single-end hits
// o: score of the best pair; subo/n_sub: second best pair score and count; z[]: hit indices of the best pair
if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) {
int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2];
// check if an end has multiple hits even after mate-SW
for (i = 0; i < 2; ++i) {
// look for a hit other than a[i].a[0] that is not marked secondary
for (j = 1; j < a[i].n; ++j)
if (a[i].a[j].secondary < 0) break;
is_multi[i] = j < a[i].n? 1 : 0;
}
if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
// compute mapQ for the best SE hit
// score_un: score of taking the top hit of each end without pairing, after the unpaired penalty
score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
subo = subo > score_un? subo : score_un;
// paired mapQ: 6 per unit of score gap to the best alternative, reduced when there are
// several suboptimal pairs, then clamped to [0, 60]
q_pe = (o - subo) * 6;
if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
if (q_pe < 0) q_pe = 0;
if (q_pe > 60) q_pe = 60;
// the following assumes no split hits
if (o > score_un) { // paired alignment is preferred
mem_alnreg_t *c[2];
c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]];
for (i = 0; i < 2; ++i) {
// a chosen hit that was marked secondary takes the score of the hit it points to as its
// sub score; its secondary field is then set to -2
if (c[i]->secondary >= 0)
c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
q_se[i] = mem_approx_mapq_se(opt, c[i]);
}
// raise each single-end mapQ towards q_pe, but by no more than 40
q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40;
q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
// bit 0x2 is set only on this branch (in SAM FLAG terms: properly paired)
extra_flag |= 2;
// cap at the tandem repeat score
q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6;
q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? q_se[1] : (c[1]->score - c[1]->csub) * 6;
} else { // the unpaired alignment is preferred
z[0] = z[1] = 0;
q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
}
// convert the chosen hits; 0x40 / 0x80 tag the first / second read of the pair
mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag;
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re);
mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag;
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re);
// str is reused for both records: s[0].sam gets a duplicate, s[1].sam takes over the str buffer itself
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = xstrdup(str.s); str.l = 0;
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s;
} else goto no_pairing;
return n;
no_pairing:
// fallback: each end is reported on its own; h[] only carries the top hit of each end so
// that it can be passed to mem_sam_se() as the mate of the other end
for (i = 0; i < 2; ++i) {
if (a[i].n) {
mem_alnreg2hit(&a[i].a[0], &h[i]);
bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
} else h[i].rb = h[i].re = -1; // no hit for this end: only rb/re are set, the other fields of h[i] stay uninitialized
}
mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]);
mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]);
return n;
}

105
bwape.c
View File

@ -10,6 +10,7 @@
#include "utils.h"
#include "stdaln.h"
#include "bwase.h"
#include "bwa.h"
typedef struct {
int n;
@ -21,24 +22,15 @@ typedef struct {
bwtint_t low, high, high_bayesian;
} isize_info_t;
typedef struct {
uint64_t x, y;
} b128_t;
#define b128_lt(a, b) ((a).x < (b).x)
#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
#define b128_hash(a) ((uint32_t)(a).x)
#include "khash.h"
KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq)
#include "ksort.h"
KSORT_INIT(b128, b128_t, b128_lt)
KSORT_INIT_GENERIC(uint64_t)
KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
typedef struct {
kvec_t(b128_t) arr;
kvec_t(b128_t) pos[2];
pair64_v arr;
pair64_v pos[2];
kvec_t(bwt_aln1_t) aln[2];
} pe_data_t;
@ -69,19 +61,6 @@ pe_opt_t *bwa_init_pe_opt()
po->ap_prior = 1e-5;
return po;
}
/* Mix the bits of a 64-bit key with a fixed sequence of shift/add/xor rounds. */
static inline uint64_t hash_64(uint64_t key)
{
	uint64_t h = key;
	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
	return h;
}
/*
static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
{
@ -120,7 +99,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double
free(isizes);
return -1;
}
ks_introsort(uint64_t, tot, isizes);
ks_introsort_64(tot, isizes);
p25 = isizes[(int)(tot*0.25 + 0.5)];
p50 = isizes[(int)(tot*0.50 + 0.5)];
p75 = isizes[(int)(tot*0.75 + 0.5)];
@ -170,7 +149,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
{
int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
uint64_t o_score, subo_score;
b128_t last_pos[2][2], o_pos[2];
pair64_t last_pos[2][2], o_pos[2];
max_len = p[0]->full_len;
if (max_len < p[1]->full_len) max_len = p[1]->full_len;
if (low_bound < max_len) low_bound = max_len;
@ -206,11 +185,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
o_score = subo_score = (uint64_t)-1;
o_n = subo_n = 0;
ks_introsort(b128, d->arr.n, d->arr.a);
ks_introsort_128(d->arr.n, d->arr.a);
for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
if (opt->type == BWA_PET_STD) {
for (i = 0; i < d->arr.n; ++i) {
b128_t x = d->arr.a[i];
pair64_t x = d->arr.a[i];
int strand = x.y>>1&1;
if (strand == 1) { // reverse strand, then check
int y = 1 - (x.y&1);
@ -221,19 +200,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
last_pos[x.y&1][1] = x;
}
}
} else if (opt->type == BWA_PET_SOLID) {
for (i = 0; i < d->arr.n; ++i) {
b128_t x = d->arr.a[i];
int strand = x.y>>1&1;
if ((strand^x.y)&1) { // push
int y = 1 - (x.y&1);
__pairing_aux(last_pos[y][1], x);
__pairing_aux(last_pos[y][0], x);
} else { // check
last_pos[x.y&1][0] = last_pos[x.y&1][1];
last_pos[x.y&1][1] = x;
}
}
} else {
fprintf(stderr, "[paring] not implemented yet!\n");
exit(1);
@ -345,7 +311,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
{ // only when both ends mapped
b128_t x;
pair64_t x;
int j, k;
long long n_occ[2];
for (j = 0; j < 2; ++j) {
@ -360,7 +326,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
bwt_aln1_t *r = d->aln[j].a + k;
bwtint_t l;
if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
b128_t key;
pair64_t key;
int ret;
key.x = r->k; key.y = r->l;
khint_t iter = kh_put(b128, g_hash, key, &ret);
@ -377,14 +343,14 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
x.x = kh_val(g_hash, iter).a[l]>>1;
x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
kv_push(b128_t, d->arr, x);
kv_push(pair64_t, d->arr, x);
}
} else { // then calculate on the fly
for (l = r->k; l <= r->l; ++l) {
int strand;
x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
x.y = k<<2 | strand<<1 | j;
kv_push(b128_t, d->arr, x);
kv_push(pair64_t, d->arr, x);
}
}
}
@ -576,11 +542,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
++n_tot[is_singleton];
cigar[0] = cigar[1] = 0;
n_cigar[0] = n_cigar[1] = 0;
if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
ubyte_t *seq;
if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
if (popt->type == BWA_PET_STD) {
{ // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads
if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
seq = p[k]->rseq;
@ -589,17 +555,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
seq = p[k]->seq;
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
}
} else { // BWA_PET_SOLID
if (p[1-k]->strand == 0) { // R3-F3 pairing
if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
seq = p[k]->rseq;
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
} else { // F3-R3 pairing
if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
seq = p[k]->seq;
}
}
// perform SW alignment
cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
@ -656,14 +611,14 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
return pacseq;
}
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
{
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
int i, j, n_seqs, tot_seqs = 0;
bwa_seq_t *seqs[2];
bwa_seqio_t *ks[2];
clock_t t;
bntseq_t *bns, *ntbns = 0;
bntseq_t *bns;
FILE *fp_sa[2];
gap_opt_t opt, opt0;
khint_t iter;
@ -688,10 +643,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
opt0 = opt;
err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
if (!(opt.mode & BWA_MODE_COMPREAD)) {
popt->type = BWA_PET_SOLID;
ntbns = bwa_open_nt(prefix);
} else { // for Illumina alignment only
{ // for Illumina alignment only
if (popt->is_preload) {
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
@ -702,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
}
// core loop
bwa_print_sam_SQ(bns);
bwa_print_sam_hdr(bns, rg_line);
bwa_print_sam_PG();
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
int cnt_chg;
@ -724,7 +676,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
for (j = 0; j < 2; ++j)
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
if (pac == 0) free(pacseq);
@ -749,7 +701,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
// destroy
bns_destroy(bns);
if (ntbns) bns_destroy(ntbns);
for (i = 0; i < 2; ++i) {
bwa_seq_close(ks[i]);
err_fclose(fp_sa[i]);
@ -764,21 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
int bwa_sai2sam_pe(int argc, char *argv[])
{
extern char *bwa_rg_line, *bwa_rg_id;
extern int bwa_set_rg(const char *s);
extern char *bwa_infer_prefix(const char *hint);
int c;
pe_opt_t *popt;
char *prefix;
char *prefix, *rg_line = 0;
popt = bwa_init_pe_opt();
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
switch (c) {
case 'r':
if (bwa_set_rg(optarg) < 0) {
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
return 1;
}
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
break;
case 'a': popt->max_isize = atoi(optarg); break;
case 'o': popt->max_occ = atoi(optarg); break;
@ -812,13 +757,11 @@ int bwa_sai2sam_pe(int argc, char *argv[])
fprintf(stderr, "\n");
return 1;
}
if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
free(bwa_rg_line); free(bwa_rg_id);
return 0;
}
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt);
free(bwa_rg_line); free(bwa_rg_id); free(prefix);
free(popt);
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
free(prefix); free(popt);
return 0;
}

131
bwase.c
View File

@ -10,9 +10,9 @@
#include "bntseq.h"
#include "utils.h"
#include "kstring.h"
#include "bwa.h"
int g_log_n[256];
char *bwa_rg_line, *bwa_rg_id;
void bwa_print_sam_PG();
@ -71,8 +71,8 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
}
rest -= q->l - q->k + 1;
} else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here.
int j, i, k;
for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
int j, i;
for (j = rest, i = q->l - q->k + 1; j > 0; --j) {
double p = 1.0, x = drand48();
while (x < p) p -= p * j / (i--);
s->multi[z].pos = q->l - i;
@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s)
s->len = s->full_len;
}
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
{
ubyte_t *pacseq, *ntpac = 0;
ubyte_t *pacseq;
int i, j;
kstring_t *str;
if (ntbns) { // in color space
ntpac = (ubyte_t*)xcalloc(ntbns->l_pac/4+1, 1);
err_rewind(ntbns->fp_pac);
err_fread_noeof(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
}
if (!_pacseq) {
pacseq = (ubyte_t*)xcalloc(bns->l_pac/4+1, 1);
err_rewind(bns->fp_pac);
@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
}
#if 0
if (ntbns) { // in color space
for (i = 0; i < n_seqs; ++i) {
bwa_seq_t *s = seqs + i;
bwa_cs2nt_core(s, bns->l_pac, ntpac);
for (j = 0; j < s->n_multi; ++j) {
bwt_multi1_t *q = s->multi + j;
int n_cigar;
if (q->gap == 0) continue;
free(q->cigar);
q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
(q->strand? 1 : -1) * q->gap, &n_cigar, 0);
q->n_cigar = n_cigar;
}
if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
free(s->cigar);
s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
}
}
}
#endif
// generate MD tag
str = (kstring_t*)xcalloc(1, sizeof(kstring_t));
for (i = 0; i != n_seqs; ++i) {
@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
if (s->type != BWA_TYPE_NO_MATCH) {
int nm;
s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
bns->l_pac, pacseq, str, &nm);
s->nm = nm;
}
}
free(str->s); free(str);
// correct for trimmed reads
if (!ntbns) // trimming is only enabled for Illumina reads
for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
if (!_pacseq) free(pacseq);
free(ntpac);
}
int64_t pos_end(const bwa_seq_t *p)
@ -462,11 +432,11 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
// print mate coordinate
if (mate && mate->type != BWA_TYPE_NO_MATCH) {
int m_seqid, m_is_N;
int m_seqid;
long long isize;
am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
// redundant calculation here, but should not matter too much
m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
@ -482,7 +452,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
err_printf("%s", p->qual);
} else err_printf("*");
if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
if (p->type != BWA_TYPE_NO_MATCH) {
@ -532,74 +502,20 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
err_printf("%s", p->qual);
} else err_printf("*");
if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
err_putchar('\n');
}
}
/*
 * Restore the sequence annotation set stored under "<prefix>.nt".
 * Returns whatever bns_restore() returns for that name.
 */
bntseq_t *bwa_open_nt(const char *prefix)
{
	bntseq_t *nt;
	char *name = (char*)xcalloc(strlen(prefix) + 10, 1); // room for the suffix and the terminator
	strcpy(name, prefix);
	strcat(name, ".nt");
	nt = bns_restore(name);
	free(name);
	return nt;
}
/*
 * Print one @SQ header line (name and length) per sequence in $bns,
 * followed by the global read-group line when one has been set.
 */
void bwa_print_sam_SQ(const bntseq_t *bns)
{
	int k = 0;
	while (k < bns->n_seqs) {
		err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[k].name, bns->anns[k].len);
		++k;
	}
	if (bwa_rg_line != 0)
		err_printf("%s\n", bwa_rg_line);
}
/*
 * Fill the global table g_log_n[1..255] with 4.343*ln(n) rounded to the
 * nearest integer. Entry 0 is not written here.
 */
void bwase_initialize()
{
	int n = 1;
	while (n != 256) {
		g_log_n[n] = (int)(4.343 * log(n) + 0.5);
		++n;
	}
}
/*
 * Decode backslash escape sequences of a NUL-terminated string in place.
 *
 * Recognized sequences are \t, \n, \r and \\ ; each is replaced by the single
 * character it denotes. A backslash followed by any other character is dropped
 * together with that character. A lone backslash at the very end of the string
 * is dropped and decoding stops there.
 *
 * The result is never longer than the input, so no extra space is needed.
 * Returns $s.
 */
char *bwa_escape(char *s)
{
	char *p, *q; // p: read cursor; q: write cursor (q <= p always)
	for (p = q = s; *p; ++p) {
		if (*p != '\\') {
			*q++ = *p;
			continue;
		}
		++p; // look at the character following the backslash
		// Trailing backslash: stop here. Without this check the loop's ++p
		// would step over the terminator and keep reading and writing
		// beyond the end of the string.
		if (*p == '\0') break;
		if (*p == 't') *q++ = '\t';
		else if (*p == 'n') *q++ = '\n';
		else if (*p == 'r') *q++ = '\r';
		else if (*p == '\\') *q++ = '\\';
	}
	*q = '\0';
	return s;
}
/*
 * Install $s as the global read-group header line.
 *
 * $s must begin with "@RG". A copy with escape sequences decoded by
 * bwa_escape() is stored in bwa_rg_line, and the value of its "\tID:" field
 * (up to the next tab, newline or end of string) is copied to bwa_rg_id.
 * Previously stored values are freed first.
 *
 * Returns 0 on success; -1 if $s does not start with "@RG" (globals left
 * untouched) or has no "\tID:" field (bwa_rg_line is set, bwa_rg_id is NULL).
 */
int bwa_set_rg(const char *s)
{
	char *tag, *id_beg, *id_end;
	if (strstr(s, "@RG") != s) return -1;
	// discard whatever was set by an earlier call
	free(bwa_rg_line);
	free(bwa_rg_id);
	bwa_rg_id = 0;
	bwa_rg_line = xstrdup(s);
	bwa_escape(bwa_rg_line);
	tag = strstr(bwa_rg_line, "\tID:");
	if (tag == 0) return -1;
	id_beg = tag + 4; // skip over "\tID:"
	id_end = id_beg;
	while (*id_end && *id_end != '\t' && *id_end != '\n') ++id_end;
	bwa_rg_id = xcalloc(id_end - id_beg + 1, 1); // zero-filled, hence NUL-terminated
	memcpy(bwa_rg_id, id_beg, id_end - id_beg);
	return 0;
}
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
{
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
int i, n_seqs, tot_seqs = 0, m_aln;
@ -607,7 +523,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
bwa_seq_t *seqs;
bwa_seqio_t *ks;
clock_t t;
bntseq_t *bns, *ntbns = 0;
bntseq_t *bns;
FILE *fp_sa;
gap_opt_t opt;
@ -619,9 +535,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
m_aln = 0;
err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa);
if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
ntbns = bwa_open_nt(prefix);
bwa_print_sam_SQ(bns);
bwa_print_sam_hdr(bns, rg_line);
//bwa_print_sam_PG();
// set ks
ks = bwa_open_reads(opt.mode, fn_fa);
@ -648,7 +562,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
bwa_refine_gapped(bns, n_seqs, seqs, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
fprintf(stderr, "[bwa_aln_core] print alignments... ");
@ -662,7 +576,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
// destroy
bwa_seq_close(ks);
if (ntbns) bns_destroy(ntbns);
bns_destroy(bns);
err_fclose(fp_sa);
free(aln);
@ -670,17 +583,13 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
int bwa_sai2sam_se(int argc, char *argv[])
{
extern char *bwa_infer_prefix(const char *hint);
int c, n_occ = 3;
char *prefix;
char *prefix, *rg_line = 0;
while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
switch (c) {
case 'h': break;
case 'r':
if (bwa_set_rg(optarg) < 0) {
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
return 1;
}
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
break;
case 'n': n_occ = atoi(optarg); break;
case 'f': xreopen(optarg, "w", stdout); break;
@ -692,12 +601,10 @@ int bwa_sai2sam_se(int argc, char *argv[])
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
return 1;
}
if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
free(bwa_rg_line); free(bwa_rg_id);
return 0;
}
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ);
free(bwa_rg_line); free(bwa_rg_id);
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
return 0;
}

View File

@ -14,7 +14,7 @@ extern "C" {
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
// Refine the approximate position of the sequence to an actual placement for the sequence.
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
// Backfill certain alignment properties mainly centering around number of matches.
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
// Calculate the end position of a read given a certain sequence.

View File

@ -5,7 +5,7 @@
#include "bamlite.h"
#include "kseq.h"
KSEQ_INIT(gzFile, err_gzread)
KSEQ_DECLARE(gzFile)
extern unsigned char nst_nt4_table[256];
static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };

180
bwt.c
View File

@ -45,6 +45,14 @@ void bwt_gen_cnt_table(bwt_t *bwt)
}
}
/*
 * Compute the inverse compressed suffix array (inverse Psi) at $k.
 * Returns 0 when $k equals bwt->primary; otherwise L2[c] + Occ(k, c), where c
 * is the symbol read with bwt_B0() at $k, shifted down by one when $k lies
 * beyond the primary index.
 */
static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k)
{
	bwtint_t idx, c, pos;
	idx = k - (k > bwt->primary); // skip over the primary index
	c = bwt_B0(bwt, idx);
	pos = bwt->L2[c] + bwt_occ(bwt, k, c);
	if (k == bwt->primary) return 0;
	return pos;
}
// bwt->bwt and bwt->occ must be precalculated
void bwt_cal_sa(bwt_t *bwt, int intv)
{
@ -93,21 +101,20 @@ static inline int __occ_aux(uint64_t y, int c)
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
{
bwtint_t n, l, j;
uint32_t *p;
bwtint_t n;
uint32_t *p, *end;
if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
if (k == (bwtint_t)(-1)) return 0;
if (k >= bwt->primary) --k; // because $ is not in bwt
k -= (k >= bwt->primary); // because $ is not in bwt
// retrieve Occ at k/OCC_INTERVAL
n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
p += sizeof(bwtint_t); // jump to the start of the first BWT cell
// calculate Occ up to the last k/32
j = k >> 5 << 5;
for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2)
n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1);
for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
// calculate Occ
n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
@ -156,20 +163,20 @@ void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok,
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
{
bwtint_t l, j, x;
uint32_t *p;
bwtint_t x;
uint32_t *p, tmp, *end;
if (k == (bwtint_t)(-1)) {
memset(cnt, 0, 4 * sizeof(bwtint_t));
return;
}
if (k >= bwt->primary) --k; // because $ is not in bwt
k -= (k >= bwt->primary); // because $ is not in bwt
p = bwt_occ_intv(bwt, k);
memcpy(cnt, p, 4 * sizeof(bwtint_t));
p += sizeof(bwtint_t);
j = k >> 4 << 4;
for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
x += __occ_aux4(bwt, *p);
x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop
for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p);
tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
x += __occ_aux4(bwt, tmp) - (~k&15);
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
}
@ -177,29 +184,30 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
{
bwtint_t _k, _l;
_k = (k >= bwt->primary)? k-1 : k;
_l = (l >= bwt->primary)? l-1 : l;
if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
_k = k - (k >= bwt->primary);
_l = l - (l >= bwt->primary);
if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
bwt_occ4(bwt, k, cntk);
bwt_occ4(bwt, l, cntl);
} else {
bwtint_t i, j, x, y;
uint32_t *p;
if (k >= bwt->primary) --k; // because $ is not in bwt
if (l >= bwt->primary) --l;
bwtint_t x, y;
uint32_t *p, tmp, *endk, *endl;
k -= (k >= bwt->primary); // because $ is not in bwt
l -= (l >= bwt->primary);
p = bwt_occ_intv(bwt, k);
memcpy(cntk, p, 4 * sizeof(bwtint_t));
p += sizeof(bwtint_t);
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
// prepare cntk[]
j = k >> 4 << 4;
for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
x += __occ_aux4(bwt, *p);
endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4));
endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4));
for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p);
y = x;
x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
x += __occ_aux4(bwt, tmp) - (~k&15);
// calculate cntl[] and finalize cntk[]
j = l >> 4 << 4;
for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
for (; p < endl; ++p) y += __occ_aux4(bwt, *p);
tmp = *p & ~((1U<<((~l&15)<<1)) - 1);
y += __occ_aux4(bwt, tmp) - (~l&15);
memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
@ -273,7 +281,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
}
}
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2])
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
{
int i, j, c, ret;
bwtintv_t ik, ok[4];
@ -281,45 +289,45 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
mem->n = 0;
if (q[x] > 3) return x + 1;
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
kv_init(a[0]); kv_init(a[1]);
prev = tmpvec[0]? tmpvec[0] : &a[0];
curr = tmpvec[1]? tmpvec[1] : &a[1];
bwt_set_intv(bwt, q[x], ik);
prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided
curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1];
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
ik.info = x + 1;
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
if (q[i] < 4) {
c = 3 - q[i];
if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
if (ok[c].x[2] != ik.x[2]) // change of the interval size
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
kv_push(bwtintv_t, *curr, ik);
if (ok[c].x[2] == 0) break; // cannot be extended
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
}
ik = ok[c]; ik.info = i + 1;
} else { // an ambiguous base
kv_push(bwtintv_t, *curr, ik);
break; // cannot be extended; in this case, i<len always stands
break; // always terminate extension at an ambiguous base; in this case, i<len always stands
}
}
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
bwt_reverse_intvs(curr); // s.t. smaller intervals visited first
bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
ret = curr->a[0].info; // this will be the returned value
swap = curr; curr = prev; prev = swap;
for (i = x - 1; i >= -1; --i) { // backward search for MEMs
if (q[i] > 3) break;
c = i < 0? 0 : q[i];
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
for (j = 0, curr->n = 0; j < prev->n; ++j) {
bwtintv_t *p = &prev->a[j];
bwt_extend(bwt, p, ok, 1);
if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further
if (curr->n == 0) { // curr->n to make sure there is no longer matches
if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
kv_push(bwtintv_t, *mem, ik);
}
} // otherwise the match is contained in another longer match
}
if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) {
} else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) {
ok[c].info = p->info;
kv_push(bwtintv_t, *curr, ok[c]);
}
@ -329,7 +337,85 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
}
bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
if (tmpvec[0] == 0) free(a[0].a);
if (tmpvec[1] == 0) free(a[1].a);
if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a);
if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a);
return ret;
}
/*************************
* Read/write BWT and SA *
*************************/
/* Write a BWT to file `fn`.
 * Layout, in order: `primary` (one bwtint_t), L2[1..4] (four bwtint_t; L2[0]
 * is not stored), then `bwt_size` 32-bit words taken from bwt->bwt. */
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
{
	FILE *out = xopen(fn, "wb");

	err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, out);
	err_fwrite(&bwt->L2[1], sizeof(bwtint_t), 4, out);
	err_fwrite(bwt->bwt, sizeof(uint32_t), bwt->bwt_size, out);
	err_fflush(out); // flush explicitly before closing
	err_fclose(out);
}
/* Write the sampled suffix array to file `fn`.
 * Layout, in order: `primary`, L2[1..4], `sa_intv`, `seq_len` (each written
 * as bwtint_t), then sa[1..n_sa-1]. sa[0] is not stored; bwt_restore_sa()
 * sets it to -1 when loading. */
void bwt_dump_sa(const char *fn, const bwt_t *bwt)
{
	FILE *out = xopen(fn, "wb");

	// header
	err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, out);
	err_fwrite(&bwt->L2[1], sizeof(bwtint_t), 4, out);
	err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, out);
	err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, out);
	// payload: every sample except the first
	err_fwrite(&bwt->sa[1], sizeof(bwtint_t), bwt->n_sa - 1, out);
	err_fflush(out);
	err_fclose(out);
}
/* Load a sampled suffix array written by bwt_dump_sa() from file `fn` into `bwt`.
 * `bwt` must already hold the matching BWT: the `primary` and `seq_len` values
 * stored in the file are checked against bwt->primary and bwt->seq_len.
 * On success bwt->sa_intv, bwt->n_sa and bwt->sa are set; sa[0] is set to -1. */
void bwt_restore_sa(const char *fn, bwt_t *bwt)
{
	bwtint_t skipped[4]; // receives the stored L2[1..4] copy, which is not used here
	bwtint_t primary;
	FILE *fp;

	fp = xopen(fn, "rb");
	err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
	xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
	err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
	err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
	err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); // `primary` is reused to hold the stored seq_len
	xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
	// sa_intv comes straight from the file and is used as a divisor below;
	// reject a zero value instead of dividing by it
	xassert(bwt->sa_intv != 0, "SA-BWT inconsistency: sa_intv is zero.");

	bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
	bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t));
	bwt->sa[0] = -1; // the first sample is not stored in the file

	err_fread_noeof(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
	err_fclose(fp);
}
/* Load a BWT written by bwt_dump_bwt() from file `fn`.
 * Returns a newly allocated bwt_t; release it with bwt_destroy().
 * The suffix array is not loaded here. */
bwt_t *bwt_restore_bwt(const char *fn)
{
	bwt_t *bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t));
	FILE *in = xopen(fn, "rb");

	// The payload length is derived from the file length: everything after
	// the five bwtint_t header values, counted in 4-byte words.
	err_fseek(in, 0, SEEK_END);
	bwt->bwt_size = (err_ftell(in) - sizeof(bwtint_t) * 5) >> 2;
	bwt->bwt = (uint32_t*)xcalloc(bwt->bwt_size, sizeof(uint32_t));

	err_fseek(in, 0, SEEK_SET);
	err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, in);
	err_fread_noeof(&bwt->L2[1], sizeof(bwtint_t), 4, in);
	err_fread_noeof(bwt->bwt, sizeof(uint32_t), bwt->bwt_size, in);
	err_fclose(in);

	bwt->seq_len = bwt->L2[4]; // the last cumulative count is taken as the sequence length
	bwt_gen_cnt_table(bwt);
	return bwt;
}
/* Free a bwt_t together with its `bwt` and `sa` arrays. A null pointer is accepted and ignored. */
void bwt_destroy(bwt_t *bwt)
{
	if (bwt != 0) {
		free(bwt->sa);
		free(bwt->bwt);
		free(bwt);
	}
}

17
bwt.h
View File

@ -30,8 +30,10 @@
#include <stdint.h>
// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line
#define OCC_INTERVAL 0x80
// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some parts of the code assume OCC_INTERVAL=0x80
#define OCC_INTV_SHIFT 7
#define OCC_INTERVAL (1LL<<OCC_INTV_SHIFT)
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
#ifndef BWA_UBYTE
#define BWA_UBYTE
@ -74,13 +76,6 @@ typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
* called bwt_B0 instead of bwt_B */
#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
// inverse Psi function
#define bwt_invPsi(bwt, k) \
(((k) == (bwt)->primary)? 0 : \
((k) < (bwt)->primary)? \
(bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \
: (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1)))
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
#ifdef __cplusplus
@ -121,7 +116,9 @@ extern "C" {
* Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
* Return the end of the longest exact match starting from _x_.
*/
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
// SMEM iterator interface
#ifdef __cplusplus
}

View File

@ -1449,7 +1449,7 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB
}
err_fseek(packedFile, -1, SEEK_END);
packedFileLen = ftell(packedFile);
packedFileLen = err_ftell(packedFile);
err_fread_noeof(&lastByteLength, sizeof(unsigned char), 1, packedFile);
totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);

View File

@ -11,6 +11,7 @@
#include "bwtaln.h"
#include "bwtgap.h"
#include "utils.h"
#include "bwa.h"
#ifdef HAVE_PTHREAD
#include <pthread.h>
@ -219,32 +220,6 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
bwa_seq_close(ks);
}
/* Work out the index prefix for `hint` by probing for index files.
 * Tries "<hint>.64.bwt" first and, if that opens, returns "<hint>.64";
 * otherwise tries "<hint>.bwt" and returns "<hint>".
 * Returns 0 when neither file can be opened. The returned string is
 * heap-allocated and must be freed by the caller. */
char *bwa_infer_prefix(const char *hint)
{
	int hint_len = strlen(hint);
	char *path = xmalloc(hint_len + 3 + 4 + 1); // hint + ".64" + ".bwt" + NUL
	FILE *probe;

	strcpy(path, hint);

	// 64-bit index naming takes precedence
	strcpy(path + hint_len, ".64.bwt");
	probe = fopen(path, "rb");
	if (probe != 0) {
		fclose(probe);
		path[hint_len + 3] = 0; // keep "<hint>.64"
		return path;
	}

	// fall back to the plain naming
	strcpy(path + hint_len, ".bwt");
	probe = fopen(path, "rb");
	if (probe != 0) {
		fclose(probe);
		path[hint_len] = 0; // keep "<hint>"
		return path;
	}

	free(path);
	return 0;
}
int bwa_aln(int argc, char *argv[])
{
int c, opte = -1;
@ -252,7 +227,7 @@ int bwa_aln(int argc, char *argv[])
char *prefix;
opt = gap_init_opt();
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
switch (c) {
case 'n':
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
@ -272,7 +247,6 @@ int bwa_aln(int argc, char *argv[])
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
case 'R': opt->max_top2 = atoi(optarg); break;
case 'q': opt->trim_qual = atoi(optarg); break;
case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
case 'f': xreopen(optarg, "wb", stdout); break;
case 'b': opt->mode |= BWA_MODE_BAM; break;
@ -310,7 +284,6 @@ int bwa_aln(int argc, char *argv[])
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
fprintf(stderr, " -B INT length of barcode\n");
// fprintf(stderr, " -c input sequences are in the color space\n");
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
@ -330,7 +303,7 @@ int bwa_aln(int argc, char *argv[])
k = l;
}
}
if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
free(opt);
return 0;

View File

@ -107,7 +107,6 @@ typedef struct {
} gap_opt_t;
#define BWA_PET_STD 1
#define BWA_PET_SOLID 2
typedef struct {
int max_isize, force_isize;

View File

@ -36,17 +36,160 @@
#include "main.h"
#include "utils.h"
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
void bwa_pac_rev_core(const char *fn, const char *fn_rev);
#ifdef _DIVBWT
#include "divsufsort.h"
#endif
int bwa_index(int argc, char *argv[])
int is_bwt(ubyte_t *T, int n);
/* Compute the sequence length encoded by a .pac file.
 * Returns 4 * (offset of the final byte - 1) + (value of the final byte).
 * NOTE(review): presumably the final byte holds the number of bases used in
 * the last packed byte -- confirm against the code that writes .pac files. */
int64_t bwa_seq_len(const char *fn_pac)
{
	ubyte_t tail;        // value of the final byte of the file
	int64_t tail_offset; // file offset of that byte
	FILE *pac = xopen(fn_pac, "rb");

	err_fseek(pac, -1, SEEK_END);
	tail_offset = err_ftell(pac);
	err_fread_noeof(&tail, 1, 1, pac);
	err_fclose(pac);
	return (tail_offset - 1) * 4 + (int)tail;
}
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
{
bwt_t *bwt;
ubyte_t *buf, *buf2;
int i, pac_size;
FILE *fp;
// initialization
bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t));
bwt->seq_len = bwa_seq_len(fn_pac);
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
fp = xopen(fn_pac, "rb");
// prepare sequence
pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
buf2 = (ubyte_t*)xcalloc(pac_size, 1);
err_fread_noeof(buf2, 1, pac_size, fp);
err_fclose(fp);
memset(bwt->L2, 0, 5 * 4);
buf = (ubyte_t*)xcalloc(bwt->seq_len + 1, 1);
for (i = 0; i < bwt->seq_len; ++i) {
buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
++bwt->L2[1+buf[i]];
}
for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
free(buf2);
// Burrows-Wheeler Transform
if (use_is) {
bwt->primary = is_bwt(buf, bwt->seq_len);
} else {
#ifdef _DIVBWT
bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
#else
err_fatal_simple("libdivsufsort is not compiled in.");
#endif
}
bwt->bwt = (u_int32_t*)xcalloc(bwt->bwt_size, 4);
for (i = 0; i < bwt->seq_len; ++i)
bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
free(buf);
return bwt;
}
/* The "pac2bwt" command: bwa pac2bwt [-d] <in.pac> <out.bwt>
 * -d selects divbwt instead of the default is_bwt construction.
 * IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required!
 * Returns 0 on success, 1 on a usage error or unknown option. */
int bwa_pac2bwt(int argc, char *argv[])
{
	int opt, use_is = 1;
	bwt_t *bwt;

	while ((opt = getopt(argc, argv, "d")) >= 0) {
		if (opt != 'd') return 1; // unrecognised option
		use_is = 0;
	}
	if (argc < optind + 2) { // both file arguments are mandatory
		fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
		return 1;
	}

	bwt = bwt_pac2bwt(argv[optind], use_is);
	bwt_dump_bwt(argv[optind+1], bwt);
	bwt_destroy(bwt);
	return 0;
}
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
/* Rebuild bwt->bwt so that occurrence counts are interleaved with the BWT words.
 * Before every OCC_INTERVAL symbols, and once more at the very end, the four
 * running symbol counts are stored as four bwtint_t values. bwt->bwt_size is
 * enlarged to match and the old array is freed and replaced. */
void bwt_bwtupdate_core(bwt_t *bwt)
{
	bwtint_t pos, out = 0, n_occ;
	bwtint_t occ[4] = { 0, 0, 0, 0 }; // running count of each symbol
	uint32_t *merged;                 // will become the new bwt

	n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
	bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size, in 32-bit words
	merged = (uint32_t*)xcalloc(bwt->bwt_size, 4);

	for (pos = 0; pos < bwt->seq_len; ++pos) {
		if (pos % OCC_INTERVAL == 0) { // start of an interval: emit a checkpoint
			memcpy(merged + out, occ, sizeof(bwtint_t) * 4);
			// four bwtint_t occupy 4*(sizeof(bwtint_t)/4) = sizeof(bwtint_t) 32-bit words
			out += sizeof(bwtint_t);
		}
		if (pos % 16 == 0) merged[out++] = bwt->bwt[pos/16]; // one word holds 16 symbols
		++occ[bwt_B00(bwt, pos)];
	}

	// the last checkpoint
	memcpy(merged + out, occ, sizeof(bwtint_t) * 4);
	xassert(out + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");

	// swap in the new array
	free(bwt->bwt);
	bwt->bwt = merged;
}
/* The "bwtupdate" command: bwa bwtupdate <the.bwt>
 * Loads the BWT, interleaves the occurrence checkpoints, and writes the result
 * back to the same file. Returns 0 on success, 1 on a usage error. */
int bwa_bwtupdate(int argc, char *argv[])
{
	const char *fn;
	bwt_t *bwt;

	if (argc < 2) {
		fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
		return 1;
	}
	fn = argv[1];

	bwt = bwt_restore_bwt(fn);
	bwt_bwtupdate_core(bwt);
	bwt_dump_bwt(fn, bwt); // the input file is overwritten in place
	bwt_destroy(bwt);
	return 0;
}
/* The "bwt2sa" command: bwa bwt2sa [-i INT] <in.bwt> <out.sa>
 * Computes the sampled suffix array for a BWT and writes it out.
 * -i sets the sampling interval passed to bwt_cal_sa() (default 32).
 * Returns 0 on success, 1 on a usage error or unknown option. */
int bwa_bwt2sa(int argc, char *argv[])
{
	int opt, sa_intv = 32;
	bwt_t *bwt;

	while ((opt = getopt(argc, argv, "i:")) >= 0) {
		if (opt != 'i') return 1; // unrecognised option
		sa_intv = atoi(optarg);
	}
	if (argc < optind + 2) { // both file arguments are mandatory
		fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
		return 1;
	}

	bwt = bwt_restore_bwt(argv[optind]);
	bwt_cal_sa(bwt, sa_intv);
	bwt_dump_sa(argv[optind+1], bwt);
	bwt_destroy(bwt);
	return 0;
}
int bwa_index(int argc, char *argv[]) // the "index" command
{
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
char *prefix = 0, *str, *str2, *str3;
int c, algo_type = 0, is_color = 0, is_64 = 0;
int c, algo_type = 0, is_64 = 0;
clock_t t;
int64_t l_pac;
while ((c = getopt(argc, argv, "6ca:p:")) >= 0) {
while ((c = getopt(argc, argv, "6a:p:")) >= 0) {
switch (c) {
case 'a': // if -a is not set, algo_type will be determined later
if (strcmp(optarg, "div") == 0) algo_type = 1;
@ -55,7 +198,6 @@ int bwa_index(int argc, char *argv[])
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
break;
case 'p': prefix = xstrdup(optarg); break;
case 'c': is_color = 1; break;
case '6': is_64 = 1; break;
default: return 1;
}
@ -67,7 +209,6 @@ int bwa_index(int argc, char *argv[])
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
// fprintf(stderr, " -c build color-space index\n");
fprintf(stderr, "\n");
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
@ -83,29 +224,13 @@ int bwa_index(int argc, char *argv[])
str2 = (char*)xcalloc(strlen(prefix) + 10, 1);
str3 = (char*)xcalloc(strlen(prefix) + 10, 1);
if (is_color == 0) { // nucleotide indexing
{ // nucleotide indexing
gzFile fp = xzopen(argv[optind], "r");
t = clock();
fprintf(stderr, "[bwa_index] Pack FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
} else { // color indexing
gzFile fp = xzopen(argv[optind], "r");
strcat(strcpy(str, prefix), ".nt");
t = clock();
fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
l_pac = bns_fasta2bntseq(fp, str, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
{
char *tmp_argv[3];
tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
t = clock();
fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
bwa_pac2cspac(3, tmp_argv);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
}
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
{

79
bwtio.c
View File

@ -1,79 +0,0 @@
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "bwt.h"
#include "utils.h"
/* Write a BWT to file `fn`.
 * Layout, in order: `primary` (one bwtint_t), L2[1..4] (four bwtint_t; L2[0]
 * is not stored), then `bwt_size` 32-bit words taken from bwt->bwt. */
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
{
	FILE *out = xopen(fn, "wb");

	err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, out);
	err_fwrite(&bwt->L2[1], sizeof(bwtint_t), 4, out);
	err_fwrite(bwt->bwt, sizeof(uint32_t), bwt->bwt_size, out);
	err_fflush(out); // flush explicitly before closing
	err_fclose(out);
}
/* Write the sampled suffix array to file `fn`.
 * Layout, in order: `primary`, L2[1..4], `sa_intv`, `seq_len` (each written
 * as bwtint_t), then sa[1..n_sa-1]. sa[0] is not stored; bwt_restore_sa()
 * sets it to -1 when loading. */
void bwt_dump_sa(const char *fn, const bwt_t *bwt)
{
	FILE *out = xopen(fn, "wb");

	// header
	err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, out);
	err_fwrite(&bwt->L2[1], sizeof(bwtint_t), 4, out);
	err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, out);
	err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, out);
	// payload: every sample except the first
	err_fwrite(&bwt->sa[1], sizeof(bwtint_t), bwt->n_sa - 1, out);
	err_fflush(out);
	err_fclose(out);
}
/* Load a sampled suffix array written by bwt_dump_sa() from file `fn` into `bwt`.
 * `bwt` must already hold the matching BWT: the `primary` and `seq_len` values
 * stored in the file are checked against bwt->primary and bwt->seq_len.
 * On success bwt->sa_intv, bwt->n_sa and bwt->sa are set; sa[0] is set to -1. */
void bwt_restore_sa(const char *fn, bwt_t *bwt)
{
	bwtint_t skipped[4]; // receives the stored L2[1..4] copy, which is not used here
	bwtint_t primary;
	FILE *fp;

	fp = xopen(fn, "rb");
	err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
	xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
	err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
	err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
	err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); // `primary` is reused to hold the stored seq_len
	xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
	// sa_intv comes straight from the file and is used as a divisor below;
	// reject a zero value instead of dividing by it
	xassert(bwt->sa_intv != 0, "SA-BWT inconsistency: sa_intv is zero.");

	bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
	bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t));
	bwt->sa[0] = -1; // the first sample is not stored in the file

	err_fread_noeof(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
	err_fclose(fp);
}
/* Load a BWT written by bwt_dump_bwt() from file `fn`.
 * Returns a newly allocated bwt_t; release it with bwt_destroy().
 * The suffix array is not loaded here. */
bwt_t *bwt_restore_bwt(const char *fn)
{
	bwt_t *bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t));
	FILE *in = xopen(fn, "rb");

	// The payload length is derived from the file length: everything after
	// the five bwtint_t header values, counted in 4-byte words.
	err_fseek(in, 0, SEEK_END);
	bwt->bwt_size = (err_ftell(in) - sizeof(bwtint_t) * 5) >> 2;
	bwt->bwt = (uint32_t*)xcalloc(bwt->bwt_size, sizeof(uint32_t));

	err_fseek(in, 0, SEEK_SET);
	err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, in);
	err_fread_noeof(&bwt->L2[1], sizeof(bwtint_t), 4, in);
	err_fread_noeof(bwt->bwt, sizeof(uint32_t), bwt->bwt_size, in);
	err_fclose(in);

	bwt->seq_len = bwt->L2[4]; // the last cumulative count is taken as the sequence length
	bwt_gen_cnt_table(bwt);
	return bwt;
}
/* Free a bwt_t together with its `bwt` and `sa` arrays. A null pointer is accepted and ignored. */
void bwt_destroy(bwt_t *bwt)
{
	if (bwt != 0) {
		free(bwt->sa);
		free(bwt->bwt);
		free(bwt);
	}
}

231
bwtmisc.c
View File

@ -1,231 +0,0 @@
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Contact: Heng Li <lh3@sanger.ac.uk> */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "bntseq.h"
#include "utils.h"
#include "main.h"
#include "bwt.h"
#ifdef _DIVBWT
#include "divsufsort.h"
#endif
int is_bwt(ubyte_t *T, int n);
/* Compute the sequence length encoded by a .pac file.
 * Returns 4 * (offset of the final byte - 1) + (value of the final byte).
 * This matches the layout bwa_pac2cspac() writes: l_pac/4 + 1 packed bytes
 * followed by one byte holding l_pac % 4. */
int64_t bwa_seq_len(const char *fn_pac)
{
	ubyte_t tail;        // value of the final byte of the file
	int64_t tail_offset; // file offset of that byte
	FILE *pac = xopen(fn_pac, "rb");

	err_fseek(pac, -1, SEEK_END);
	tail_offset = err_ftell(pac);
	err_fread_noeof(&tail, 1, 1, pac);
	err_fclose(pac);
	return (tail_offset - 1) * 4 + (int)tail;
}
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
{
bwt_t *bwt;
ubyte_t *buf, *buf2;
int i, pac_size;
FILE *fp;
// initialization
bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t));
bwt->seq_len = bwa_seq_len(fn_pac);
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
fp = xopen(fn_pac, "rb");
// prepare sequence
pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
buf2 = (ubyte_t*)xcalloc(pac_size, 1);
err_fread_noeof(buf2, 1, pac_size, fp);
err_fclose(fp);
memset(bwt->L2, 0, 5 * 4);
buf = (ubyte_t*)xcalloc(bwt->seq_len + 1, 1);
for (i = 0; i < bwt->seq_len; ++i) {
buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
++bwt->L2[1+buf[i]];
}
for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
free(buf2);
// Burrows-Wheeler Transform
if (use_is) {
bwt->primary = is_bwt(buf, bwt->seq_len);
} else {
#ifdef _DIVBWT
bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
#else
err_fatal_simple("libdivsufsort is not compiled in.");
#endif
}
bwt->bwt = (u_int32_t*)xcalloc(bwt->bwt_size, 4);
for (i = 0; i < bwt->seq_len; ++i)
bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
free(buf);
return bwt;
}
/* The "pac2bwt" command: bwa pac2bwt [-d] <in.pac> <out.bwt>
 * -d selects divbwt instead of the default is_bwt construction.
 * Returns 0 on success, 1 on a usage error or unknown option. */
int bwa_pac2bwt(int argc, char *argv[])
{
	int opt, use_is = 1;
	bwt_t *bwt;

	while ((opt = getopt(argc, argv, "d")) >= 0) {
		if (opt != 'd') return 1; // unrecognised option
		use_is = 0;
	}
	if (argc < optind + 2) { // both file arguments are mandatory
		fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
		return 1;
	}

	bwt = bwt_pac2bwt(argv[optind], use_is);
	bwt_dump_bwt(argv[optind+1], bwt);
	bwt_destroy(bwt);
	return 0;
}
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
/* Rebuild bwt->bwt so that occurrence counts are interleaved with the BWT words.
 * Before every OCC_INTERVAL symbols, and once more at the very end, the four
 * running symbol counts are stored as four bwtint_t values. bwt->bwt_size is
 * enlarged to match and the old array is freed and replaced. */
void bwt_bwtupdate_core(bwt_t *bwt)
{
	bwtint_t pos, out = 0, n_occ;
	bwtint_t occ[4] = { 0, 0, 0, 0 }; // running count of each symbol
	uint32_t *merged;                 // will become the new bwt

	n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
	bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size, in 32-bit words
	merged = (uint32_t*)xcalloc(bwt->bwt_size, 4);

	for (pos = 0; pos < bwt->seq_len; ++pos) {
		if (pos % OCC_INTERVAL == 0) { // start of an interval: emit a checkpoint
			memcpy(merged + out, occ, sizeof(bwtint_t) * 4);
			// four bwtint_t occupy 4*(sizeof(bwtint_t)/4) = sizeof(bwtint_t) 32-bit words
			out += sizeof(bwtint_t);
		}
		if (pos % 16 == 0) merged[out++] = bwt->bwt[pos/16]; // one word holds 16 symbols
		++occ[bwt_B00(bwt, pos)];
	}

	// the last checkpoint
	memcpy(merged + out, occ, sizeof(bwtint_t) * 4);
	xassert(out + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");

	// swap in the new array
	free(bwt->bwt);
	bwt->bwt = merged;
}
/* The "bwtupdate" command: bwa bwtupdate <the.bwt>
 * Loads the BWT, interleaves the occurrence checkpoints, and writes the result
 * back to the same file. Returns 0 on success, 1 on a usage error. */
int bwa_bwtupdate(int argc, char *argv[])
{
	const char *fn;
	bwt_t *bwt;

	if (argc < 2) {
		fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
		return 1;
	}
	fn = argv[1];

	bwt = bwt_restore_bwt(fn);
	bwt_bwtupdate_core(bwt);
	bwt_dump_bwt(fn, bwt); // the input file is overwritten in place
	bwt_destroy(bwt);
	return 0;
}
const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
/* Convert a packed nucleotide sequence to a packed color-space sequence.
 * Reads l_pac/4 + 1 bytes from bns->fp_pac (then rewinds it) and returns a
 * newly allocated buffer of the same size; the caller frees it.
 * The first 2-bit slot keeps the first nucleotide unchanged; every later slot
 * holds the color looked up from the previous and current nucleotides.
 *
 * This function is not memory efficient, but it makes life easier. Ideally
 * the .amb files should change too, since one 'N' in the nucleotide sequence
 * leads to two ambiguous colors; that may be done later. */
uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
{
	uint8_t *nt, *cs;
	bwtint_t pos;
	int prev, cur;

	nt = (uint8_t*)xcalloc(bns->l_pac/4 + 1, 1);
	cs = (uint8_t*)xcalloc(bns->l_pac/4 + 1, 1);
	err_fread_noeof(nt, 1, bns->l_pac/4+1, bns->fp_pac);
	err_rewind(bns->fp_pac);

	// seed: the first base is copied as-is into the top two bits
	prev = nt[0]>>6;
	cs[0] = prev<<6;

	for (pos = 1; pos < bns->l_pac; ++pos) {
		int shift = (int)((~pos&3)*2); // bit offset of this slot inside its byte
		cur = nt[pos>>2] >> shift & 3;
		// the table is indexed by the bit mask of the two adjacent bases
		cs[pos>>2] |= nst_color_space_table[(1<<prev)|(1<<cur)] << shift;
		prev = cur;
	}
	free(nt);
	return cs;
}
/* The "pac2cspac" command: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>
 * Restores the nucleotide index metadata from argv[1], converts its packed
 * sequence to color space, dumps the metadata under argv[2] and writes the
 * color-space sequence to "<argv[2]>.pac".
 * Returns 0 on success, 1 on a usage error. */
int bwa_pac2cspac(int argc, char *argv[])
{
	bntseq_t *bns;
	uint8_t *cspac, ct;
	char *str;
	FILE *fp;

	if (argc < 3) {
		fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
		return 1;
	}
	bns = bns_restore(argv[1]);
	cspac = bwa_pac2cspac_core(bns);
	bns_dump(bns, argv[2]);

	// now write cspac
	str = (char*)xcalloc(strlen(argv[2]) + 5, 1); // prefix + ".pac" + NUL
	strcat(strcpy(str, argv[2]), ".pac");
	fp = xopen(str, "wb");
	free(str); // the path is only needed to open the file; it was previously leaked
	err_fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
	ct = bns->l_pac % 4; // trailing byte: l_pac modulo 4, as read back by bwa_seq_len()
	err_fwrite(&ct, 1, 1, fp);
	err_fflush(fp);
	err_fclose(fp);

	bns_destroy(bns);
	free(cspac);
	return 0;
}
/* The "bwt2sa" command: bwa bwt2sa [-i INT] <in.bwt> <out.sa>
 * Computes the sampled suffix array for a BWT and writes it out.
 * -i sets the sampling interval passed to bwt_cal_sa() (default 32).
 * Returns 0 on success, 1 on a usage error or unknown option. */
int bwa_bwt2sa(int argc, char *argv[])
{
	int opt, sa_intv = 32;
	bwt_t *bwt;

	while ((opt = getopt(argc, argv, "i:")) >= 0) {
		if (opt != 'i') return 1; // unrecognised option
		sa_intv = atoi(optarg);
	}
	if (argc < optind + 2) { // both file arguments are mandatory
		fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
		return 1;
	}

	bwt = bwt_restore_bwt(argv[optind]);
	bwt_cal_sa(bwt, sa_intv);
	bwt_dump_sa(argv[optind+1], bwt);
	bwt_destroy(bwt);
	return 0;
}

View File

@ -13,9 +13,10 @@
#include "bwtsw2.h"
#include "stdaln.h"
#include "kstring.h"
#include "bwa.h"
#include "kseq.h"
KSEQ_INIT(gzFile, err_gzread)
KSEQ_DECLARE(gzFile)
#include "ksort.h"
#define __left_lt(a, b) ((a).end > (b).end)
@ -186,14 +187,14 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8
bsw2aux_t *q = b->aux + i;
uint8_t *query;
bwtint_t k;
int score, path_len, beg, end;
int path_len, beg, end;
if (p->l) continue;
beg = (p->flag & 0x10)? lq - p->end : p->beg;
end = (p->flag & 0x10)? lq - p->beg : p->end;
query = seq[(p->flag & 0x10)? 1 : 0] + beg;
for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here
target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3;
score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar);
#if 0
if (name && score != p->G) { // debugging only
@ -747,7 +748,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
// print and reset
for (i = 0; i < _seq->n; ++i) {
bsw2seq1_t *p = _seq->seq + i;
if (p->sam) printf("%s", p->sam);
if (p->sam) err_printf("%s", p->sam);
free(p->name); free(p->seq); free(p->qual); free(p->sam);
p->tid = -1; p->l = 0;
p->name = p->seq = p->qual = p->sam = 0;
@ -756,28 +757,18 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
_seq->n = 0;
}
/* Fill `p` from the kseq record `ks`.
 * Name and sequence are always duplicated; quality and comment are duplicated
 * only when non-empty and are otherwise set to null. `tid` starts at -1 and
 * `sam` at null. */
static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p)
{
	// always-present fields
	p->name = xstrdup(ks->name.s);
	p->seq = xstrdup(ks->seq.s);
	p->l = ks->seq.l;

	// optional fields
	p->qual = 0;
	if (ks->qual.l) p->qual = xstrdup(ks->qual.s);
	p->comment = 0;
	if (ks->comment.l) p->comment = xstrdup(ks->comment.s);

	// not aligned yet
	p->tid = -1;
	p->sam = 0;
}
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2)
{
gzFile fp, fp2;
kseq_t *ks, *ks2;
int l, size = 0, is_pe = 0;
int l, is_pe = 0, i, n;
uint8_t *pac;
bsw2seq_t *_seq;
bseq1_t *bseq;
pac = xcalloc(bns->l_pac/4+1, 1);
for (l = 0; l < bns->n_seqs; ++l)
printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
fp = xzopen(fn, "r");
ks = kseq_init(fp);
@ -787,34 +778,25 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
ks2 = kseq_init(fp2);
is_pe = 1;
} else fp2 = 0, ks2 = 0, is_pe = 0;
while (kseq_read(ks) >= 0) {
if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/')
ks->name.l -= 2, ks->name.s[ks->name.l] = 0;
if (_seq->n == _seq->max) {
_seq->max = _seq->max? _seq->max<<1 : 1024;
while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
int size = 0;
if (n > _seq->max) {
_seq->max = n;
kroundup32(_seq->max);
_seq->seq = xrealloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
}
kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]);
size += ks->seq.l;
if (ks2) {
if (kseq_read(ks2) >= 0) {
if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/')
ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0;
kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge
size += ks->seq.l;
} else {
fprintf(stderr, "[%s] The second query file has fewer reads. Switched to the single-end mode for the following batches.\n", __func__);
is_pe = 0;
}
}
if (size > opt->chunk_size * opt->n_threads) {
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
process_seqs(_seq, opt, bns, pac, target, is_pe);
size = 0;
_seq->n = n;
for (i = 0; i < n; ++i) {
bseq1_t *b = &bseq[i];
bsw2seq1_t *p = &_seq->seq[i];
p->tid = -1; p->l = b->l_seq;
p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0;
size += p->l;
}
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size);
free(bseq);
process_seqs(_seq, opt, bns, pac, target, is_pe);
}
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
process_seqs(_seq, opt, bns, pac, target, is_pe);
// free
free(pac);
free(_seq->seq); free(_seq);

View File

@ -6,14 +6,12 @@
#include "bwt.h"
#include "bwtsw2.h"
#include "utils.h"
#include "bwa.h"
int bwa_bwtsw2(int argc, char *argv[])
{
extern char *bwa_infer_prefix(const char *hint);
bsw2opt_t *opt;
bwt_t *target;
char buf[1024], *prefix;
bntseq_t *bns;
bwaidx_t *idx;
int c;
opt = bsw2_init_opt();
@ -81,19 +79,10 @@ int bwa_bwtsw2(int argc, char *argv[])
opt->t *= opt->a;
opt->coef *= opt->a;
if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
return 0;
}
strcpy(buf, prefix); target = bwt_restore_bwt(strcat(buf, ".bwt"));
strcpy(buf, prefix); bwt_restore_sa(strcat(buf, ".sa"), target);
bns = bns_restore(prefix);
bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
bns_destroy(bns);
bwt_destroy(target);
free(opt); free(prefix);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 0;
bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
bwa_idx_destroy(idx);
free(opt);
return 0;
}

View File

@ -7,6 +7,7 @@
#include "bntseq.h"
#include "bwtsw2.h"
#include "kstring.h"
#include "utils.h"
#ifndef _NO_SSE2
#include "ksw.h"
#else
@ -25,7 +26,6 @@ typedef struct {
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
{
extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
int i, k, x, p25, p50, p75, tmp, max_len = 0;
uint64_t *isize;
bsw2pestat_t r;
@ -45,7 +45,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
isize[k++] = l;
}
ks_introsort_uint64_t(k, isize);
ks_introsort_64(k, isize);
p25 = isize[(int)(.25 * k + .499)];
p50 = isize[(int)(.50 * k + .499)];
p75 = isize[(int)(.75 * k + .499)];
@ -75,9 +75,9 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
r.low = tmp > max_len? tmp : max_len;
if (r.low < 1) r.low = 1;
r.high = (int)(p75 + 3. * (p75 - p25) + .499);
if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499);
if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499);
r.low = tmp > max_len? tmp : max_len;
if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499);
if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499);
ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);
free(isize);
return r;
@ -128,35 +128,24 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
seq[i] = nst_nt4_table[(int)mseq[i]];
}
#ifndef _NO_SSE2
{
ksw_query_t *q;
ksw_aux_t aux[2];
// forward Smith-Waterman
aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0];
q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat);
ksw_sse2(q, end - beg, ref, &aux[0]);
free(q);
if (aux[0].score < opt->t) {
free(seq);
return;
}
++aux[0].qe; ++aux[0].te;
// reverse Smith-Waterman
seq_reverse(aux[0].qe, seq, 0);
seq_reverse(aux[0].te, ref, 0);
q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat);
ksw_sse2(q, aux[0].te, ref, &aux[1]);
free(q);
++aux[1].qe; ++aux[1].te;
// write output
a->G = aux[0].score;
a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2;
{ // FIXME!!! The following block has not been tested since the update of the ksw library
int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
kswr_t aln;
aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
a->G = aln.score;
a->G2 = aln.score2;
if (a->G < opt->t) a->G = 0;
if (a->G2 < opt->t) a->G2 = 0;
if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
a->k = beg + (aux[0].te - aux[1].te);
a->len = aux[1].te;
a->beg = aux[0].qe - aux[1].qe;
a->end = aux[0].qe;
a->k = beg + aln.tb;
a->len = aln.te - aln.tb + 1;
a->beg = aln.qb;
a->end = aln.qe + 1;
/*
printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n');
printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n');
printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len);
*/
}
#else
{
@ -169,6 +158,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2);
if (a->G < opt->t) a->G = 0;
if (a->G2 < opt->t) a->G2 = 0;
if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
a->k = beg + path[0].i - 1;
a->len = path[1].i - path[0].i + 1;
a->beg = path[0].j - 1;

192
cs2nt.c
View File

@ -1,192 +0,0 @@
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include "bwtaln.h"
#include "stdaln.h"
#include "utils.h"
/*
Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
decode as ATTGAC(RBGOG), there is one color change and one nt change;
if we decode as ATTAAC(RBRBG), there are two color changes.
In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
as the penalty; otherwise, we will use color quality as the
penalty. This means we always prefer two consistent color changes over
a nt change, but if a color has high quality, we may prefer one nt
change.
In the above example, the penalties of the two types of decoding are
q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
otherwise the second. Note that no matter what we choose, the fourth
base will get a low nt quality.
*/
#define COLOR_MM 19
#define NUCL_MM 25
static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };
/*
{A,C,G,T,N} -> {0,1,2,3,4}
nt_ref[0..size]: nucleotide reference: 0/1/2/3/4
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned)
btarray[0..4*size]: backtrack array (working space)
*/
/*
 * Decode a color-space read into nucleotides with a 4-state dynamic program.
 *
 * State b at position pos means "nucleotide pos of the read is b".  Moving
 * from nucleotide pb to b is charged a color penalty when the observed color
 * cs_read[pos-1] disagrees with the color implied by (pb, b), plus NUCL_MM
 * when b disagrees with a known reference base nt_ref[pos].  Only two score
 * rows are kept; btarray records, for every (pos, b), the predecessor that
 * gave the minimum so the best path can be traced back into nt_read[0..size].
 */
void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
{
	int score[2][4];
	int *prev = score[0], *cur = score[1], *swap;
	int pos, b, pb, best_nt, best_score;

	// initial row: free if the first reference base is unknown or matches b
	for (b = 0; b < 4; ++b)
		prev[b] = (nt_ref[0] >= 4 || nt_ref[0] == b)? 0 : NUCL_MM;

	// fill one row per color
	for (pos = 1; pos <= size; ++pos) {
		const int color = cs_read[pos-1] >> 6;
		const int qual = cs_read[pos-1] & 0x3f;
		const int color_cost = qual < COLOR_MM? COLOR_MM : qual; // penalty for a color mismatch
		const int ref = nt_ref[pos];
		for (b = 0; b < 4; ++b) {
			const int nt_cost = (ref < 4 && ref != b)? NUCL_MM : 0; // penalty for a nt mismatch
			int lowest = 0x7fffffff, from = 0;
			for (pb = 0; pb < 4; ++pb) {
				int cand = prev[pb];
				// qual == 63 marks an unknown color: never penalized
				if (qual != 63 && color != nst_ntnt2cs_table[1<<b | 1<<pb]) cand += color_cost;
				cand += nt_cost;
				if (cand < lowest) { // strict '<' keeps the smallest pb on ties
					lowest = cand; from = pb;
				}
			}
			cur[b] = lowest;
			btarray[pos<<2 | b] = from;
		}
		swap = prev; prev = cur; cur = swap; // the row just written becomes "previous"
	}

	// pick the cheapest final state (smallest b on ties) ...
	best_score = 0x7fffffff; best_nt = 0;
	for (b = 0; b < 4; ++b) {
		if (prev[b] < best_score) {
			best_score = prev[b]; best_nt = b;
		}
	}
	// ... and follow the back pointers down to position 0
	nt_read[size] = best_nt;
	for (pos = size; pos > 0; --pos)
		nt_read[pos-1] = btarray[pos<<2 | nt_read[pos]];
}
/*
nt_read[0..size]: nucleotide read sequence: 0/1/2/3
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
tarray[0..size*2-1]: temporary array
*/
/*
 * Derive per-base qualities for a decoded nucleotide read.
 *
 * size:    number of colors in cs_read
 * nt_read: decoded nucleotides, size+1 entries
 * cs_read: observed colors, base<<6|qual, qual==63 for an unknown color
 * tarray:  scratch space of 2*size bytes; the first half receives the color
 *          sequence implied by nt_read, the second half the result
 *
 * Returns a pointer to size-1 entries (indices 0..size-2), each encoded as
 * nt<<6|qual with qual in [0,60]; an entry is 0 when either adjacent color
 * is unknown.  With size <= 1 no entry is written.
 */
uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
{
	int k, c1, c2;
	uint8_t *t2array = tarray + size;
	// get the color sequence of nt_read
	c1 = nt_read[0];
	for (k = 1; k <= size; ++k) {
		c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
		tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
		c1 = c2;
	}
	// Base k sits between colors k-1 and k.  The bound is 'k < size' rather
	// than 'k != size' so that size <= 0 cannot run off the buffers.
	for (k = 1; k < size; ++k) {
		int q = 0;
		if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
			// both flanking colors agree with the decoding
			q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
		} else if (tarray[k-1] == cs_read[k-1]>>6) {
			// only the left color agrees
			q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
		} else if (tarray[k] == cs_read[k]>>6) {
			// only the right color agrees
			q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
		} // else, q = 0
		if (q < 0) q = 0;
		if (q > 60) q = 60;
		t2array[k] = nt_read[k]<<6 | q;
		if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
	}
	return t2array + 1; // size-1 entries: t2array[1..size-1]
}
// this function will be called when p->seq has been reversed by refine_gapped()
/*
 * Convert the aligned color-space read p into nucleotide space, in place.
 *
 * The function collects the reference nucleotides (nt_ref) and the read's
 * colors plus qualities (cs_read) along the alignment, decodes them with
 * cs2nt_DP(), derives base qualities with cs2nt_nt_qual(), and then
 * overwrites p->len, p->full_len, p->qual and both p->seq and p->rseq.
 * Reads with p->type == BWA_TYPE_NO_MATCH are left untouched.
 *
 * p:     read to convert; modified in place
 * l_pac: not referenced in this function
 * pac:   packed reference, read through bns_pac()
 */
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac)
{
uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
int i, len;
uint8_t *seq;
// set temporary arrays
if (p->type == BWA_TYPE_NO_MATCH) return;
len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
// one allocation of 7*len bytes, carved up as:
//   nt_ref[len] | cs_read[len] | nt_read[len] | 4*len bytes of scratch
ta = (uint8_t*)xmalloc(len * 7);
nt_ref = ta;
cs_read = nt_ref + len;
nt_read = cs_read + len;
// btarray and tarray deliberately start at the same address: btarray is only
// handed to cs2nt_DP() and tarray only to the later cs2nt_nt_qual() call
btarray = nt_read + len;
tarray = nt_read + len;
// Pack color _seq[_i] and its quality into _cs as color<<6|qual.  The quality
// is read from p->qual (index mirrored when p->strand is set), reduced by 33,
// capped at 60, and forced to 63 when the color code is above 3.
#define __gen_csbase(_cs, _i, _seq) do { \
int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \
if (q > 60) q = 60; \
if (_seq[_i] > 3) q = 63; \
(_cs) = _seq[_i]<<6 | q; \
} while (0)
// generate len, nt_ref[] and cs_read
seq = p->strand? p->rseq : p->seq;
// reference base just before the alignment start; 4 (unknown) at position 0
nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4;
if (p->cigar == 0) { // no gap or clipping
len = p->len;
for (i = 0; i < p->len; ++i) {
__gen_csbase(cs_read[i], i, seq);
nt_ref[i+1] = bns_pac(pac, p->pos + i);
}
} else {
// walk the CIGAR: x = reference position, y = read position, z = output column
int k, z;
bwtint_t x, y;
x = p->pos; y = 0;
for (k = z = 0; k < p->n_cigar; ++k) {
int l = __cigar_len(p->cigar[k]);
if (__cigar_op(p->cigar[k]) == FROM_M) {
// FROM_M: consumes both read and reference
for (i = 0; i < l; ++i, ++x, ++y) {
__gen_csbase(cs_read[z], y, seq);
nt_ref[z+1] = bns_pac(pac, x);
++z;
}
} else if (__cigar_op(p->cigar[k]) == FROM_I) {
// FROM_I: consumes read only; the reference base is recorded as unknown (4)
for (i = 0; i < l; ++i, ++y) {
__gen_csbase(cs_read[z], y, seq);
nt_ref[z+1] = 4;
++z;
}
} else if (__cigar_op(p->cigar[k]) == FROM_S) y += l; // FROM_S: skip read bases, emit nothing
else x += l; // any other op advances the reference only (presumably a deletion -- confirm)
}
len = z;
}
cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
// update p
// the nucleotide read is one base shorter than the color read
p->len = p->full_len = len - 1;
for (i = 0; i < p->len; ++i) {
if ((new_nt_read[i]&0x3f) == 63) {
p->qual[i] = 33; seq[i] = 4;
} else {
p->qual[i] = (new_nt_read[i]&0x3f) + 33;
seq[i] = new_nt_read[i]>>6;
}
}
p->qual[p->len] = seq[p->len] = 0;
// seq aliases p->rseq or p->seq (chosen above); rebuild the other orientation
// from it.  NOTE(review): the last seq_reverse() argument presumably selects
// complementing (1 for bases, 0 for qualities) -- confirm against seq_reverse().
if (p->strand) {
memcpy(p->seq, seq, p->len);
seq_reverse(p->len, p->seq, 1);
seq_reverse(p->len, p->qual, 0);
} else {
memcpy(p->rseq, seq, p->len);
seq_reverse(p->len, p->rseq, 1);
}
free(ta);
}

194
fastmap.c
View File

@ -2,115 +2,174 @@
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include "bntseq.h"
#include "bwt.h"
#include "bwa.h"
#include "bwamem.h"
#include "kvec.h"
#include "utils.h"
#include "kseq.h"
#include "utils.h"
KSEQ_INIT(gzFile, err_gzread)
KSEQ_DECLARE(gzFile)
extern unsigned char nst_nt4_table[256];
typedef struct {
const bwt_t *bwt;
const uint8_t *query;
int start, len;
bwtintv_v *tmpvec[2], *matches;
} smem_i;
void *kopen(const char *fn, int *_fd);
int kclose(void *a);
smem_i *smem_iter_init(const bwt_t *bwt)
int main_mem(int argc, char *argv[])
{
smem_i *iter;
iter = xcalloc(1, sizeof(smem_i));
iter->bwt = bwt;
iter->tmpvec[0] = xcalloc(1, sizeof(bwtintv_v));
iter->tmpvec[1] = xcalloc(1, sizeof(bwtintv_v));
iter->matches = xcalloc(1, sizeof(bwtintv_v));
return iter;
}
mem_opt_t *opt;
int fd, fd2, i, c, n, copy_comment = 0;
gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0;
bseq1_t *seqs;
bwaidx_t *idx;
char *rg_line = 0;
void *ko = 0, *ko2 = 0;
void smem_iter_destroy(smem_i *iter)
{
free(iter->tmpvec[0]->a);
free(iter->tmpvec[1]->a);
free(iter->matches->a);
free(iter);
}
opt = mem_opt_init();
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'w') opt->w = atoi(optarg);
else if (c == 'A') opt->a = atoi(optarg);
else if (c == 'B') opt->b = atoi(optarg);
else if (c == 'O') opt->q = atoi(optarg);
else if (c == 'E') opt->r = atoi(optarg);
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
else if (c == 'a') opt->flag |= MEM_F_ALL;
else if (c == 'p') opt->flag |= MEM_F_PE;
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
else if (c == 'c') opt->max_occ = atoi(optarg);
else if (c == 'v') bwa_verbose = atoi(optarg);
else if (c == 'r') opt->split_factor = atof(optarg);
else if (c == 'C') copy_comment = 1;
else if (c == 'R') {
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
} else if (c == 's') opt->split_width = atoi(optarg);
}
if (opt->n_threads < 1) opt->n_threads = 1;
if (optind + 1 >= argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
fprintf(stderr, "Algorithm options:\n\n");
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
fprintf(stderr, " -P skip pairing; perform mate SW only\n");
fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
fprintf(stderr, "\n");
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
fprintf(stderr, " -H hard clipping\n");
fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
fprintf(stderr, "\n");
free(opt);
return 1;
}
void smem_set_query(smem_i *iter, int len, const uint8_t *query)
{
iter->query = query;
iter->start = 0;
iter->len = len;
}
mem_fill_scmat(opt->a, opt->b, opt->mat);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
bwa_print_sam_hdr(idx->bns, rg_line);
int smem_next(smem_i *iter)
{
iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0;
if (iter->start >= iter->len || iter->start < 0) return -1;
while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases
if (iter->start == iter->len) return -1;
iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec);
return iter->start;
ko = kopen(argv[optind + 1], &fd);
fp = gzdopen(fd, "r");
ks = kseq_init(fp);
if (optind + 2 < argc) {
if (opt->flag&MEM_F_PE) {
if (bwa_verbose >= 2)
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
} else {
ko2 = kopen(argv[optind + 2], &fd2);
fp2 = gzdopen(fd2, "r");
ks2 = kseq_init(fp2);
opt->flag |= MEM_F_PE;
}
}
while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
int64_t size = 0;
if (!copy_comment)
for (i = 0; i < n; ++i) {
free(seqs[i].comment); seqs[i].comment = 0;
}
for (i = 0; i < n; ++i) size += seqs[i].l_seq;
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs);
free(seqs);
}
free(opt);
bwa_idx_destroy(idx);
kseq_destroy(ks);
err_gzclose(fp); kclose(ko);
if (ks2) {
kseq_destroy(ks2);
err_gzclose(fp2); kclose(ko2);
}
return 0;
}
int main_fastmap(int argc, char *argv[])
{
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0;
kseq_t *seq;
bwtint_t k;
gzFile fp;
bwt_t *bwt;
bntseq_t *bns;
smem_i *iter;
smem_i *itr;
const bwtintv_v *a;
bwaidx_t *idx;
while ((c = getopt(argc, argv, "w:l:s")) >= 0) {
while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) {
switch (c) {
case 's': print_seq = 1; break;
case 's': split_width = atoi(optarg); break;
case 'p': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
}
}
if (optind + 1 >= argc) {
fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", split_width, min_len, min_iwidth);
return 1;
}
fp = xzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA
char *tmp = xcalloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt");
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, argv[optind]), ".sa");
bwt_restore_sa(tmp, bwt);
free(tmp);
bns = bns_restore(argv[optind]);
}
iter = smem_iter_init(bwt);
idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS);
itr = smem_itr_init(idx->bwt);
while (kseq_read(seq) >= 0) {
printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) {
err_putchar('\t');
err_puts(seq->seq.s);
} else err_putchar('\n');
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s);
while (smem_next(iter) > 0) {
for (i = 0; i < iter->matches->n; ++i) {
bwtintv_t *p = &iter->matches->a[i];
smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
while ((a = smem_next(itr, min_len<<1, split_width)) != 0) {
for (i = 0; i < a->n; ++i) {
bwtintv_t *p = &a->a[i];
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
if (p->x[2] <= min_iwidth) {
for (k = 0; k < p->x[2]; ++k) {
bwtint_t pos;
int len, is_rev, ref_id;
len = (uint32_t)p->info - (p->info>>32);
pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
if (is_rev) pos -= len - 1;
bns_cnt_ambi(bns, pos, len, &ref_id);
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
bns_cnt_ambi(idx->bns, pos, len, &ref_id);
err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
}
} else err_puts("\t*");
err_putchar('\n');
@ -119,9 +178,8 @@ int main_fastmap(int argc, char *argv[])
err_puts("//");
}
smem_iter_destroy(iter);
bns_destroy(bns);
bwt_destroy(bwt);
smem_itr_destroy(itr);
bwa_idx_destroy(idx);
kseq_destroy(seq);
err_gzclose(fp);
return 0;

385
kbtree.h 100644
View File

@ -0,0 +1,385 @@
/*-
* Copyright 1997-1999, 2001, John-Mark Gurney.
* 2008-2009, Attractive Chaos <attractor@live.co.uk>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef __AC_KBTREE_H
#define __AC_KBTREE_H
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "utils.h"
/*
 * 4-byte header found at the start of every B-tree node.  The key array
 * follows immediately; internal nodes additionally carry an array of child
 * pointers at byte offset off_ptr (see the tree descriptor).
 *
 * is_internal: non-zero when the node has children
 * n:           number of keys currently stored in the node
 */
typedef struct {
	int32_t is_internal:1, n:31;
} kbnode_t;

/* Key array of node x, viewed as `type`; starts right after the header.
 * Arguments are parenthesized so that any pointer expression may be passed. */
#define __KB_KEY(type, x) ((type*)((char*)(x) + 4))
/* Child-pointer array of node x, located btr->off_ptr bytes into the node.
 * Only meaningful for internal nodes; leaves are allocated without it. */
#define __KB_PTR(btr, x) ((kbnode_t**)((char*)(x) + (btr)->off_ptr))
/*
 * Declares the tree descriptor type kbtree_<name>_t.
 *
 *   root     root node
 *   off_key  not assigned by the code in this header (stays 0 after xcalloc)
 *   off_ptr  byte offset of the child-pointer array inside an internal node
 *   ilen     allocation size in bytes of an internal node
 *   elen     allocation size in bytes of an external (leaf) node
 *   n        maximum number of keys per node (2*t - 1)
 *   t        minimum degree of the B-tree
 *   n_keys   number of keys stored in the tree
 *   n_nodes  number of nodes allocated
 */
#define __KB_TREE_T(name) \
typedef struct { \
kbnode_t *root; \
int off_key, off_ptr, ilen, elen; \
int n, t; \
int n_keys, n_nodes; \
} kbtree_##name##_t;
/*
 * Defines kb_init_<name>(size): allocate an empty tree whose internal nodes
 * occupy at most `size` bytes.
 *
 * The minimum degree t is derived from how many (key, child pointer) pairs
 * fit into `size` after the 4-byte header and one extra child pointer.
 * Returns 0 when `size` is too small to give t >= 2; otherwise returns a
 * tree with a single empty root node (allocated at internal-node size).
 */
#define __KB_INIT(name, key_t) \
	kbtree_##name##_t *kb_init_##name(int size) \
	{ \
		kbtree_##name##_t *b; \
		/* The capacity formula below is evaluated in size_t; without this */ \
		/* guard a size below the fixed overhead wraps around to a huge    */ \
		/* value instead of being rejected.                                */ \
		if (size < 0 || (size_t)size < 4 + sizeof(void*)) return 0; \
		b = (kbtree_##name##_t*)xcalloc(1, sizeof(kbtree_##name##_t)); \
		b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \
		if (b->t < 2) { \
			free(b); return 0; \
		} \
		b->n = 2 * b->t - 1; /* maximum keys per node */ \
		b->off_ptr = 4 + b->n * sizeof(key_t); /* child pointers follow the keys */ \
		/* node sizes, rounded up to a multiple of 4 bytes */ \
		b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \
		b->elen = (b->off_ptr + 3) >> 2 << 2; \
		b->root = (kbnode_t*)xcalloc(1, b->ilen); \
		++b->n_nodes; \
		return b; \
	}
#define __kb_destroy(b) do { \
int i, max = 8; \
kbnode_t *x, **top, **stack = 0; \
if (b) { \
top = stack = (kbnode_t**)xcalloc(max, sizeof(kbnode_t*)); \
*top++ = (b)->root; \
while (top != stack) { \
x = *--top; \
if (x->is_internal == 0) { free(x); continue; } \
for (i = 0; i <= x->n; ++i) \
if (__KB_PTR(b, x)[i]) { \
if (top - stack == max) { \
max <<= 1; \
stack = (kbnode_t**)xrealloc(stack, max * sizeof(kbnode_t*)); \
top = stack + (max>>1); \
} \
*top++ = __KB_PTR(b, x)[i]; \
} \
free(x); \
} \
} \
free(b); free(stack); \
} while (0)
/*
 * Store the first key of the leftmost leaf of tree b into ret.
 *
 * The descent follows child 0 for as long as the current node is internal.
 * It tests is_internal instead of reading the child slot, because leaf nodes
 * created by a split are allocated with only elen bytes and have no
 * child-pointer area to read.
 *
 * The node's key count is not checked: on an empty tree ret receives
 * whatever is stored in slot 0 of the root.
 */
#define __kb_get_first(key_t, b, ret) do { \
		kbnode_t *__x = (b)->root; \
		while (__x->is_internal) \
			__x = __KB_PTR(b, __x)[0]; \
		(ret) = __KB_KEY(key_t, __x)[0]; \
	} while (0)
/*
 * Defines __kb_get_aux_<name>(x, k, r): locate key *k inside node x with a
 * halved linear scan.  KBTREE_INIT does not instantiate this variant; it uses
 * __KB_GET_AUX1 instead.
 *
 * One comparison against the middle key selects the lower or upper half of
 * the node; that half is then scanned from its last index downwards until a
 * key that is not greater than *k is found.
 *
 * Returns -1 for an empty node, otherwise the index where the scan stopped
 * (one below the start of the scanned half if every key there exceeds *k).
 * When r is non-null, *r receives the result of the last comparison made
 * during the scan.
 */
#define __KB_GET_AUX0(name, key_t, __cmp) \
static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
{ \
int tr, *rr, begin, end, n = x->n >> 1; \
if (x->n == 0) return -1; \
if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \
begin = 0; end = n; \
} else { begin = n; end = x->n - 1; } \
rr = r? r : &tr; \
n = end; \
while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \
return n; \
}
/*
 * Defines __kb_getp_aux_<name>(x, k, r): binary search for key *k in node x.
 *
 * Returns -1 for an empty node; otherwise the index of the last key that is
 * not greater than *k (-1 if every key is greater).  When r is non-null:
 *   - *r is set to 1 if every key in the node is smaller than *k;
 *   - otherwise *r is the comparison of *k against the first key that is
 *     not smaller than *k (0 on an exact match, negative otherwise).
 * *r is left untouched for an empty node.
 */
#define __KB_GET_AUX1(name, key_t, __cmp) \
	static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
	{ \
		key_t *keys = __KB_KEY(key_t, x); \
		int unused, lo, hi; \
		int *out = r != 0? r : &unused; \
		if (x->n == 0) return -1; \
		/* lower bound: first index whose key is not smaller than *k */ \
		for (lo = 0, hi = x->n; lo < hi; ) { \
			int mid = (lo + hi) >> 1; \
			if (__cmp(keys[mid], *k) < 0) lo = mid + 1; \
			else hi = mid; \
		} \
		if (lo == x->n) { \
			*out = 1; \
			return lo - 1; \
		} \
		*out = __cmp(*k, keys[lo]); \
		return *out < 0? lo - 1 : lo; \
	}
/*
 * Defines the lookup functions:
 *   kb_getp_<name>(b, k)  search tree b for key *k
 *   kb_get_<name>(b, k)   same, taking the key by value
 * Both return a pointer to the stored key, or 0 when no equal key exists.
 */
#define __KB_GET(name, key_t) \
	static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{ \
		kbnode_t *node; \
		int idx, cmp = 0; \
		/* walk down from the root, following the child right of slot idx */ \
		for (node = b->root; node != 0; node = __KB_PTR(b, node)[idx + 1]) { \
			idx = __kb_getp_aux_##name(node, k, &cmp); \
			if (idx >= 0 && cmp == 0) return __KB_KEY(key_t, node) + idx; \
			if (!node->is_internal) return 0; /* reached a leaf without a match */ \
		} \
		return 0; \
	} \
	static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \
	{ \
		return kb_getp_##name(b, &k); \
	}
/*
 * Defines the interval lookups:
 *   kb_intervalp_<name>(b, k, lower, upper)  bracket key *k in tree b
 *   kb_interval_<name>(b, k, lower, upper)   same, taking the key by value
 *
 * On an exact match *lower and *upper both point at the stored key.
 * Otherwise, in each node on the search path, *lower is updated to the key
 * at the slot returned by the node search (when there is one) and *upper to
 * the key in the following slot (when there is one); values from shallower
 * nodes remain when a deeper node offers no candidate.  Both start as 0.
 */
#define __KB_INTERVAL(name, key_t) \
	static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \
	{ \
		kbnode_t *node; \
		int idx, cmp = 0; \
		*upper = 0; \
		*lower = 0; \
		for (node = b->root; node != 0; node = __KB_PTR(b, node)[idx + 1]) { \
			key_t *keys = __KB_KEY(key_t, node); \
			idx = __kb_getp_aux_##name(node, k, &cmp); \
			if (idx >= 0) { \
				if (cmp == 0) { /* exact hit: both bounds are the key itself */ \
					*upper = keys + idx; \
					*lower = keys + idx; \
					return; \
				} \
				*lower = keys + idx; \
			} \
			if (idx + 1 < node->n) *upper = keys + idx + 1; \
			if (!node->is_internal) return; \
		} \
	} \
	static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \
	{ \
		kb_intervalp_##name(b, &k, lower, upper); \
	}
#define __KB_PUT(name, key_t, __cmp) \
/* x must be an internal node */ \
static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
{ \
kbnode_t *z; \
z = (kbnode_t*)xcalloc(1, y->is_internal? b->ilen : b->elen); \
++b->n_nodes; \
z->is_internal = y->is_internal; \
z->n = b->t - 1; \
memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \
if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
y->n = b->t - 1; \
memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
__KB_PTR(b, x)[i + 1] = z; \
memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \
++x->n; \
} \
static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
{ \
int i = x->n - 1; \
if (x->is_internal == 0) { \
i = __kb_getp_aux_##name(x, k, 0); \
if (i != x->n - 1) \
memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
__KB_KEY(key_t, x)[i + 1] = *k; \
++x->n; \
} else { \
i = __kb_getp_aux_##name(x, k, 0) + 1; \
if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \
__kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \
if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \
} \
__kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \
} \
} \
static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
{ \
kbnode_t *r, *s; \
++b->n_keys; \
r = b->root; \
if (r->n == 2 * b->t - 1) { \
++b->n_nodes; \
s = (kbnode_t*)xcalloc(1, b->ilen); \
b->root = s; s->is_internal = 1; s->n = 0; \
__KB_PTR(b, s)[0] = r; \
__kb_split_##name(b, s, 0, r); \
r = s; \
} \
__kb_putp_aux_##name(b, r, k); \
} \
static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \
{ \
kb_putp_##name(b, &k); \
}
#define __KB_DEL(name, key_t) \
static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
{ \
int yn, zn, i, r = 0; \
kbnode_t *xp, *y, *z; \
key_t kp; \
if (x == 0) return *k; \
if (s) { /* s can only be 0, 1 or 2 */ \
r = x->is_internal == 0? 0 : s == 1? 1 : -1; \
i = s == 1? x->n - 1 : -1; \
} else i = __kb_getp_aux_##name(x, k, &r); \
if (x->is_internal == 0) { \
if (s == 2) ++i; \
kp = __KB_KEY(key_t, x)[i]; \
memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
--x->n; \
return kp; \
} \
if (r == 0) { \
if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \
xp = __KB_PTR(b, x)[i]; \
kp = __KB_KEY(key_t, x)[i]; \
__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
return kp; \
} else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \
xp = __KB_PTR(b, x)[i + 1]; \
kp = __KB_KEY(key_t, x)[i]; \
__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
return kp; \
} else if (yn == b->t - 1 && zn == b->t - 1) { \
y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \
__KB_KEY(key_t, y)[y->n++] = *k; \
memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
y->n += z->n; \
memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
--x->n; \
free(z); \
return __kb_delp_aux_##name(b, y, k, s); \
} \
} \
++i; \
if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \
if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \
memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
__KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \
__KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
--y->n; ++xp->n; \
} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \
__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \
if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
--y->n; \
memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
} else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \
__KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \
memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
y->n += xp->n; \
memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
--x->n; \
free(xp); \
xp = y; \
} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \
__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \
if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
xp->n += y->n; \
memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
--x->n; \
free(y); \
} \
} \
return __kb_delp_aux_##name(b, xp, k, s); \
} \
static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
{ \
kbnode_t *x; \
key_t ret; \
ret = __kb_delp_aux_##name(b, b->root, k, 0); \
--b->n_keys; \
if (b->root->n == 0 && b->root->is_internal) { \
--b->n_nodes; \
x = b->root; \
b->root = __KB_PTR(b, x)[0]; \
free(x); \
} \
return ret; \
} \
static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
{ \
return kb_delp_##name(b, &k); \
}
/* One frame of the traversal stack: a node and the slot currently being processed in it. */
typedef struct {
kbnode_t *x;
int i;
} __kbstack_t;
/*
 * Visit every key of tree b in node order (child i, then key i, then child
 * i+1, ...) and call __func(&key) on each one.
 *
 * The walk uses an explicit stack of (node, slot) frames that starts with 8
 * entries and doubles when full.  Below a leaf a frame with a null node is
 * pushed; it ends the inner descent and is popped immediately.
 */
#define __kb_traverse(key_t, b, __func) do { \
int __kmax = 8; \
__kbstack_t *__kstack, *__kp; \
__kp = __kstack = (__kbstack_t*)xcalloc(__kmax, sizeof(__kbstack_t)); \
__kp->x = (b)->root; __kp->i = 0; \
for (;;) { \
/* descend: push child i of the current frame until a null node is reached */ \
while (__kp->x && __kp->i <= __kp->x->n) { \
if (__kp - __kstack == __kmax - 1) { \
__kmax <<= 1; \
__kstack = (__kbstack_t*)xrealloc(__kstack, __kmax * sizeof(__kbstack_t)); \
__kp = __kstack + (__kmax>>1) - 1; \
} \
(__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \
++__kp; \
} \
/* pop: report key i of the parent frame, then advance to its next slot */ \
--__kp; \
if (__kp >= __kstack) { \
if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
++__kp->i; \
} else break; \
} \
free(__kstack); \
} while (0)
/*
 * Instantiate a B-tree for key type key_t under the given name.
 *
 * Expands to the tree type kbtree_<name>_t plus the init, lookup, interval,
 * insert and delete functions.  __cmp(a, b) is applied to two keys and its
 * result is tested against zero (negative, zero, positive).  Node search
 * uses the binary-search helper (__KB_GET_AUX1).
 */
#define KBTREE_INIT(name, key_t, __cmp) \
__KB_TREE_T(name) \
__KB_INIT(name, key_t) \
__KB_GET_AUX1(name, key_t, __cmp) \
__KB_GET(name, key_t) \
__KB_INTERVAL(name, key_t) \
__KB_PUT(name, key_t, __cmp) \
__KB_DEL(name, key_t)
/* Node size in bytes that callers can pass to kb_init(). */
#define KB_DEFAULT_SIZE 512
/* Type-generic front end: each macro forwards to the function or type that
 * KBTREE_INIT(name, ...) generated for the given name. */
#define kbtree_t(name) kbtree_##name##_t
#define kb_init(name, s) kb_init_##name(s)
#define kb_destroy(name, b) __kb_destroy(b)
/* by-value variants: the key k is passed as a value */
#define kb_get(name, b, k) kb_get_##name(b, k)
#define kb_put(name, b, k) kb_put_##name(b, k)
#define kb_del(name, b, k) kb_del_##name(b, k)
#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
/* by-pointer variants: the key k is passed as a pointer */
#define kb_getp(name, b, k) kb_getp_##name(b, k)
#define kb_putp(name, b, k) kb_putp_##name(b, k)
#define kb_delp(name, b, k) kb_delp_##name(b, k)
#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)
/* number of keys currently stored in tree b */
#define kb_size(b) ((b)->n_keys)
/* ready-made comparators: kb_generic_cmp yields -1, 0 or 1 using the '<' operator; kb_str_cmp forwards to strcmp */
#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b)))
#define kb_str_cmp(a, b) strcmp(a, b)
#endif

282
khash.h
View File

@ -1,6 +1,6 @@
/* The MIT License
Copyright (c) 2008, 2009 by attractor <attractor@live.co.uk>
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@ -33,7 +33,6 @@ int main() {
khiter_t k;
khash_t(32) *h = kh_init(32);
k = kh_put(32, h, 5, &ret);
if (!ret) kh_del(32, h, k);
kh_value(h, k) = 10;
k = kh_get(32, h, 10);
is_missing = (k == kh_end(h));
@ -47,6 +46,29 @@ int main() {
*/
/*
2011-12-29 (0.2.7):
* Minor code clean up; no actual effect.
2011-09-16 (0.2.6):
* The capacity is a power of 2. This seems to dramatically improve the
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
- http://code.google.com/p/ulib/
- http://nothings.org/computer/judy/
* Allow to optionally use linear probing which usually has better
performance for random input. Double hashing is still the default as it
is more robust to certain non-random input.
* Added Wang's integer hash function (not used by default). This hash
function is more robust to certain non-random input.
2011-02-14 (0.2.5):
* Allow to declare global functions.
2009-09-26 (0.2.4):
* Improve portability
@ -86,11 +108,9 @@ int main() {
@header
Generic hash table library.
@copyright Heng Li
*/
#define AC_VERSION_KHASH_H "0.2.4"
#define AC_VERSION_KHASH_H "0.2.6"
#include <stdlib.h>
#include <string.h>
@ -112,24 +132,14 @@ typedef unsigned long long khint64_t;
#endif
#ifdef _MSC_VER
#define inline __inline
#define kh_inline __inline
#else
#define kh_inline inline
#endif
typedef khint32_t khint_t;
typedef khint_t khiter_t;
#define __ac_HASH_PRIME_SIZE 32
static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
{
0ul, 3ul, 11ul, 23ul, 53ul,
97ul, 193ul, 389ul, 769ul, 1543ul,
3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
3221225473ul, 4294967291ul
};
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
@ -138,88 +148,128 @@ static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
#ifdef KHASH_LINEAR
#define __ac_inc(k, m) 1
#else
#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
#endif
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
#ifndef kcalloc
#define kcalloc(N,Z) xcalloc(N,Z)
#endif
#ifndef kmalloc
#define kmalloc(Z) xmalloc(Z)
#endif
#ifndef krealloc
#define krealloc(P,Z) xrealloc(P,Z)
#endif
#ifndef kfree
#define kfree(P) free(P)
#endif
static const double __ac_HASH_UPPER = 0.77;
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
typedef struct { \
khint_t n_buckets, size, n_occupied, upper_bound; \
khint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t; \
static inline kh_##name##_t *kh_init_##name() { \
return (kh_##name##_t*)xcalloc(1, sizeof(kh_##name##_t)); \
#define __KHASH_TYPE(name, khkey_t, khval_t) \
typedef struct { \
khint_t n_buckets, size, n_occupied, upper_bound; \
khint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t;
/* Emits extern prototypes for the per-table functions (init, destroy, clear,
 * get, resize, put, del) whose names carry the `name` suffix. */
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
extern kh_##name##_t *kh_init_##name(void); \
extern void kh_destroy_##name(kh_##name##_t *h); \
extern void kh_clear_##name(kh_##name##_t *h); \
extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
extern void kh_del_##name(kh_##name##_t *h, khint_t x);
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
SCOPE kh_##name##_t *kh_init_##name(void) { \
return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
} \
static inline void kh_destroy_##name(kh_##name##_t *h) \
SCOPE void kh_destroy_##name(kh_##name##_t *h) \
{ \
if (h) { \
free(h->keys); free(h->flags); \
free(h->vals); \
free(h); \
kfree((void *)h->keys); kfree(h->flags); \
kfree((void *)h->vals); \
kfree(h); \
} \
} \
static inline void kh_clear_##name(kh_##name##_t *h) \
SCOPE void kh_clear_##name(kh_##name##_t *h) \
{ \
if (h && h->flags) { \
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
h->size = h->n_occupied = 0; \
} \
} \
static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
{ \
if (h->n_buckets) { \
khint_t inc, k, i, last; \
k = __hash_func(key); i = k % h->n_buckets; \
inc = 1 + k % (h->n_buckets - 1); last = i; \
khint_t inc, k, i, last, mask; \
mask = h->n_buckets - 1; \
k = __hash_func(key); i = k & mask; \
inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
else i += inc; \
i = (i + inc) & mask; \
if (i == last) return h->n_buckets; \
} \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
} else return 0; \
} \
static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
{ \
SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
{ /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
khint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
khint_t t = __ac_HASH_PRIME_SIZE - 1; \
while (__ac_prime_list[t] > new_n_buckets) --t; \
new_n_buckets = __ac_prime_list[t+1]; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
else { \
new_flags = (khint32_t*)xmalloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
if (h->n_buckets < new_n_buckets) { \
h->keys = (khkey_t*)xrealloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) \
h->vals = (khval_t*)xrealloc(h->vals, new_n_buckets * sizeof(khval_t)); \
} \
kroundup32(new_n_buckets); \
if (new_n_buckets < 4) new_n_buckets = 4; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
else { /* hash table size to be changed (shrink or expand); rehash */ \
new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
if (!new_flags) return -1; \
memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
if (h->n_buckets < new_n_buckets) { /* expand */ \
khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
if (!new_keys) return -1; \
h->keys = new_keys; \
if (kh_is_map) { \
khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
if (!new_vals) return -1; \
h->vals = new_vals; \
} \
} /* otherwise shrink */ \
} \
} \
if (j) { \
if (j) { /* rehashing is needed */ \
for (j = 0; j != h->n_buckets; ++j) { \
if (__ac_iseither(h->flags, j) == 0) { \
khkey_t key = h->keys[j]; \
khval_t val; \
khint_t new_mask; \
new_mask = new_n_buckets - 1; \
if (kh_is_map) val = h->vals[j]; \
__ac_set_isdel_true(h->flags, j); \
while (1) { \
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
khint_t inc, k, i; \
k = __hash_func(key); \
i = k % new_n_buckets; \
inc = 1 + k % (new_n_buckets - 1); \
while (!__ac_isempty(new_flags, i)) { \
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
else i += inc; \
} \
i = k & new_mask; \
inc = __ac_inc(k, new_mask); \
while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
__ac_set_isempty_false(new_flags, i); \
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
__ac_set_isdel_true(h->flags, i); \
} else { \
__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
} else { /* write the element and jump out of the loop */ \
h->keys[i] = key; \
if (kh_is_map) h->vals[i] = val; \
break; \
@ -227,35 +277,39 @@ static const double __ac_HASH_UPPER = 0.77;
} \
} \
} \
if (h->n_buckets > new_n_buckets) { \
h->keys = (khkey_t*)xrealloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) \
h->vals = (khval_t*)xrealloc(h->vals, new_n_buckets * sizeof(khval_t)); \
if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
} \
free(h->flags); \
kfree(h->flags); /* free the working space */ \
h->flags = new_flags; \
h->n_buckets = new_n_buckets; \
h->n_occupied = h->size; \
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
} \
return 0; \
} \
static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
{ \
khint_t x; \
if (h->n_occupied >= h->upper_bound) { \
if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
else kh_resize_##name(h, h->n_buckets + 1); \
} \
if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
if (h->n_buckets > (h->size<<1)) { \
if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
*ret = -1; return h->n_buckets; \
} \
} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
*ret = -1; return h->n_buckets; \
} \
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
{ \
khint_t inc, k, i, site, last; \
x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
if (__ac_isempty(h->flags, i)) x = i; \
khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
else { \
inc = 1 + k % (h->n_buckets - 1); last = i; \
inc = __ac_inc(k, mask); last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (__ac_isdel(h->flags, i)) site = i; \
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
else i += inc; \
i = (i + inc) & mask; \
if (i == last) { x = site; break; } \
} \
if (x == h->n_buckets) { \
@ -264,20 +318,20 @@ static const double __ac_HASH_UPPER = 0.77;
} \
} \
} \
if (__ac_isempty(h->flags, x)) { \
if (__ac_isempty(h->flags, x)) { /* not present at all */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; ++h->n_occupied; \
*ret = 1; \
} else if (__ac_isdel(h->flags, x)) { \
} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; \
*ret = 2; \
} else *ret = 0; \
} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
return x; \
} \
static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
{ \
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
__ac_set_isdel_true(h->flags, x); \
@ -285,6 +339,17 @@ static const double __ac_HASH_UPPER = 0.77;
} \
}
/* Type plus extern prototypes only: no function bodies are emitted. */
#define KHASH_DECLARE(name, khkey_t, khval_t) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_PROTOTYPES(name, khkey_t, khval_t)
/* Type plus function bodies; SCOPE is pasted in front of every function
 * definition (e.g. `static kh_inline`). */
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
__KHASH_TYPE(name, khkey_t, khval_t) \
__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/* Same as KHASH_INIT2 with SCOPE fixed to `static kh_inline`. */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/* --- BEGIN OF HASH FUNCTIONS --- */
/*! @function
@ -312,10 +377,10 @@ static const double __ac_HASH_UPPER = 0.77;
@param s Pointer to a null terminated string
@return The hash value
*/
static inline khint_t __ac_X31_hash_string(const char *s)
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
khint_t h = *s;
if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
khint_t h = (khint_t)*s;
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
return h;
}
/*! @function
@ -329,9 +394,21 @@ static inline khint_t __ac_X31_hash_string(const char *s)
*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
/* Integer mixing function: scrambles the bits of `key` through a fixed
 * sequence of shift/add/xor steps and returns the mixed value. */
static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
	khint_t mixed = key;
	mixed = mixed + ~(mixed << 15);
	mixed = mixed ^ (mixed >> 10);
	mixed = mixed + (mixed << 3);
	mixed = mixed ^ (mixed >> 6);
	mixed = mixed + ~(mixed << 11);
	mixed = mixed ^ (mixed >> 16);
	return mixed;
}
/* Hash an integer key with __ac_Wang_hash(). The argument is the macro
 * parameter `k`; the previous body referenced an unrelated identifier `key`
 * and only compiled when the caller happened to have a variable of that name. */
#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k))
/* --- END OF HASH FUNCTIONS --- */
/* Other necessary macros... */
/* Other convenient macros... */
/*!
@abstract Type of the hash table.
@ -397,7 +474,6 @@ static inline khint_t __ac_X31_hash_string(const char *s)
*/
#define kh_del(name, h, k) kh_del_##name(h, k)
/*! @function
@abstract Test whether a bucket contains data.
@param h Pointer to the hash table [khash_t(name)*]
@ -456,6 +532,34 @@ static inline khint_t __ac_X31_hash_string(const char *s)
*/
#define kh_n_buckets(h) ((h)->n_buckets)
/*! @function
@abstract Iterate over the entries in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param kvar Variable to which key will be assigned
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { /* walk every bucket index */ \
if (!kh_exist(h,__i)) continue; /* skip buckets that hold no data */ \
(kvar) = kh_key(h,__i); \
(vvar) = kh_val(h,__i); \
code; /* caller's statements; break/continue inside them act on this loop */ \
} }
/*! @function
@abstract Iterate over the values in the hash table
@param h Pointer to the hash table [khash_t(name)*]
@param vvar Variable to which value will be assigned
@param code Block of code to execute
*/
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { /* walk every bucket index */ \
if (!kh_exist(h,__i)) continue; /* skip buckets that hold no data */ \
(vvar) = kh_val(h,__i); \
code; /* caller's statements; break/continue inside them act on this loop */ \
} }
/* More convenient interfaces */
/*! @function

372
kopen.c 100644
View File

@ -0,0 +1,372 @@
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <ctype.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/wait.h>
#include <sys/types.h>
#ifndef _WIN32
#include <netdb.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#endif
#include "utils.h"
#ifdef _WIN32
#define _KO_NO_NET
#endif
#ifndef _KO_NO_NET
/* Block for at most five seconds until `fd` is ready for reading (is_read
 * non-zero) or for writing (is_read zero). Returns the value of select():
 * >0 when ready, 0 on timeout, -1 on error (after printing via perror). */
static int socket_wait(int fd, int is_read)
{
	struct timeval timeout;
	fd_set watched;
	int n_ready;

	timeout.tv_sec = 5;
	timeout.tv_usec = 0;
	FD_ZERO(&watched);
	FD_SET(fd, &watched);
	if (is_read)
		n_ready = select(fd + 1, &watched, 0, 0, &timeout);
	else
		n_ready = select(fd + 1, 0, &watched, 0, &timeout);
	if (n_ready == -1) perror("select");
	return n_ready;
}
/* Resolve host:port and open a connected TCP socket to the first address
 * returned by getaddrinfo(). Returns the socket descriptor, or -1 on any
 * failure (a diagnostic is printed to stderr). On failure nothing is leaked:
 * the socket, if already created, is closed and the address list is freed. */
static int socket_connect(const char *host, const char *port)
{
	int on = 1, fd = -1, rc;
	struct linger lng = { 0, 0 };
	struct addrinfo hints, *res = 0;

	memset(&hints, 0, sizeof(struct addrinfo));
	hints.ai_family = AF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	if ((rc = getaddrinfo(host, port, &hints, &res)) != 0) {
		/* getaddrinfo() reports through its return code, not errno, and
		 * leaves no list to free */
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
		return -1;
	}
	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) {
		perror("socket");
		goto fail;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
		perror("setsockopt");
		goto fail;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) {
		perror("setsockopt");
		goto fail;
	}
	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) {
		perror("connect");
		goto fail;
	}
	freeaddrinfo(res);
	return fd;

fail:
	if (fd != -1) close(fd); /* do not leak the half-configured socket */
	freeaddrinfo(res);
	return -1;
}
/* Write exactly `len` bytes from `buf` to `fd`, retrying after short writes
 * and after EAGAIN/EWOULDBLOCK/EINTR. Returns 0 on success and -1 on any
 * other write error. */
static int write_bytes(int fd, const char *buf, size_t len)
{
	ssize_t bytes;
	do {
		bytes = write(fd, buf, len);
		if (bytes >= 0) {
			buf += bytes; /* continue after the part already written */
			len -= (size_t)bytes;
		} else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
			return -1;
		}
	} while (len > 0);
	return 0;
}
/* Open an "http://" URL with a plain HTTP/1.0 GET, honouring the http_proxy
 * environment variable. The response header is consumed, so on success the
 * returned descriptor is positioned at the first body byte.
 * Returns the socket descriptor, or -1 when `fn` is not an http URL, the
 * connection or request fails, the header is malformed, or the status code
 * is not 200. */
static int http_open(const char *fn)
{
	char *p, *proxy, *q, *http_host, *host, *port, *path, *buf = 0;
	int fd, ret, l;
	ssize_t bytes = 0, bufsz = 0x10000;

	/* parse URL; adapted from khttp_parse_url() in knetfile.c */
	if (strstr(fn, "http://") != fn) return -1; /* 0 would be a valid descriptor (stdin) */
	// set ->http_host
	for (p = (char*)fn + 7; *p && *p != '/'; ++p);
	l = p - fn - 7;
	http_host = xcalloc(l + 1, 1);
	strncpy(http_host, fn + 7, l);
	http_host[l] = 0;
	for (q = http_host; *q && *q != ':'; ++q);
	if (*q == ':') *q++ = 0;
	// get http_proxy
	proxy = getenv("http_proxy");
	// set host, port and path
	if (proxy == 0) {
		host = xstrdup(http_host); // when there is no proxy, server name is identical to http_host name.
		port = xstrdup(*q? q : "80");
		path = xstrdup(*p? p : "/");
	} else {
		host = (strstr(proxy, "http://") == proxy)? xstrdup(proxy + 7) : xstrdup(proxy);
		for (q = host; *q && *q != ':'; ++q);
		if (*q == ':') *q++ = 0;
		port = xstrdup(*q? q : "80");
		path = xstrdup(fn);
	}

	/* connect; adapted from khttp_connect() in knetfile.c */
	fd = socket_connect(host, port);
	if (fd == -1) goto out; /* nothing to talk to */
	buf = xcalloc(bufsz, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
	l = snprintf(buf, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", path, http_host);
	/* snprintf returns the untruncated length: reject a request that did not fit */
	if (l < 0 || l >= bufsz || write_bytes(fd, buf, l) != 0) {
		close(fd);
		fd = -1;
		goto out;
	}
	l = 0;
retry:
	/* stop one byte early so the terminating NUL below stays inside buf */
	while (l < bufsz - 1 && (bytes = read(fd, buf + l, 1)) > 0) { // read HTTP header; FIXME: bad efficiency
		if (buf[l] == '\n' && l >= 3)
			if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
		++l;
	}
	if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) goto retry;
	buf[l] = 0;
	if (bytes < 0 || l < 14) { // prematured header
		close(fd);
		fd = -1;
		goto out;
	}
	ret = strtol(buf + 8, &p, 0); // HTTP return code
	if (ret != 200) {
		close(fd);
		fd = -1;
	}
out:
	free(buf); free(http_host); free(host); free(port); free(path);
	return fd;
}
/* State of one FTP control connection. */
typedef struct {
int max_response, ctrl_fd; // allocated size of `response`; control-connection descriptor
char *response; // growable buffer holding the latest reply line read from the server
} ftpaux_t;
/* Read one reply from the FTP control connection into aux->response.
 * Lines are read until one starts with three digits not followed by '-';
 * earlier lines are discarded.
 * Returns the numeric reply code, 0 when the socket does not become readable
 * within the socket_wait() timeout, or -1 when fewer than two bytes of a
 * final line were received. */
static int kftp_get_response(ftpaux_t *aux)
{
	unsigned char c;
	int n = 0;

	if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
	/* stop on EOF (0) and on error (-1); treating -1 as "got a byte" would
	 * spin forever on a broken connection */
	while (read(aux->ctrl_fd, &c, 1) == 1) { // FIXME: this is *VERY BAD* for unbuffered I/O
		if (n >= aux->max_response) {
			aux->max_response = aux->max_response? aux->max_response<<1 : 256;
			aux->response = xrealloc(aux->response, aux->max_response);
		}
		aux->response[n++] = c;
		if (c == '\n') {
			/* <ctype.h> functions need unsigned char values */
			const unsigned char *r = (const unsigned char*)aux->response;
			if (n >= 4 && isdigit(r[0]) && isdigit(r[1]) && isdigit(r[2]) && r[3] != '-') break;
			n = 0; /* not the final line: start over */
		}
	}
	if (n < 2) return -1;
	aux->response[n-2] = 0; /* overwrite the trailing "\r\n" */
	return strtol(aux->response, 0, 0);
}
/* Send `cmd` on the FTP control connection. When is_get is non-zero the
 * server reply is read and its code returned; otherwise 0 is returned once
 * the command has been written. Returns -1 if the socket is not writable or
 * the write fails. */
static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get)
{
	int writable = socket_wait(aux->ctrl_fd, 0) > 0;

	if (!writable) return -1; // socket is not ready for writing
	if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0) return -1;
	if (!is_get) return 0;
	return kftp_get_response(aux);
}
/* Open an "ftp://host/path" URL by anonymous login, passive mode and RETR.
 * Returns the descriptor of the data connection, or -1 when `fn` is not an
 * ftp URL with a path, a connection fails, the PASV reply cannot be parsed,
 * or the server does not answer RETR with code 150.
 * The control connection is closed on every path once it has been opened. */
static int ftp_open(const char *fn)
{
	char *p, *host = 0, *port = 0, *retr = 0;
	char host2[80], port2[10];
	int v[6], l, fd = -1, pasv_port;
	ftpaux_t aux;

	/* parse URL */
	if (strstr(fn, "ftp://") != fn) return -1; /* 0 would be a valid descriptor (stdin) */
	for (p = (char*)fn + 6; *p && *p != '/'; ++p);
	if (*p != '/') return -1;
	l = p - fn - 6;
	port = xstrdup("21");
	host = xcalloc(l + 1, 1);
	strncpy(host, fn + 6, l);
	retr = xcalloc(strlen(p) + 8, 1); /* "RETR " + path + "\r\n" + NUL */
	sprintf(retr, "RETR %s\r\n", p);

	/* connect to ctrl */
	memset(&aux, 0, sizeof(ftpaux_t));
	aux.ctrl_fd = socket_connect(host, port);
	if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */

	/* connect to the data stream */
	kftp_get_response(&aux);
	kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
	kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
	kftp_send_cmd(&aux, "TYPE I\r\n", 1);
	/* a positive code means a complete, NUL-terminated reply is in aux.response */
	if (kftp_send_cmd(&aux, "PASV\r\n", 1) <= 0 || aux.response == 0) goto ftp_open_end;
	for (p = aux.response; *p && *p != '('; ++p);
	if (*p != '(') goto ftp_open_end;
	++p;
	if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6)
		goto ftp_open_end;
	for (l = 0; l < 6; ++l) /* each field is one octet; anything else is garbage */
		if (v[l] < 0 || v[l] > 255) goto ftp_open_end;
	pasv_port = (v[4] << 8) + v[5];
	kftp_send_cmd(&aux, retr, 0);
	snprintf(host2, sizeof(host2), "%d.%d.%d.%d", v[0], v[1], v[2], v[3]);
	snprintf(port2, sizeof(port2), "%d", pasv_port);
	fd = socket_connect(host2, port2);
	if (fd == -1) goto ftp_open_end;
	if (kftp_get_response(&aux) != 150) {
		close(fd);
		fd = -1;
	}

ftp_open_end:
	if (aux.ctrl_fd != -1) close(aux.ctrl_fd);
	free(host); free(port); free(retr); free(aux.response);
	return fd;
}
#endif /* !defined(_KO_NO_NET) */
/* Split a command line on whitespace into a NULL-terminated argv array.
 * No quoting or escaping is interpreted. Returns 0 when `cmd` is empty or
 * all whitespace. argv[0] points to a single buffer holding every word, so
 * the caller releases the result with free(argv[0]); free(argv). */
static char **cmd2argv(const char *cmd)
{
	/* <ctype.h> classifiers need unsigned char values; plain char may be negative */
	const unsigned char *ucmd = (const unsigned char*)cmd;
	int i, beg, end, argc;
	char **argv, *str;

	/* trim trailing, then leading whitespace: [beg, end) is the payload */
	end = strlen(cmd);
	for (i = end - 1; i >= 0; --i)
		if (!isspace(ucmd[i])) break;
	end = i + 1;
	for (beg = 0; beg < end; ++beg)
		if (!isspace(ucmd[beg])) break;
	if (beg == end) return 0;
	/* count word-to-space transitions; there is one more word than that */
	for (i = beg + 1, argc = 0; i < end; ++i)
		if (isspace(ucmd[i]) && !isspace(ucmd[i-1]))
			++argc;
	argv = (char**)xcalloc(argc + 2, sizeof(void*)); /* words + NULL terminator */
	argv[0] = str = (char*)xcalloc(end - beg + 1, 1);
	strncpy(argv[0], cmd + beg, end - beg);
	/* turn separators into NULs and record where each later word starts */
	for (i = argc = 1; i < end - beg; ++i)
		if (isspace((unsigned char)str[i])) str[i] = 0;
		else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i];
	return argv;
}
/* Input source kinds stored in koaux_t.type by kopen(). */
#define KO_STDIN 1
#define KO_FILE 2
#define KO_PIPE 3
#define KO_HTTP 4
#define KO_FTP 5
/* Handle allocated by kopen() and released by kclose(). */
typedef struct {
int type, fd; // one of the KO_* kinds; descriptor reported to the caller
pid_t pid; // child process id, assigned only for KO_PIPE
} koaux_t;
/* Open an input source and report a readable descriptor through *_fd.
 *   "-"            standard input
 *   "http://..."   HTTP download   (unless _KO_NO_NET is defined)
 *   "ftp://..."    FTP download    (unless _KO_NO_NET is defined)
 *   "<command"     standard output of `command`, run in a child process
 *   anything else  a regular file opened read-only
 * Returns an opaque handle to pass to kclose(), or NULL (with *_fd == -1)
 * when a file cannot be opened or a pipe/child cannot be created. For the
 * network sources a handle is returned even if the connection failed; the
 * caller sees that as *_fd == -1. */
void *kopen(const char *fn, int *_fd)
{
	koaux_t *aux = 0;
	*_fd = -1;
	if (strcmp(fn, "-") == 0) {
		aux = xcalloc(1, sizeof(koaux_t));
		aux->type = KO_STDIN;
		aux->fd = STDIN_FILENO;
#ifndef _KO_NO_NET /* http_open()/ftp_open() only exist when networking is compiled in */
	} else if (strstr(fn, "http://") == fn) {
		aux = xcalloc(1, sizeof(koaux_t));
		aux->type = KO_HTTP;
		aux->fd = http_open(fn);
	} else if (strstr(fn, "ftp://") == fn) {
		aux = xcalloc(1, sizeof(koaux_t));
		aux->type = KO_FTP;
		aux->fd = ftp_open(fn);
#endif
	} else {
		const char *p, *q;
		for (p = fn; *p; ++p)
			if (!isspace((unsigned char)*p)) break;
		if (*p == '<') { // pipe open
			int need_shell, pfd[2];
			pid_t pid;
			// a simple check to see if we need to invoke a shell; not always working
			for (q = p + 1; *q; ++q)
				if (ispunct((unsigned char)*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
					break;
			need_shell = (*q != 0);
			if (pipe(pfd) != 0) return 0;
			pid = vfork();
			if (pid == -1) { /* vfork() error */
				close(pfd[0]); close(pfd[1]);
				return 0;
			}
			if (pid == 0) { /* the child process */
				char **argv; /* FIXME: I do not know if this will lead to a memory leak */
				close(pfd[0]);
				dup2(pfd[1], STDOUT_FILENO);
				close(pfd[1]);
				if (!need_shell) {
					argv = cmd2argv(p + 1);
					if (argv) { /* NULL when the command is empty */
						execvp(argv[0], argv);
						free(argv[0]); free(argv); /* only reached if exec failed */
					}
				} else execl("/bin/sh", "sh", "-c", p + 1, NULL);
				/* a vfork child must not run the parent's atexit handlers
				 * or flush its stdio buffers */
				_exit(1);
			} else { /* parent process */
				close(pfd[1]);
				aux = xcalloc(1, sizeof(koaux_t));
				aux->type = KO_PIPE;
				aux->fd = pfd[0];
				aux->pid = pid;
			}
		} else {
#ifdef _WIN32
			*_fd = open(fn, O_RDONLY | O_BINARY);
#else
			*_fd = open(fn, O_RDONLY);
#endif
			if (*_fd >= 0) { /* open() fails with -1; 0 is a valid descriptor */
				aux = xcalloc(1, sizeof(koaux_t));
				aux->type = KO_FILE;
				aux->fd = *_fd;
			}
		}
	}
	if (aux) *_fd = aux->fd; /* aux stays NULL when the file could not be opened */
	return aux;
}
/* Release a handle returned by kopen(). For a pipe source, a child that has
 * not yet exited is sent SIGTERM. The descriptor itself is not closed here.
 * A NULL handle (kopen() failure) is accepted and ignored. Always returns 0. */
int kclose(void *a)
{
	koaux_t *aux = (koaux_t*)a;
	if (aux == 0) return 0;
	if (aux->type == KO_PIPE) {
		int status;
		pid_t pid;
		pid = waitpid(aux->pid, &status, WNOHANG); /* reap it if it already finished */
		if (pid != aux->pid) kill(aux->pid, SIGTERM);
	}
	free(aux);
	return 0;
}
#ifdef _KO_MAIN
#define BUF_SIZE 0x10000
/* Test driver (built only with _KO_MAIN): copy the source named by argv[1]
 * to standard output through kopen(). */
int main(int argc, char *argv[])
{
	unsigned char buf[BUF_SIZE];
	void *handle;
	FILE *in;
	int n_read, fd;

	if (argc == 1) {
		fprintf(stderr, "Usage: kopen <file>\n");
		return 1;
	}
	handle = kopen(argv[1], &fd);
	in = fdopen(fd, "r");
	if (in == 0) {
		fprintf(stderr, "ERROR: fail to open the input\n");
		return 1;
	}
	for (;;) {
		n_read = fread(buf, 1, BUF_SIZE, in);
		if (n_read != 0)
			fwrite(buf, 1, n_read, stdout);
		if (n_read != BUF_SIZE) break; /* a short read ends the copy */
	}
	fclose(in);
	kclose(handle);
	return 0;
}
#endif

137
kseq.h
View File

@ -1,6 +1,6 @@
/* The MIT License
Copyright (c) 2008, by Heng Li <lh3@sanger.ac.uk>
Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@ -23,6 +23,8 @@
SOFTWARE.
*/
/* Last Modified: 05MAR2012 */
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
@ -31,9 +33,14 @@
#include <stdlib.h>
#include "utils.h"
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB 1 // isspace() && !' '
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX 2
#define __KS_TYPE(type_t) \
typedef struct __kstream_t { \
char *buf; \
unsigned char *buf; \
int begin, end, is_eof; \
type_t f; \
} kstream_t;
@ -46,7 +53,7 @@
{ \
kstream_t *ks = (kstream_t*)xcalloc(1, sizeof(kstream_t)); \
ks->f = f; \
ks->buf = (char*)xmalloc(__bufsize); \
ks->buf = (unsigned char*)xmalloc(__bufsize); \
return ks; \
} \
static inline void ks_destroy(kstream_t *ks) \
@ -83,10 +90,10 @@ typedef struct __kstring_t {
#endif
#define __KS_GETUNTIL(__read, __bufsize) \
static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
if (dret) *dret = 0; \
str->l = 0; \
str->l = append? str->l : 0; \
if (ks->begin >= ks->end && ks->is_eof) return -1; \
for (;;) { \
int i; \
@ -98,14 +105,20 @@ typedef struct __kstring_t {
if (ks->end == 0) break; \
} else break; \
} \
if (delimiter) { \
if (delimiter == KS_SEP_LINE) { \
for (i = ks->begin; i < ks->end; ++i) \
if (ks->buf[i] == '\n') break; \
} else if (delimiter > KS_SEP_MAX) { \
for (i = ks->begin; i < ks->end; ++i) \
if (ks->buf[i] == delimiter) break; \
} else { \
} else if (delimiter == KS_SEP_SPACE) { \
for (i = ks->begin; i < ks->end; ++i) \
if (isspace(ks->buf[i])) break; \
} \
if (str->m - str->l < i - ks->begin + 1) { \
} else if (delimiter == KS_SEP_TAB) { \
for (i = ks->begin; i < ks->end; ++i) \
if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
} else i = 0; /* never come to here! */ \
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
str->m = str->l + (i - ks->begin) + 1; \
kroundup32(str->m); \
str->s = (char*)xrealloc(str->s, str->m); \
@ -118,9 +131,15 @@ typedef struct __kstring_t {
break; \
} \
} \
if (str->s == 0) { \
str->m = 1; \
str->s = (char*)xcalloc(1, 1); \
} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
str->s[str->l] = '\0'; \
return str->l; \
}
} \
static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
#define KSTREAM_INIT(type_t, __read, __bufsize) \
__KS_TYPE(type_t) \
@ -128,19 +147,16 @@ typedef struct __kstring_t {
__KS_GETC(__read, __bufsize) \
__KS_GETUNTIL(__read, __bufsize)
#define __KSEQ_BASIC(type_t) \
static inline kseq_t *kseq_init(type_t fd) \
/* Zero the reader's last_char and the stream's is_eof/begin/end fields. Only
 * these fields are cleared; the macro performs no operation on the
 * underlying file handle. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
#define __KSEQ_BASIC(SCOPE, type_t) \
SCOPE kseq_t *kseq_init(type_t fd) \
{ \
kseq_t *s = (kseq_t*)xcalloc(1, sizeof(kseq_t)); \
s->f = ks_init(fd); \
return s; \
} \
static inline void kseq_rewind(kseq_t *ks) \
{ \
ks->last_char = 0; \
ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
} \
static inline void kseq_destroy(kseq_t *ks) \
SCOPE void kseq_destroy(kseq_t *ks) \
{ \
if (!ks) return; \
free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
@ -153,44 +169,46 @@ typedef struct __kstring_t {
-1 end-of-file
-2 truncated quality string
*/
#define __KSEQ_READ \
static int kseq_read(kseq_t *seq) \
{ \
int c; \
kstream_t *ks = seq->f; \
#define __KSEQ_READ(SCOPE) \
SCOPE int kseq_read(kseq_t *seq) \
{ \
int c; \
kstream_t *ks = seq->f; \
if (seq->last_char == 0) { /* then jump to the next header line */ \
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
if (c == -1) return -1; /* end of file */ \
seq->last_char = c; \
} /* the first header char has been read */ \
seq->comment.l = seq->seq.l = seq->qual.l = 0; \
if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
if (c == -1) return -1; /* end of file */ \
seq->last_char = c; \
} /* else: the first header char has been read in the previous call */ \
seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
seq->seq.m = 256; \
seq->seq.s = (char*)xmalloc(seq->seq.m); \
} \
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
if (isgraph(c)) { /* printable non-space character */ \
if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
seq->seq.m = seq->seq.l + 2; \
kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
seq->seq.s = (char*)xrealloc(seq->seq.s, seq->seq.m); \
} \
seq->seq.s[seq->seq.l++] = (char)c; \
} \
} \
if (c == '\n') continue; /* skip empty lines */ \
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
} \
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
if (c != '+') return seq->seq.l; /* FASTA */ \
if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
seq->qual.m = seq->seq.m; \
seq->qual.s = (char*)xrealloc(seq->qual.s, seq->qual.m); \
} \
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
seq->seq.m = seq->seq.l + 2; \
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
seq->seq.s = (char*)xrealloc(seq->seq.s, seq->seq.m); \
} \
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
if (c != '+') return seq->seq.l; /* FASTA */ \
if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
seq->qual.m = seq->seq.m; \
seq->qual.s = (char*)xrealloc(seq->qual.s, seq->qual.m); \
} \
while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
if (c == -1) return -2; /* we should not stop here */ \
while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
if (c == -1) return -2; /* error: no quality string */ \
while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
seq->last_char = 0; /* we have not come to the next header line */ \
if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
return seq->seq.l; \
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
return seq->seq.l; \
}
#define __KSEQ_TYPE(type_t) \
@ -200,10 +218,19 @@ typedef struct __kstring_t {
kstream_t *f; \
} kseq_t;
#define KSEQ_INIT(type_t, __read) \
KSTREAM_INIT(type_t, __read, 4096) \
#define KSEQ_INIT2(SCOPE, type_t, __read) \
KSTREAM_INIT(type_t, __read, 16384) \
__KSEQ_TYPE(type_t) \
__KSEQ_BASIC(type_t) \
__KSEQ_READ
__KSEQ_BASIC(SCOPE, type_t) \
__KSEQ_READ(SCOPE)
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
/* Declares the kstream_t and kseq_t types plus prototypes for kseq_init(),
 * kseq_destroy() and kseq_read(), without emitting any function bodies. */
#define KSEQ_DECLARE(type_t) \
__KS_TYPE(type_t) \
__KSEQ_TYPE(type_t) \
extern kseq_t *kseq_init(type_t fd); \
void kseq_destroy(kseq_t *ks); \
int kseq_read(kseq_t *seq);
#endif

View File

@ -140,7 +140,7 @@ typedef struct {
tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
} \
} \
inline void __ks_insertsort_##name(type_t *s, type_t *t) \
static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
{ \
type_t *i, *j, swap_tmp; \
for (i = s + 1; i < t; ++i) \

View File

@ -27,7 +27,7 @@ int ksprintf(kstring_t *s, const char *fmt, ...)
int main()
{
kstring_t *s;
s = (kstring_t*)calloc(1, sizeof(kstring_t));
s = (kstring_t*)xcalloc(1, sizeof(kstring_t));
ksprintf(s, "abcdefg: %d", 100);
printf("%s\n", s->s);
free(s);

View File

@ -17,19 +17,33 @@ typedef struct __kstring_t {
} kstring_t;
#endif
static inline int kputs(const char *p, kstring_t *s)
/* Grow the buffer of `s` so that its capacity is at least `size` bytes
 * (rounded by kroundup32). Does nothing when the capacity already suffices. */
static inline void ks_resize(kstring_t *s, size_t size)
{
	if (s->m >= size) return;
	s->m = size;
	kroundup32(s->m);
	s->s = (char*)xrealloc(s->s, s->m);
}
static inline int kputsn(const char *p, int l, kstring_t *s)
{
int l = strlen(p);
if (s->l + l + 1 >= s->m) {
s->m = s->l + l + 2;
kroundup32(s->m);
s->s = (char*)xrealloc(s->s, s->m);
}
strcpy(s->s + s->l, p);
memcpy(s->s + s->l, p, l);
s->l += l;
s->s[s->l] = 0;
return l;
}
/* Append the NUL-terminated string `p` to `s` by delegating to kputsn()
 * with strlen(p); returns whatever kputsn() returns. */
static inline int kputs(const char *p, kstring_t *s)
{
return kputsn(p, strlen(p), s);
}
static inline int kputc(int c, kstring_t *s)
{
if (s->l + 1 >= s->m) {
@ -42,6 +56,40 @@ static inline int kputc(int c, kstring_t *s)
return c;
}
/* Append the decimal representation of the signed integer `c` to `s`,
 * growing the buffer as needed and keeping it NUL-terminated.
 * Returns 0, except for c == 0 where the result of kputc('0', s) is returned. */
static inline int kputw(int c, kstring_t *s)
{
	char buf[16];
	int l, i;
	unsigned x;
	if (c == 0) return kputc('0', s);
	/* take the magnitude in unsigned arithmetic: -c overflows for INT_MIN */
	x = c < 0? 0u - (unsigned)c : (unsigned)c;
	for (l = 0; x > 0; x /= 10) buf[l++] = x%10 + '0'; /* digits, least significant first */
	if (c < 0) buf[l++] = '-';
	if (s->l + l + 1 >= s->m) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)xrealloc(s->s, s->m);
	}
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; /* copy out in reverse */
	s->s[s->l] = 0;
	return 0;
}
/* Append the decimal representation of the unsigned integer `c` to `s`,
 * growing the buffer as needed and keeping it NUL-terminated.
 * Returns 0, except for c == 0 where the result of kputc('0', s) is returned. */
static inline int kputuw(unsigned c, kstring_t *s)
{
	char digits[16];
	int n = 0;
	unsigned rest = c;
	if (c == 0) return kputc('0', s);
	do { /* collect digits, least significant first */
		digits[n++] = rest % 10 + '0';
		rest /= 10;
	} while (rest > 0);
	if (s->l + n + 1 >= s->m) {
		s->m = s->l + n + 2;
		kroundup32(s->m);
		s->s = (char*)xrealloc(s->s, s->m);
	}
	while (n > 0) s->s[s->l++] = digits[--n]; /* emit most significant first */
	s->s[s->l] = 0;
	return 0;
}
int ksprintf(kstring_t *s, const char *fmt, ...);
#endif

378
ksw.c
View File

@ -23,7 +23,6 @@
SOFTWARE.
*/
#ifndef _NO_SSE2
#include <stdlib.h>
#include <stdint.h>
#include <emmintrin.h>
@ -38,22 +37,35 @@
#define UNLIKELY(x) (x)
#endif
struct _ksw_query_t {
const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
struct _kswq_t {
int qlen, slen;
uint8_t shift, mdiff, max, size;
__m128i *qp, *H0, *H1, *E, *Hmax;
};
ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
/**
* Initialize the query data structure
*
* @param size Number of bytes used to store a score; valid values are 1 or 2
* @param qlen Length of the query sequence
* @param query Query sequence
* @param m Size of the alphabet
* @param mat Scoring matrix in a one-dimension array
*
* @return Query data structure
*/
kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
{
ksw_query_t *q;
kswq_t *q;
int slen, a, tmp, p;
size = size > 1? 2 : 1;
p = 8 * (3 - size); // # values per __m128i
slen = (qlen + p - 1) / p; // segmented length
q = xmalloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory
q = (kswq_t*)xmalloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
q->H0 = q->qp + slen * m;
q->H1 = q->H0 + slen;
q->E = q->H1 + slen;
@ -92,11 +104,12 @@ ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const in
return q;
}
int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0;
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_16(ret, xx) do { \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
@ -107,10 +120,13 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) /
} while (0)
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
gapoe = _mm_set1_epi8(a->gapo + a->gape);
gape = _mm_set1_epi8(a->gape);
gapoe = _mm_set1_epi8(_gapo + _gape);
gape = _mm_set1_epi8(_gape);
shift = _mm_set1_epi8(q->shift);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
@ -166,11 +182,11 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) /
end_loop16:
//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
__max_16(imax, max); // imax is the maximum number in max
if (imax >= a->T) { // write the b array; this condition adds branching unfornately
if (imax >= minsc) { // write the b array; this condition adds branching unfornately
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = xrealloc(b, 8 * m_b);
b = (uint64_t*)xrealloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
@ -179,34 +195,38 @@ end_loop16:
gmax = imax; te = i; // te is the end position on the target
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax + q->shift >= 255) break;
if (gmax + q->shift >= 255 || gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S; // swap H0 and H1
}
a->score = gmax; a->te = te;
{ // get a->qe, the end of query match; find the 2nd best score
r.score = gmax + q->shift < 255? gmax : 255;
r.te = te;
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
int max = -1, low, high, qlen = slen * 16;
uint8_t *t = (uint8_t*)Hmax;
for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen;
for (i = 0; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
//printf("%d,%d\n", max, gmax);
i = (a->score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0, a->score2 = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
a->score2 = b[i]>>32, a->te2 = e;
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
return a->score + q->shift >= 255? 255 : a->score;
return r;
}
int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0;
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_8(ret, xx) do { \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
@ -216,10 +236,13 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) //
} while (0)
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
gapoe = _mm_set1_epi16(a->gapo + a->gape);
gape = _mm_set1_epi16(a->gape);
gapoe = _mm_set1_epi16(_gapo + _gape);
gape = _mm_set1_epi16(_gape);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
@ -261,11 +284,11 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) //
}
end_loop8:
__max_8(imax, max);
if (imax >= a->T) {
if (imax >= minsc) {
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = xrealloc(b, 8 * m_b);
b = (uint64_t*)xrealloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
@ -274,31 +297,238 @@ end_loop8:
gmax = imax; te = i;
for (j = 0; LIKELY(j < slen); ++j)
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S;
}
a->score = gmax; a->te = te;
r.score = gmax; r.te = te;
{
int max = -1, low, high, qlen = slen * 8;
uint16_t *t = (uint16_t*)Hmax;
for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen;
i = (a->score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0, a->score2 = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
a->score2 = b[i]>>32, a->te2 = e;
for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
return a->score;
return r;
}
int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a)
static void revseq(int l, uint8_t *s)
{
if (q->size == 1) return ksw_sse2_16(q, tlen, target, a);
else return ksw_sse2_8(q, tlen, target, a);
int i, t;
for (i = 0; i < l>>1; ++i)
t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
}
/**
 * Align a query against a target with the striped SSE2 kernels.
 *
 * The forward pass yields the best score and the end coordinates. When
 * KSW_XSTART is set (and, if KSW_XSUBO is also set, the score reaches
 * xtra&0xffff), both sequences are reversed in place up to their end
 * positions, aligned a second time with KSW_XSTOP at the forward score, and
 * then reversed back, so the caller sees them unchanged on return.
 *
 * @param qry  optional query-profile cache: a non-NULL *qry is reused, a NULL
 *             *qry receives the freshly built profile (caller frees it), and
 *             qry==NULL makes this function free the profile itself
 *
 * @return the forward-pass result; tb/qb are overwritten only when the
 *         reverse pass reproduces the forward score
 */
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
{
	kswr_t (*sw)(kswq_t*, int, const uint8_t*, int, int, int);
	kswq_t *prof;
	kswr_t fwd, bwd;
	int width;

	// pick up the caller's cached profile, or build one (and hand it back if asked)
	if (qry && *qry) {
		prof = *qry;
	} else {
		prof = ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
		if (qry) *qry = prof;
	}
	width = prof->size;
	sw = width == 2? ksw_i16 : ksw_u8; // kernel matching the profile's score width

	// forward pass: best score and end positions
	fwd = sw(prof, tlen, target, gapo, gape, xtra);
	if (!qry) free(prof); // nobody else holds the profile

	if (!(xtra&KSW_XSTART)) return fwd; // start positions not requested
	if ((xtra&KSW_XSUBO) && fwd.score < (xtra&0xffff)) return fwd; // below the threshold

	// reverse pass over the prefixes ending at (qe, te); +1 because qe/te are inclusive
	revseq(fwd.qe + 1, query);
	revseq(fwd.te + 1, target);
	prof = ksw_qinit(width, fwd.qe + 1, query, m, mat);
	bwd = sw(prof, tlen, target, gapo, gape, KSW_XSTOP | fwd.score);
	revseq(fwd.qe + 1, query); // restore the caller's buffers
	revseq(fwd.te + 1, target);
	free(prof);

	if (bwd.score == fwd.score) {
		fwd.tb = fwd.te - bwd.te;
		fwd.qb = fwd.qe - bwd.qe;
	}
	return fwd;
}
/********************
*** SW extension ***
********************/
// One cell of the rolling DP row used by ksw_extend() and ksw_global():
// h holds an H (best) score and e holds an E (gap) score for that query column.
typedef struct {
int32_t h, e;
} eh_t;
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle)
{
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
if (h0 < 0) h0 = 0;
// allocate memory
qp = xmalloc(qlen * m);
eh = xcalloc(qlen + 1, 8);
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
}
// fill the first row
eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
eh[j].h = eh[j-1].h - gape;
// adjust $w if it is too large
k = m * m;
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i]? max : mat[i];
max_gap = (int)((double)(qlen * max - gapo) / gape + 1.);
max_gap = max_gap > 1? max_gap : 1;
w = w < max_gap? w : max_gap;
// DP loop
max = h0, max_i = max_j = -1;
beg = 0, end = qlen;
for (i = 0; LIKELY(i < tlen); ++i) {
int f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[target[i] * qlen];
// compute the first column
h1 = h0 - (gapo + gape * (i + 1));
if (h1 < 0) h1 = 0;
// apply the band and the constraint (if provided)
if (beg < i - w) beg = i - w;
if (end > i + w + 1) end = i + w + 1;
if (end > qlen) end = qlen;
for (j = beg; LIKELY(j < end); ++j) {
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
eh_t *p = &eh[j];
int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
p->h = h1; // set H(i,j-1) for the next row
h += q[j];
h = h > e? h : e;
h = h > f? h : f;
h1 = h; // save H(i,j) to h1 for the next column
mj = m > h? mj : j;
m = m > h? m : h; // m is stored at eh[mj+1]
h -= gapoe;
h = h > 0? h : 0;
e -= gape;
e = e > h? e : h; // computed E(i+1,j)
p->e = e; // save E(i+1,j) for the next row
f -= gape;
f = f > h? f : h; // computed F(i,j+1)
}
eh[end].h = h1; eh[end].e = 0;
if (m == 0) break;
if (m > max) max = m, max_i = i, max_j = mj;
// update beg and end for the next round
for (j = mj; j >= beg && eh[j].h; --j);
beg = j + 1;
for (j = mj + 2; j <= end && eh[j].h; ++j);
end = j;
//beg = 0; end = qlen; // uncomment this line for debugging
}
free(eh); free(qp);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
return max;
}
/********************
* Global alignment *
********************/
#define MINUS_INF -0x40000000
/**
 * Append `len` operations of type `op` to a packed CIGAR array (len<<4|op).
 *
 * If the last stored entry already has the same op, its length is extended in
 * place; otherwise a new entry is appended, doubling the capacity (starting
 * at 4 entries) when the array is full.
 *
 * @return the CIGAR array, which may have been reallocated
 */
static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len)
{
	int n = *n_cigar;

	// same operation as the previous entry: just lengthen it
	if (n != 0 && op == (cigar[n - 1]&0xf)) {
		cigar[n - 1] += len<<4;
		return cigar;
	}
	// otherwise start a new entry, growing the buffer first if it is full
	if (n == *m_cigar) {
		int cap = *m_cigar? (*m_cigar)<<1 : 4;
		*m_cigar = cap;
		cigar = xrealloc(cigar, cap << 2); // 4 bytes per entry
	}
	cigar[n] = len<<4 | op;
	*n_cigar = n + 1;
	return cigar;
}
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
{
eh_t *eh;
int8_t *qp; // query profile
int i, j, k, gapoe = gapo + gape, score, n_col;
uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
if (n_cigar_) *n_cigar_ = 0;
// allocate memory
n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
z = xmalloc(n_col * tlen);
qp = xmalloc(qlen * m);
eh = xcalloc(qlen + 1, 8);
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
}
// fill the first row
eh[0].h = 0; eh[0].e = MINUS_INF;
for (j = 1; j <= qlen && j <= w; ++j)
eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
// DP loop
for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
int32_t f = MINUS_INF, h1, beg, end;
int8_t *q = &qp[target[i] * qlen];
uint8_t *zi = &z[i * n_col];
beg = i > w? i - w : 0;
end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF;
for (j = beg; LIKELY(j < end); ++j) {
// This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
// 1) not checking h>0; 2) recording direction for backtracking
eh_t *p = &eh[j];
int32_t h = p->h, e = p->e;
uint8_t d; // direction
p->h = h1;
h += q[j];
d = h >= e? 0 : 1;
h = h >= e? h : e;
d = h >= f? d : 2;
h = h >= f? h : f;
h1 = h;
h -= gapoe;
e -= gape;
d |= e > h? 1<<2 : 0;
e = e > h? e : h;
p->e = e;
f -= gape;
d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
f = f > h? f : h;
zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
}
eh[end].h = h1; eh[end].e = MINUS_INF;
}
score = eh[qlen].h;
if (n_cigar_ && cigar_) { // backtrack
int n_cigar = 0, m_cigar = 0, which = 0;
uint32_t *cigar = 0, tmp;
i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
while (i >= 0 && k >= 0) {
which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
}
if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
*n_cigar_ = n_cigar, *cigar_ = cigar;
}
free(eh); free(qp); free(z);
return score;
}
/*******************************************
@ -334,30 +564,33 @@ unsigned char seq_nt4_table[256] = {
int main(int argc, char *argv[])
{
int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2;
int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
int8_t mat[25];
ksw_aux_t a;
int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
uint8_t *rseq = 0;
gzFile fpt, fpq;
kseq_t *kst, *ksq;
// parse command line
a.gapo = 5; a.gape = 2; a.T = 10;
while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) {
while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
switch (c) {
case 'a': sa = atoi(optarg); break;
case 'b': sb = atoi(optarg); break;
case 'q': a.gapo = atoi(optarg); break;
case 'r': a.gape = atoi(optarg); break;
case 't': a.T = atoi(optarg); break;
case 'q': gapo = atoi(optarg); break;
case 'r': gape = atoi(optarg); break;
case 't': minsc = atoi(optarg); break;
case 'f': forward_only = 1; break;
case 's': size = atoi(optarg); break;
case '1': xtra |= KSW_XBYTE; break;
}
}
if (optind + 2 > argc) {
fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] <target.fa> <query.fa>\n", size, sa, sb, a.gapo, a.gape);
fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
return 1;
}
if (minsc > 0xffff) minsc = 0xffff;
xtra |= KSW_XSUBO | minsc;
// initialize scoring matrix
for (i = k = 0; i < 5; ++i) {
for (i = k = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
mat[k++] = i == j? sa : -sb;
mat[k++] = 0; // ambiguous base
@ -368,35 +601,34 @@ int main(int argc, char *argv[])
fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
// all-pair alignment
while (kseq_read(ksq) > 0) {
ksw_query_t *q[2];
for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
kswq_t *q[2] = {0, 0};
kswr_t r;
for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
if (!forward_only) { // reverse
for (i = 0; i < ksq->seq.l/2; ++i) {
int t = ksq->seq.s[i];
ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i];
ksq->seq.s[ksq->seq.l-1-i] = t;
if ((int)ksq->seq.m > max_rseq) {
max_rseq = ksq->seq.m;
rseq = (uint8_t*)xrealloc(rseq, max_rseq);
}
for (i = 0; i < ksq->seq.l; ++i)
ksq->seq.s[i] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
} else q[1] = 0;
for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
}
gzrewind(fpt); kseq_rewind(kst);
while (kseq_read(kst) > 0) {
int s;
for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a);
printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
if (q[1]) {
s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a);
printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
if (r.score >= minsc)
err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
if (rseq) {
r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
if (r.score >= minsc)
err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
}
}
free(q[0]); free(q[1]);
}
free(rseq);
kseq_destroy(kst); err_gzclose(fpt);
kseq_destroy(ksq); err_gzclose(fpq);
return 0;
}
#endif // _KSW_MAIN
#endif // _NO_SSE2
#endif

84
ksw.h
View File

@ -1,51 +1,69 @@
#ifndef __AC_KSW_H
#define __AC_KSW_H
struct _ksw_query_t;
typedef struct _ksw_query_t ksw_query_t;
#include <stdint.h>
#define KSW_XBYTE 0x10000
#define KSW_XSTOP 0x20000
#define KSW_XSUBO 0x40000
#define KSW_XSTART 0x80000
struct _kswq_t;
typedef struct _kswq_t kswq_t;
typedef struct {
// input
unsigned gapo, gape; // the first gap costs gapo+gape
unsigned T; // threshold
// output
int score, te, qe, score2, te2;
} ksw_aux_t;
int score; // best score
int te, qe; // target end and query end
int score2, te2; // second best score and ending position on the target
int tb, qb; // target start and query start
} kswr_t;
#ifdef __cplusplus
extern "C" {
#endif
/**
* Initialize the query data structure
* Aligning two sequences
*
* @param size Number of bytes used to store a score; valid valures are 1 or 2
* @param qlen Length of the query sequence
* @param query Query sequence
* @param m Size of the alphabet
* @param mat Scoring matrix in a one-dimension array
* @param qlen length of the query sequence (typically <tlen)
* @param query query sequence with 0 <= query[i] < m
* @param tlen length of the target sequence
* @param target target sequence
* @param m number of residue types
* @param mat m*m scoring matrix in a one-dimensional array
* @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)"
* @param gape gap extension penalty
* @param xtra extra information (see below)
* @param qry query profile (see below)
*
* @return Query data structure
* @return alignment information in a struct; unset values to -1
*
* When xtra==0, ksw_align() uses a signed two-byte integer to store a
* score and only finds the best score and the end positions. The 2nd best
* score or the start positions are not attempted. The default behavior can
* be tuned by setting KSW_X* flags:
*
* KSW_XBYTE: use an unsigned byte to store a score. If overflow occurs,
* kswr_t::score will be set to 255
*
* KSW_XSUBO: track the 2nd best score and the ending position on the
* target if the 2nd best is higher than (xtra&0xffff)
*
* KSW_XSTOP: stop if the maximum score is above (xtra&0xffff)
*
* KSW_XSTART: find the start positions
*
* When *qry==NULL, ksw_align() will compute and allocate the query profile
* and when the function returns, *qry will point to the profile, which can
* be deallocated simply by free(). If one query is aligned against multiple
* target sequences, *qry should be set to NULL during the first call and
* freed after the last call. Note that qry can equal 0. In this case, the
* query profile will be deallocated in ksw_align().
*/
ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat); // to free, simply call free()
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
/**
* Compute the maximum local score for queries initialized with ksw_qinit(1, ...)
*
* @param q Query data structure returned by ksw_qinit(1, ...)
* @param tlen Length of the target sequence
* @param target Target sequence
* @param a Auxiliary data structure (see ksw.h)
*
* @return The maximum local score; if the returned value equals 255, the SW may not be finished
*/
int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
/** Compute the maximum local score for queries initialized with ksw_qinit(2, ...) */
int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
/** Unified interface for ksw_sse2_8() and ksw_sse2_16() */
int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle);
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar);
#ifdef __cplusplus
}

12
kvec.h
View File

@ -1,6 +1,6 @@
/* The MIT License
Copyright (c) 2008, by Attractive Chaos <attractivechaos@aol.co.uk>
Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@ -76,15 +76,15 @@ int main() {
(v).a[(v).n++] = (x); \
} while (0)
#define kv_pushp(type, v) (((v).n == (v).m)? \
#define kv_pushp(type, v) ((((v).n == (v).m)? \
((v).m = ((v).m? (v).m<<1 : 2), \
(v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \
: 0), ((v).a + ((v).n++))
: 0), &(v).a[(v).n++])
#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \
#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
(v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \
: (v).n <= (size_t)(i)? (v).n = (i) \
: 0), (v).a[(i)]
: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
: 0), (v).a[(i)])
#endif

11
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.6.2-r132"
#define PACKAGE_VERSION "0.6.2-r301-beta"
#endif
static int usage()
@ -20,21 +20,20 @@ static int usage()
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries\n");
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
fprintf(stderr, " mem BWA-MEM algorithm\n");
fprintf(stderr, "\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n");
fprintf(stderr, " stdsw standard SW/NW alignment\n");
fprintf(stderr, "\n");
return 1;
}
void bwa_print_sam_PG()
{
printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION);
err_printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION);
}
int main(int argc, char *argv[])
@ -50,15 +49,13 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1);
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1);
else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1);
else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;

4
main.h
View File

@ -6,7 +6,6 @@ extern "C" {
#endif
int bwa_fa2pac(int argc, char *argv[]);
int bwa_pac2cspac(int argc, char *argv[]);
int bwa_pac2bwt(int argc, char *argv[]);
int bwa_bwtupdate(int argc, char *argv[]);
int bwa_bwt2sa(int argc, char *argv[]);
@ -17,11 +16,10 @@ extern "C" {
int bwa_sai2sam_se(int argc, char *argv[]);
int bwa_sai2sam_pe(int argc, char *argv[]);
int bwa_stdsw(int argc, char *argv[]);
int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);
int main_mem(int argc, char *argv[]);
#ifdef __cplusplus
}

View File

@ -1,162 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <zlib.h>
#include <stdint.h>
#include "stdaln.h"
#include "utils.h"
#include "kseq.h"
KSEQ_INIT(gzFile, err_gzread)
// One sequence record as stored by load_seqs().
typedef struct {
// length of s in bytes, excluding the terminating NUL
int l;
// heap copy of the sequence, NUL-terminated
unsigned char *s;
// heap copy of the sequence name
char *n;
} seq1_t;
// Growable array of sequence records.
typedef struct {
// n_seqs: number of records in use; m_seqs: allocated capacity of seqs
int n_seqs, m_seqs;
seq1_t *seqs;
} seqs_t;
// Complement lookup indexed by character code. A/C/G/T and the IUPAC ambiguity
// letters B, D, H, K, M, R, S, V, W, X, Y map to their complements with the
// letter case preserved; every other byte (including 'N', 'n' and 'U') maps to 'N'.
unsigned char aln_rev_table[256] = {
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N',
'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
};
static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0;
static AlnParam g_aln_param;
/* Reverse-complement seq[0..len) in place, translating every byte through aln_rev_table. */
static void revseq(int len, uint8_t *seq)
{
	int lo = 0, hi = len - 1;
	/* swap the two ends, complementing both, and walk towards the middle */
	while (lo < hi) {
		uint8_t left = aln_rev_table[seq[hi]];
		seq[hi] = aln_rev_table[seq[lo]];
		seq[lo] = left;
		++lo; --hi;
	}
	/* an odd length leaves one untouched byte in the centre: complement it where it is */
	if (len&1) seq[lo] = aln_rev_table[seq[lo]];
}
static seqs_t *load_seqs(const char *fn)
{
seqs_t *s;
seq1_t *p;
gzFile fp;
int l;
kseq_t *seq;
fp = xzopen(fn, "r");
seq = kseq_init(fp);
s = (seqs_t*)xcalloc(1, sizeof(seqs_t));
s->m_seqs = 256;
s->seqs = (seq1_t*)xcalloc(s->m_seqs, sizeof(seq1_t));
while ((l = kseq_read(seq)) >= 0) {
if (s->n_seqs == s->m_seqs) {
s->m_seqs <<= 1;
s->seqs = (seq1_t*)xrealloc(s->seqs, s->m_seqs * sizeof(seq1_t));
}
p = s->seqs + (s->n_seqs++);
p->l = seq->seq.l;
p->s = (unsigned char*)xmalloc(p->l + 1);
memcpy(p->s, seq->seq.s, p->l);
p->s[p->l] = 0;
p->n = xstrdup((const char*)seq->name.s);
}
kseq_destroy(seq);
err_gzclose(fp);
fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
return s;
}
/*
 * Align one short sequence `s` (length `l`, named `name`) against every
 * sequence in `ss` and print each alignment that reaches g_thres (or every
 * alignment in global mode). `strand` is only echoed in the output line.
 */
static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand)
{
	int i;
	for (i = 0; i < ss->n_seqs; ++i) {
		AlnAln *aa;
		int k; // CIGAR index; must not share `i`, which still drives the loop over ss->seqs
		seq1_t *p = ss->seqs + i;
		g_aln_param.band_width = l + p->l;
		aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l);
		if (aa->score >= g_thres || g_is_global) {
			printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand,
				   aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo);
			// NB: I put the short sequence as the first sequence in SW, an insertion to
			// the reference becomes a deletion from the short sequence. Therefore, I use
			// "MDI" here rather than "MID", and print ->out2 first rather than ->out1.
			for (k = 0; k != aa->n_cigar; ++k)
				printf("%d%c", aa->cigar32[k]>>4, "MDI"[aa->cigar32[k]&0xf]);
			printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1);
		}
		aln_free_AlnAln(aa);
	}
}
/*
 * Stream the query file `fn` and align each record against the preloaded set
 * `ss`: on the forward strand when bit 0 of g_strand is set, and on the
 * reverse complement when bit 1 is set. The record buffer is reverse-
 * complemented in place for the second alignment.
 */
static void aln_seqs(const seqs_t *ss, const char *fn)
{
	gzFile in = xzopen(fn, "r");
	kseq_t *ks = kseq_init(in);
	int len;

	for (len = kseq_read(ks); len >= 0; len = kseq_read(ks)) {
		if (g_strand&1) aln_1seq(ss, (char*)ks->name.s, len, ks->seq.s, '+');
		if (!(g_strand&2)) continue;
		revseq(len, (uint8_t*)ks->seq.s);
		aln_1seq(ss, (char*)ks->name.s, len, ks->seq.s, '-');
	}
	kseq_destroy(ks);
	err_gzclose(in);
}
int bwa_stdsw(int argc, char *argv[])
{
int c;
seqs_t *ss;
while ((c = getopt(argc, argv, "gT:frp")) >= 0) {
switch (c) {
case 'g': g_is_global = 1; break;
case 'T': g_thres = atoi(optarg); break;
case 'f': g_strand |= 1; break;
case 'r': g_strand |= 2; break;
case 'p': g_aa = 1; break;
}
}
if (g_strand == 0) g_strand = 3;
if (g_aa) g_strand = 1;
if (optind + 1 >= argc) {
fprintf(stderr, "\nUsage: bwa stdsw [options] <seq1.long.fa> <seq2.short.fa>\n\n");
fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres);
fprintf(stderr, " -p protein alignment (suppressing -r)\n");
fprintf(stderr, " -f forward strand only\n");
fprintf(stderr, " -r reverse strand only\n");
fprintf(stderr, " -g global alignment\n\n");
fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n");
fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n");
fprintf(stderr, " sequence.\n\n");
return 1;
}
g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast;
g_aln_param.gap_end = 0;
ss = load_seqs(argv[optind]);
aln_seqs(ss, argv[optind+1]);
return 0;
}

View File

@ -1,111 +0,0 @@
#!/usr/bin/perl -w
# Author: lh3
# Note: Ideally, this script should be written in C. It is a bit slow at present.
# Also note that this script is different from the one contained in MAQ.
use strict;
use warnings;
use Getopt::Std;
my %opts;
my $version = '0.1.4';
my $usage = qq{
Usage: solid2fastq.pl <in.title> <out.prefix>
Note: <in.title> is the string showed in the `# Title:' line of a
".csfasta" read file. Then <in.title>F3.csfasta is read sequence
file and <in.title>F3_QV.qual is the quality file. If
<in.title>R3.csfasta is present, this script assumes reads are
paired; otherwise reads will be regarded as single-end.
The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3
tag and `2' for F3. Usually you may want to use short <out.prefix>
to save diskspace. Long <out.prefix> also causes troubles to maq.
};
getopts('', \%opts);
die($usage) if (@ARGV != 2);
my ($title, $pre) = @ARGV;
my (@fhr, @fhw);
my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual');
my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0;
if ($is_paired) { # paired end
for (0 .. 3) {
my $fn = "$title$fn_suff[$_]";
$fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
}
open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo
open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die;
open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
my (@df, @dr);
@df = &read1(1); @dr = &read1(2);
while (@df && @dr) {
if ($df[0] eq $dr[0]) { # mate pair
print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1];
@df = &read1(1); @dr = &read1(2);
} else {
if ($df[0] le $dr[0]) {
print {$fhw[2]} $df[1];
@df = &read1(1);
} else {
print {$fhw[2]} $dr[1];
@dr = &read1(2);
}
}
}
if (@df) {
print {$fhw[2]} $df[1];
while (@df = &read1(1, $fhr[0], $fhr[1])) {
print {$fhw[2]} $df[1];
}
}
if (@dr) {
print {$fhw[2]} $dr[1];
while (@dr = &read1(2, $fhr[2], $fhr[3])) {
print {$fhw[2]} $dr[1];
}
}
close($fhr[$_]) for (0 .. $#fhr);
close($fhw[$_]) for (0 .. $#fhw);
} else { # single end
for (0 .. 1) {
my $fn = "$title$fn_suff[$_]";
$fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
}
open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
my @df;
while (@df = &read1(1, $fhr[0], $fhr[1])) {
print {$fhw[2]} $df[1];
}
close($fhr[$_]) for (0 .. $#fhr);
close($fhw[2]);
}
# Read the next record from one tag's sequence/quality file pair.
#
# Argument: 1 for the F3 files ($fhr[0], $fhr[1]) or 2 for the R3 files
# ($fhr[2], $fhr[3]); any further arguments are ignored.
# Returns (key, fastq_record), where key is the zero-padded panel_x_y string
# used to match mates, or the empty list at end of input.
# Dies when the header lines of the sequence and quality files differ.
sub read1 {
  my $i = shift(@_);
  my $j = ($i-1)<<1;
  my ($key, $seq);
  my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]);
  while (<$fhs>) {
	my $t = <$fhq>; # the quality file is read in lock-step with the sequence file
	if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) {
	  $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines
	  # report both headers; the message used to print the sequence header twice
	  die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t);
	  my $name = "$pre:$1_$2_$3/$i";
	  $_ = substr(<$fhs>, 2); # drop the first two characters of the colour-space line
	  tr/0123./ACGTN/;
	  my $s = $_;
	  $_ = <$fhq>;
	  s/-1\b/0/eg; # negative qualities become 0
	  s/^(\d+)\s*//; # drop the first quality value
	  s/(\d+)\s*/chr($1+33)/eg; # remaining values become Phred+33 characters
	  $seq = qq/\@$name\n$s+\n$_\n/;
	  last;
	}
  }
  return defined($seq)? ($key, $seq) : ();
}

View File

@ -543,13 +543,12 @@ int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2,
int start, end, max_score;
int thres, *suba, *ss;
int gap_open, gap_ext, b;
int gap_open, gap_ext;
int *score_matrix, N_MATRIX_ROW;
/* initialize some align-related parameters. just for compatibility */
gap_open = ap->gap_open;
gap_ext = ap->gap_ext;
b = ap->band_width;
score_matrix = ap->matrix;
N_MATRIX_ROW = ap->row;
thres = _thres > 0? _thres : -_thres;
@ -863,7 +862,7 @@ uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
path_t *path, int *path_len, int G0, uint8_t *_mem)
{
int q, r, qr, tmp_len;
int q, r, qr;
int32_t **s_array, *score_array;
int is_overflow, of_base;
uint32_t *eh;
@ -890,7 +889,6 @@ int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2
s_array[i] = (int32_t*)_p, _p += 4 * len1;
/* initialization */
aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array);
tmp_len = len1 + 1;
start = 1; end = 2;
end_i = end_j = 0;
score = 0;

90
utils.c
View File

@ -41,6 +41,18 @@
#include <sys/time.h>
#include "utils.h"
#include "ksort.h"
#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
KSORT_INIT(128, pair64_t, pair64_lt)
KSORT_INIT(64, uint64_t, ks_lt_generic)
#include "kseq.h"
KSEQ_INIT2(, gzFile, err_gzread)
/********************
* System utilities *
********************/
FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
{
FILE *fp = 0;
@ -51,6 +63,7 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
}
return fp;
}
FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
{
if (freopen(fn, mode, fp) == 0) {
@ -58,6 +71,7 @@ FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE
}
return fp;
}
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
{
gzFile fp;
@ -109,12 +123,10 @@ void _err_fatal_simple_core(const char *func, const char *msg)
/* Checked fwrite(): when fewer than nmemb items were written, the failure is
 * reported through _err_fatal_simple("fwrite", strerror(errno)); otherwise the
 * number of items written is returned. */
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
{
size_t ret = fwrite(ptr, size, nmemb, stream);
if (ret != nmemb)
{
_err_fatal_simple("fwrite", strerror(errno));
}
return ret;
size_t ret = fwrite(ptr, size, nmemb, stream);
if (ret != nmemb)
_err_fatal_simple("fwrite", strerror(errno));
return ret;
}
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
@ -163,36 +175,26 @@ long err_ftell(FILE *stream)
/* Checked printf(): formats to stdout with vfprintf(). errno is saved
 * immediately after the call, before va_end(), so that a negative result is
 * reported through _err_fatal_simple() with vfprintf's own error. Returns the
 * value returned by vfprintf(). */
int err_printf(const char *format, ...)
{
va_list arg;
int done;
va_start(arg, format);
done = vfprintf(stdout, format, arg);
int saveErrno = errno;
va_end(arg);
if (done < 0)
{
_err_fatal_simple("vfprintf(stdout)", strerror(saveErrno));
}
return done;
va_list arg;
int done;
va_start(arg, format);
done = vfprintf(stdout, format, arg);
int saveErrno = errno;
va_end(arg);
if (done < 0) _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno));
return done;
}
/* Checked fprintf(): formats to the given stream with vfprintf(). errno is
 * saved immediately after the call, before va_end(), so that a negative result
 * is reported through _err_fatal_simple() with vfprintf's own error. Returns
 * the value returned by vfprintf(). */
int err_fprintf(FILE *stream, const char *format, ...)
{
va_list arg;
int done;
va_start(arg, format);
done = vfprintf(stream, format, arg);
int saveErrno = errno;
va_end(arg);
if (done < 0)
{
_err_fatal_simple("vfprintf", strerror(saveErrno));
}
return done;
va_list arg;
int done;
va_start(arg, format);
done = vfprintf(stream, format, arg);
int saveErrno = errno;
va_end(arg);
if (done < 0) _err_fatal_simple("vfprintf", strerror(saveErrno));
return done;
}
int err_fputc(int c, FILE *stream)
@ -220,10 +222,8 @@ int err_fputs(const char *s, FILE *stream)
int err_fflush(FILE *stream)
{
int ret = fflush(stream);
if (ret != 0)
{
_err_fatal_simple("fflush", strerror(errno));
}
if (ret != 0) _err_fatal_simple("fflush", strerror(errno));
#ifdef FSYNC_ON_FLUSH
/* Calling fflush() ensures that all the data has made it to the
kernel buffers, but this may not be sufficient for remote filesystems
@ -234,15 +234,12 @@ int err_fflush(FILE *stream)
{
struct stat sbuf;
if (0 != fstat(fileno(stream), &sbuf))
{
_err_fatal_simple("fstat", strerror(errno));
}
if (S_ISREG(sbuf.st_mode))
{
if (0 != fsync(fileno(stream)))
{
_err_fatal_simple("fsync", strerror(errno));
}
}
}
#endif
@ -251,12 +248,9 @@ int err_fflush(FILE *stream)
/* Checked fclose(): a non-zero result is reported through
 * _err_fatal_simple("fclose", strerror(errno)); the result of fclose() is
 * returned. */
int err_fclose(FILE *stream)
{
int ret = fclose(stream);
if (ret != 0)
{
_err_fatal_simple("fclose", strerror(errno));
}
return ret;
int ret = fclose(stream);
if (ret != 0) _err_fatal_simple("fclose", strerror(errno));
return ret;
}
int err_gzclose(gzFile file)
@ -311,6 +305,10 @@ char *err_strdup(const char *s, const char *file, unsigned int line, const char
return p;
}
/*********
* Timer *
*********/
double cputime()
{
struct rusage r;

27
utils.h
View File

@ -28,6 +28,7 @@
#ifndef LH3_UTILS_H
#define LH3_UTILS_H
#include <stdint.h>
#include <stdio.h>
#include <zlib.h>
@ -38,10 +39,9 @@
#define ATTRIBUTE(list)
#endif
#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg)
#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg)
#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp)
#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode)
@ -54,6 +54,13 @@
#define xstrdup(s) err_strdup( (s), __FILE__, __LINE__, __func__)
/* A pair of 64-bit unsigned integers. */
typedef struct {
uint64_t x, y;
} pair64_t;
/* Array containers for uint64_t and pair64_t elements: `a` points to the
 * elements. NOTE(review): n and m look like the used length and the allocated
 * capacity (kvec.h-style layout) -- confirm against the code that fills them. */
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
typedef struct { size_t n, m; pair64_t *a; } pair64_v;
#ifdef __cplusplus
extern "C" {
#endif
@ -92,8 +99,24 @@ extern "C" {
double cputime();
double realtime();
void ks_introsort_64 (size_t n, uint64_t *a);
void ks_introsort_128(size_t n, pair64_t *a);
#ifdef __cplusplus
}
#endif
/* Mix a 64-bit key into a 64-bit hash value through a fixed sequence of
 * shift/add and shift/xor rounds. All arithmetic is unsigned and therefore
 * wraps modulo 2^64. */
static inline uint64_t hash_64(uint64_t key)
{
	uint64_t h = key;

	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
	return h;
}
#endif