Merge branch 'master' into master_fixes
Merged to master version b621d3a
Conflicts:
Makefile
bntseq.c
bwa.c
bwase.c
bwaseqio.c
bwtaln.c
bwtindex.c
bwtio.c
bwtmisc.c
bwtsw2_aux.c
cs2nt.c
fastmap.c
khash.h
kseq.h
ksw.c
kvec.h
simple_dp.c
utils.c
utils.h
This commit is contained in:
commit
3d33ab063e
|
|
@ -1,4 +1,5 @@
|
|||
*.[oa]
|
||||
bwa
|
||||
test
|
||||
test64
|
||||
.*.swp
|
||||
|
|
|
|||
64
Makefile
64
Makefile
|
|
@ -1,14 +1,11 @@
|
|||
CC= gcc
|
||||
CXX= g++
|
||||
CFLAGS= -g -Wall -O2
|
||||
CFLAGS= -g -Wall -O2 -msse2
|
||||
CXXFLAGS= $(CFLAGS)
|
||||
AR= ar
|
||||
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
|
||||
LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \
|
||||
bwaseqio.o bwase.o kstring.o
|
||||
AOBJS= QSufSort.o bwt_gen.o \
|
||||
is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \
|
||||
bwape.o cs2nt.o \
|
||||
LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
|
||||
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
is.o bwtindex.o bwape.o \
|
||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||
PROG= bwa
|
||||
|
|
@ -26,7 +23,7 @@ SUBDIRS= .
|
|||
all:$(PROG)
|
||||
|
||||
bwa:libbwa.a $(AOBJS) main.o
|
||||
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
|
||||
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
|
||||
|
||||
libbwa.a:$(LOBJS)
|
||||
$(AR) -csru $@ $(LOBJS)
|
||||
|
|
@ -34,35 +31,40 @@ libbwa.a:$(LOBJS)
|
|||
clean:
|
||||
rm -f gmon.out *.o a.out $(PROG) *~ *.a
|
||||
|
||||
depend:
|
||||
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) -- *.c )
|
||||
|
||||
# DO NOT DELETE THIS LINE -- make depend depends on it.
|
||||
|
||||
QSufSort.o: QSufSort.h
|
||||
bamlite.o: bamlite.h utils.h
|
||||
bntseq.o: bntseq.h kseq.h main.h utils.h
|
||||
bwa.o: bntseq.h bwa.h bwt.h bwtaln.h bwtgap.h stdaln.h utils.h
|
||||
bwape.o: bntseq.h bwase.h bwt.h bwtaln.h khash.h ksort.h kvec.h stdaln.h
|
||||
bwape.o: utils.h
|
||||
bwase.o: bntseq.h bwase.h bwt.h bwtaln.h kstring.h stdaln.h utils.h
|
||||
bwaseqio.o: bamlite.h bwt.h bwtaln.h kseq.h stdaln.h utils.h
|
||||
bwt.o: bwt.h kvec.h utils.h
|
||||
bamlite.o: utils.h bamlite.h
|
||||
bntseq.o: bntseq.h main.h utils.h kseq.h
|
||||
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kseq.h
|
||||
bwamem.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
|
||||
bwamem.o: ksort.h kbtree.h
|
||||
bwamem_pair.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h kvec.h ksw.h
|
||||
bwape.o: bwtaln.h bwt.h stdaln.h kvec.h bntseq.h utils.h bwase.h bwa.h
|
||||
bwape.o: khash.h
|
||||
bwase.o: stdaln.h bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h
|
||||
bwaseqio.o: bwtaln.h bwt.h stdaln.h utils.h bamlite.h kseq.h
|
||||
bwt.o: utils.h bwt.h kvec.h
|
||||
bwt_gen.o: QSufSort.h utils.h
|
||||
bwt_lite.o: bwt_lite.h utils.h
|
||||
bwtaln.o: bwt.h bwtaln.h bwtgap.h stdaln.h utils.h
|
||||
bwtgap.o: bwt.h bwtaln.h bwtgap.h stdaln.h utils.h
|
||||
bwtaln.o: bwtaln.h bwt.h stdaln.h bwtgap.h utils.h bwa.h bntseq.h
|
||||
bwtgap.o: bwtgap.h bwt.h bwtaln.h stdaln.h utils.h
|
||||
bwtindex.o: bntseq.h bwt.h main.h utils.h
|
||||
bwtio.o: bwt.h utils.h
|
||||
bwtmisc.o: bntseq.h bwt.h main.h utils.h
|
||||
bwtsw2_aux.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h kseq.h ksort.h kstring.h
|
||||
bwtsw2_aux.o: stdaln.h utils.h
|
||||
bwtsw2_chain.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h ksort.h utils.h
|
||||
bwtsw2_core.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h khash.h ksort.h kvec.h
|
||||
bwtsw2_core.o: utils.h
|
||||
bwtsw2_main.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h utils.h
|
||||
bwtsw2_pair.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h kstring.h ksw.h utils.h
|
||||
cs2nt.o: bwt.h bwtaln.h stdaln.h utils.h
|
||||
fastmap.o: bntseq.h bwt.h kseq.h kvec.h utils.h
|
||||
bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h stdaln.h kstring.h
|
||||
bwtsw2_aux.o: bwa.h kseq.h ksort.h
|
||||
bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h utils.h ksort.h
|
||||
bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h utils.h khash.h
|
||||
bwtsw2_core.o: ksort.h
|
||||
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
|
||||
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h ksw.h
|
||||
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h utils.h kseq.h
|
||||
is.o: utils.h
|
||||
kopen.o: utils.h
|
||||
kstring.o: kstring.h utils.h
|
||||
ksw.o: ksw.h utils.h
|
||||
main.o: main.h utils.h
|
||||
simple_dp.o: kseq.h stdaln.h utils.h
|
||||
stdaln.o: stdaln.h utils.h
|
||||
utils.o: utils.h
|
||||
utils.o: utils.h ksort.h kseq.h
|
||||
|
|
|
|||
|
|
@ -59,12 +59,9 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin
|
|||
qsint_t i, j;
|
||||
qsint_t s, negatedSortedGroupLength;
|
||||
qsint_t numSymbolAggregated;
|
||||
qsint_t maxNumInputSymbol;
|
||||
qsint_t numSortedPos = 1;
|
||||
qsint_t newAlphabetSize;
|
||||
|
||||
maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
|
||||
|
||||
if (!skipTransform) {
|
||||
/* bucketing possible*/
|
||||
newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol,
|
||||
|
|
|
|||
54
bntseq.c
54
bntseq.c
|
|
@ -36,7 +36,7 @@
|
|||
#include "utils.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT(gzFile, err_gzread)
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
unsigned char nst_nt4_table[256] = {
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
|
|
@ -310,21 +310,26 @@ int bwa_fa2pac(int argc, char *argv[])
|
|||
return 0;
|
||||
}
|
||||
|
||||
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
|
||||
{
|
||||
int left, mid, right;
|
||||
if (pos_f >= bns->l_pac) return -1;
|
||||
left = 0; mid = 0; right = bns->n_seqs;
|
||||
while (left < right) { // binary search
|
||||
mid = (left + right) >> 1;
|
||||
if (pos_f >= bns->anns[mid].offset) {
|
||||
if (mid == bns->n_seqs - 1) break;
|
||||
if (pos_f < bns->anns[mid+1].offset) break; // bracketed
|
||||
left = mid + 1;
|
||||
} else right = mid;
|
||||
}
|
||||
return mid;
|
||||
}
|
||||
|
||||
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
|
||||
{
|
||||
int left, mid, right, nn;
|
||||
if (ref_id) {
|
||||
left = 0; mid = 0; right = bns->n_seqs;
|
||||
while (left < right) {
|
||||
mid = (left + right) >> 1;
|
||||
if (pos_f >= bns->anns[mid].offset) {
|
||||
if (mid == bns->n_seqs - 1) break;
|
||||
if (pos_f < bns->anns[mid+1].offset) break; // bracketed
|
||||
left = mid + 1;
|
||||
} else right = mid;
|
||||
}
|
||||
*ref_id = mid;
|
||||
}
|
||||
if (ref_id) *ref_id = bns_pos2rid(bns, pos_f);
|
||||
left = 0; right = bns->n_holes; nn = 0;
|
||||
while (left < right) {
|
||||
mid = (left + right) >> 1;
|
||||
|
|
@ -343,3 +348,26 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
|
|||
}
|
||||
return nn;
|
||||
}
|
||||
|
||||
// Fetch the bases in [beg,end) from the 2-bit packed sequence `pac`.
// Coordinates in [0,l_pac) read the forward strand; coordinates in
// [l_pac,2*l_pac) yield the complemented bases read from the mirrored
// forward positions. beg/end are swapped if given in the wrong order and
// clamped to [0,2*l_pac].
// Returns a buffer obtained from xmalloc() (caller frees) and stores its
// length in *len. Returns 0 with *len == 0 when the interval bridges the
// forward-reverse boundary, or when clamping leaves beg > end (interval
// entirely outside the coordinate space) -- the latter used to produce a
// negative *len and a negative allocation size.
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
{
	uint8_t *seq = 0;
	int64_t k, l = 0;

	*len = 0;
	if (end < beg) { // if end is smaller, swap
		int64_t tmp = beg;
		beg = end;
		end = tmp;
	}
	if (end > l_pac<<1) end = l_pac<<1;
	if (beg < 0) beg = 0;
	if (beg > end) return 0; // nothing left after clamping
	if (beg < l_pac && end > l_pac) return 0; // bridging the forward-reverse boundary
	*len = end - beg;
	seq = xmalloc(end - beg);
	if (beg >= l_pac) { // reverse strand
		int64_t beg_f = (l_pac<<1) - 1 - end;
		int64_t end_f = (l_pac<<1) - 1 - beg;
		for (k = end_f; k > beg_f; --k)
			seq[l++] = 3 - _get_pac(pac, k);
	} else { // forward strand
		for (k = beg; k < end; ++k)
			seq[l++] = _get_pac(pac, k);
	}
	return seq;
}
|
||||
|
|
|
|||
3
bntseq.h
3
bntseq.h
|
|
@ -29,6 +29,7 @@
|
|||
#define BWT_BNTSEQ_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
|
||||
#ifndef BWA_UBYTE
|
||||
|
|
@ -71,7 +72,9 @@ extern "C" {
|
|||
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
|
||||
void bns_destroy(bntseq_t *bns);
|
||||
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
|
||||
int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
|
||||
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
|
||||
uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
537
bwa.c
537
bwa.c
|
|
@ -1,274 +1,313 @@
|
|||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include "utils.h"
|
||||
#include "bwa.h"
|
||||
#include "bwt.h"
|
||||
#include "bwtgap.h"
|
||||
#include <zlib.h>
|
||||
#include <assert.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifndef kroundup32
|
||||
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
#endif
|
||||
int bwa_verbose = 3;
|
||||
char bwa_rg_id[256];
|
||||
|
||||
extern unsigned char nst_nt4_table[256];
|
||||
extern void seq_reverse(int len, uint8_t *seq, int is_comp);
|
||||
/************************
|
||||
* Batch FASTA/Q reader *
|
||||
************************/
|
||||
|
||||
bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 };
|
||||
#include "kseq.h"
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
struct bwa_idx_t {
|
||||
// Strip a trailing "/<digit>" (e.g. "/1", "/2") from the string held in s,
// shortening s->l by two and re-terminating s->s. Strings of length <= 2 are
// left untouched.
static inline void trim_readno(kstring_t *s)
{
	// <ctype.h> functions require a value representable as unsigned char;
	// passing a negative plain char is undefined behaviour
	if (s->l > 2 && s->s[s->l-2] == '/' && isdigit((unsigned char)s->s[s->l-1])) {
		s->l -= 2;
		s->s[s->l] = 0;
	}
}
|
||||
|
||||
// Deep-copy one parsed record into *s: name and sequence are always
// duplicated; comment and quality are duplicated only when non-empty and are
// set to 0 otherwise. l_seq is the length of the copied sequence string.
// TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
{
	s->name = xstrdup(ks->name.s);
	if (ks->comment.l)
		s->comment = xstrdup(ks->comment.s);
	else
		s->comment = 0;
	s->seq = xstrdup(ks->seq.s);
	if (ks->qual.l)
		s->qual = xstrdup(ks->qual.s);
	else
		s->qual = 0;
	s->l_seq = strlen(s->seq);
}
|
||||
|
||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
||||
{
|
||||
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
|
||||
int size = 0, m, n;
|
||||
bseq1_t *seqs;
|
||||
m = n = 0; seqs = 0;
|
||||
while (kseq_read(ks) >= 0) {
|
||||
if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
|
||||
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
|
||||
break;
|
||||
}
|
||||
if (n >= m) {
|
||||
m = m? m<<1 : 256;
|
||||
seqs = xrealloc(seqs, m * sizeof(bseq1_t));
|
||||
}
|
||||
trim_readno(&ks->name);
|
||||
kseq2bseq1(ks, &seqs[n]);
|
||||
size += seqs[n++].l_seq;
|
||||
if (ks2) {
|
||||
trim_readno(&ks2->name);
|
||||
kseq2bseq1(ks2, &seqs[n]);
|
||||
size += seqs[n++].l_seq;
|
||||
}
|
||||
if (size >= chunk_size) break;
|
||||
}
|
||||
if (size == 0) { // test if the 2nd file is finished
|
||||
if (ks2 && kseq_read(ks2) >= 0)
|
||||
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
|
||||
}
|
||||
*n_ = n;
|
||||
return seqs;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* CIGAR related *
|
||||
*****************/
|
||||
|
||||
// Generate CIGAR when the alignment end points are known.
// Aligns query[0,l_query) globally against reference interval [rb,re) using
// ksw_global() with scoring matrix `mat`, gap open `q`, gap extension `r` and
// a band-width capped at `w_` (plus the query/reference length difference).
// On success returns the CIGAR array (caller frees), stores the operation
// count in *n_cigar, the alignment score in *score and the number of
// mismatches plus gap bases in *NM. Returns 0 with *n_cigar == 0 and
// *NM == -1 when the interval is empty, bridges the forward/reverse boundary
// or could not be fetched completely; *score is left untouched in that case.
// `query` is reversed in place during the computation for reverse-strand hits
// and restored before returning.
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
{
	uint32_t *cigar = 0;
	uint8_t tmp, *rseq;
	int i, w;
	int64_t rlen, len_diff;
	*n_cigar = 0; *NM = -1;
	if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
	rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
	if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
	if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
		for (i = 0; i < l_query>>1; ++i)
			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
		for (i = 0; i < rlen>>1; ++i)
			tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
	}
	// set the band-width: the longest gap the score allows, at least 1 and at
	// most w_, widened by the length difference the alignment must absorb
	w = (int)((double)(l_query * mat[0] - q) / r + 1.);
	w = w > 1? w : 1; // lower bound (was "w < 1? w : 1", which capped w at 1)
	w = w < w_? w : w_;
	len_diff = rlen > l_query? rlen - l_query : l_query - rlen; // 64-bit safe |rlen - l_query|
	w += (int)len_diff;
	// NW alignment
	*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
	{// compute NM
		int k, x, y, n_mm = 0, n_gap = 0;
		for (k = 0, x = y = 0; k < *n_cigar; ++k) {
			int op = cigar[k]&0xf;
			int len = cigar[k]>>4;
			if (op == 0) { // match
				for (i = 0; i < len; ++i)
					if (query[x + i] != rseq[y + i]) ++n_mm;
				x += len; y += len;
			} else if (op == 1) x += len, n_gap += len; // insertion: consumes query only
			else if (op == 2) y += len, n_gap += len; // deletion: consumes reference only
		}
		*NM = n_mm + n_gap;
	}
	if (rb >= l_pac) // reverse back query
		for (i = 0; i < l_query>>1; ++i)
			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;

ret_gen_cigar:
	free(rseq);
	return cigar;
}
|
||||
|
||||
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
|
||||
{
|
||||
int ib, ie, is_rev;
|
||||
int64_t fb, fe, mid = -1;
|
||||
if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary
|
||||
*qb = *qe = *rb = *re = -1;
|
||||
return -1; // unable to fix
|
||||
} else {
|
||||
fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev);
|
||||
ib = bns_pos2rid(bns, fb);
|
||||
if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix
|
||||
fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev);
|
||||
ie = bns_pos2rid(bns, fe);
|
||||
if (ie - ib > 1) { // bridge three or more references
|
||||
*qb = *qe = *rb = *re = -1;
|
||||
return -2; // unable to fix
|
||||
} else {
|
||||
int l = bns->anns[ib].offset + bns->anns[ib].len - fb;
|
||||
mid = is_rev? *re - l : *rb + l;
|
||||
}
|
||||
}
|
||||
if (mid >= 0) {
|
||||
int i, score, n_cigar, y, NM;
|
||||
uint32_t *cigar;
|
||||
int64_t x;
|
||||
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
|
||||
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
|
||||
int op = cigar[i]&0xf, len = cigar[i]>>4;
|
||||
if (op == 0) {
|
||||
if (x <= mid && mid < x + len) {
|
||||
if (mid - *rb > *re - mid) { // the first part is longer
|
||||
if (x == mid) { // need to check the previous operation
|
||||
assert(i); // mid != *rb should always stand
|
||||
if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x;
|
||||
else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4);
|
||||
else abort(); // should not be here
|
||||
} else *qe = y + (mid - x), *re = mid;
|
||||
} else *qb = y + (mid - x), *rb = mid;
|
||||
break;
|
||||
} else x += len, y += len;
|
||||
} else if (op == 1) { // insertion
|
||||
y += len;
|
||||
} else if (op == 2) { // deletion
|
||||
if (x <= mid && mid < x + len) {
|
||||
if (mid - *rb > *re - mid) *qe = y, *re = x;
|
||||
else *qb = y, *rb = x + len;
|
||||
break;
|
||||
} else x += len;
|
||||
} else abort(); // should not be here
|
||||
}
|
||||
free(cigar);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*********************
|
||||
* Full index reader *
|
||||
*********************/
|
||||
|
||||
// Work out the index prefix for `hint` by probing for index files:
// if "<hint>.64.bwt" can be opened the prefix is "<hint>.64", otherwise if
// "<hint>.bwt" can be opened the prefix is "<hint>". Returns a newly
// allocated string (caller frees) or 0 when neither file can be opened.
char *bwa_idx_infer_prefix(const char *hint)
{
	int hint_len = strlen(hint);
	char *path = xmalloc(hint_len + 3 + 4 + 1); // room for ".64" + ".bwt" + NUL
	FILE *probe;

	strcpy(path, hint);
	strcpy(path + hint_len, ".64.bwt");
	probe = fopen(path, "rb");
	if (probe != 0) {
		fclose(probe);
		path[hint_len + 3] = 0; // keep "<hint>.64"
		return path;
	}
	strcpy(path + hint_len, ".bwt");
	probe = fopen(path, "rb");
	if (probe == 0) {
		free(path);
		return 0;
	}
	fclose(probe);
	path[hint_len] = 0; // keep "<hint>"
	return path;
}
|
||||
|
||||
bwt_t *bwa_idx_load_bwt(const char *hint)
|
||||
{
|
||||
char *tmp, *prefix;
|
||||
bwt_t *bwt;
|
||||
bntseq_t *bns;
|
||||
uint8_t *pac;
|
||||
};
|
||||
|
||||
struct bwa_buf_t {
|
||||
int max_buf;
|
||||
bwa_pestat_t pes;
|
||||
gap_stack_t *stack;
|
||||
gap_opt_t *opt;
|
||||
int *diff_tab;
|
||||
uint8_t *buf;
|
||||
int *logn;
|
||||
};
|
||||
|
||||
bwa_idx_t *bwa_idx_load(const char *prefix)
|
||||
{
|
||||
bwa_idx_t *p;
|
||||
int l;
|
||||
char *str;
|
||||
l = strlen(prefix);
|
||||
p = xcalloc(1, sizeof(bwa_idx_t));
|
||||
str = xmalloc(l + 10);
|
||||
strcpy(str, prefix);
|
||||
p->bns = bns_restore(str);
|
||||
strcpy(str + l, ".bwt");
|
||||
p->bwt = bwt_restore_bwt(str);
|
||||
str[l] = 0;
|
||||
strcpy(str + l, ".sa");
|
||||
bwt_restore_sa(str, p->bwt);
|
||||
free(str);
|
||||
p->pac = xcalloc(p->bns->l_pac/4+1, 1);
|
||||
err_fread_noeof(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac);
|
||||
err_fclose(p->bns->fp_pac);
|
||||
p->bns->fp_pac = 0;
|
||||
return p;
|
||||
prefix = bwa_idx_infer_prefix(hint);
|
||||
if (prefix == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
tmp = xcalloc(strlen(prefix) + 5, 1);
|
||||
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
|
||||
bwt = bwt_restore_bwt(tmp);
|
||||
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
|
||||
bwt_restore_sa(tmp, bwt);
|
||||
free(tmp); free(prefix);
|
||||
return bwt;
|
||||
}
|
||||
|
||||
void bwa_idx_destroy(bwa_idx_t *p)
|
||||
bwaidx_t *bwa_idx_load(const char *hint, int which)
|
||||
{
|
||||
bns_destroy(p->bns);
|
||||
bwt_destroy(p->bwt);
|
||||
free(p->pac);
|
||||
free(p);
|
||||
bwaidx_t *idx;
|
||||
char *prefix;
|
||||
prefix = bwa_idx_infer_prefix(hint);
|
||||
if (prefix == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
idx = xcalloc(1, sizeof(bwaidx_t));
|
||||
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
||||
if (which & BWA_IDX_BNS) {
|
||||
idx->bns = bns_restore(prefix);
|
||||
if (which & BWA_IDX_PAC) {
|
||||
idx->pac = xcalloc(idx->bns->l_pac/4+1, 1);
|
||||
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
||||
err_fclose(idx->bns->fp_pac);
|
||||
idx->bns->fp_pac = 0;
|
||||
}
|
||||
}
|
||||
free(prefix);
|
||||
return idx;
|
||||
}
|
||||
|
||||
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score)
|
||||
// Release an index object and whichever of its components are present.
// Passing 0 is allowed and does nothing.
void bwa_idx_destroy(bwaidx_t *idx)
{
	if (idx != 0) {
		if (idx->bwt != 0)
			bwt_destroy(idx->bwt);
		if (idx->bns != 0)
			bns_destroy(idx->bns);
		free(idx->pac); // free() accepts a null pointer
		free(idx);
	}
}
|
||||
|
||||
/***********************
|
||||
* SAM header routines *
|
||||
***********************/
|
||||
|
||||
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
|
||||
{
|
||||
extern gap_opt_t *gap_init_opt(void);
|
||||
extern int bwa_cal_maxdiff(int l, double err, double thres);
|
||||
int i;
|
||||
bwa_buf_t *p;
|
||||
p = xmalloc(sizeof(bwa_buf_t));
|
||||
p->stack = gap_init_stack2(max_score);
|
||||
p->opt = gap_init_opt();
|
||||
p->opt->s_gapo = opt->s_gapo;
|
||||
p->opt->s_gape = opt->s_gape;
|
||||
p->opt->max_diff = opt->max_diff;
|
||||
p->opt->max_gapo = opt->max_gapo;
|
||||
p->opt->max_gape = opt->max_gape;
|
||||
p->opt->seed_len = opt->seed_len;
|
||||
p->opt->max_seed_diff = opt->max_seed_diff;
|
||||
p->opt->fnr = opt->fnr;
|
||||
p->diff_tab = xcalloc(BWA_MAX_QUERY_LEN, sizeof(int));
|
||||
for (i = 1; i < BWA_MAX_QUERY_LEN; ++i)
|
||||
p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
|
||||
p->logn = xcalloc(256, sizeof(int));
|
||||
for (i = 1; i != 256; ++i)
|
||||
p->logn[i] = (int)(4.343 * log(i) + 0.499);
|
||||
return p;
|
||||
for (i = 0; i < bns->n_seqs; ++i)
|
||||
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
||||
if (rg_line) err_printf("%s\n", rg_line);
|
||||
}
|
||||
|
||||
void bwa_buf_destroy(bwa_buf_t *p)
|
||||
static char *bwa_escape(char *s)
|
||||
{
|
||||
gap_destroy_stack(p->stack);
|
||||
free(p->diff_tab); free(p->logn); free(p->opt);
|
||||
free(p);
|
||||
}
|
||||
|
||||
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq)
|
||||
{
|
||||
extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width);
|
||||
int i, seq_len, buf_len;
|
||||
bwt_width_t *w, *seed_w;
|
||||
uint8_t *s;
|
||||
gap_opt_t opt2 = *buf->opt;
|
||||
bwa_sai_t sai;
|
||||
|
||||
seq_len = strlen(seq);
|
||||
// estimate the buffer length
|
||||
buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len;
|
||||
if (buf_len > buf->max_buf) {
|
||||
buf->max_buf = buf_len;
|
||||
kroundup32(buf->max_buf);
|
||||
buf->buf = xrealloc(buf->buf, buf->max_buf);
|
||||
char *p, *q;
|
||||
for (p = q = s; *p; ++p) {
|
||||
if (*p == '\\') {
|
||||
++p;
|
||||
if (*p == 't') *q++ = '\t';
|
||||
else if (*p == 'n') *q++ = '\n';
|
||||
else if (*p == 'r') *q++ = '\r';
|
||||
else if (*p == '\\') *q++ = '\\';
|
||||
} else *q++ = *p;
|
||||
}
|
||||
memset(buf->buf, 0, buf_len);
|
||||
seed_w = (bwt_width_t*)buf->buf;
|
||||
w = seed_w + buf->opt->seed_len;
|
||||
s = (uint8_t*)(w + seq_len + 1);
|
||||
if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len];
|
||||
// copy the sequence
|
||||
for (i = 0; i < seq_len; ++i)
|
||||
s[i] = nst_nt4_table[(int)seq[i]];
|
||||
seq_reverse(seq_len, s, 0);
|
||||
// mapping
|
||||
bwt_cal_width(idx->bwt, seq_len, s, w);
|
||||
if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff;
|
||||
if (seq_len > buf->opt->seed_len)
|
||||
bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w);
|
||||
for (i = 0; i < seq_len; ++i) // complement; I forgot why...
|
||||
s[i] = s[i] > 3? 4 : 3 - s[i];
|
||||
sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack);
|
||||
return sai;
|
||||
*q = '\0';
|
||||
return s;
|
||||
}
|
||||
|
||||
static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps)
|
||||
char *bwa_set_rg(const char *s)
|
||||
{
|
||||
uint64_t x = pos, z;
|
||||
int k, y = 0;
|
||||
*n_mm = *n_gaps = 0;
|
||||
for (k = 0; k < n_cigar; ++k) {
|
||||
int l = cigar[k]>>4;
|
||||
int op = cigar[k]&0xf;
|
||||
if (op == 0) { // match/mismatch
|
||||
for (z = 0; z < l && x + z < l_pac; ++z) {
|
||||
int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
|
||||
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm);
|
||||
}
|
||||
}
|
||||
if (op == 1 || op == 2) (*n_gaps) += l;
|
||||
if (op == 0 || op == 2) x += l;
|
||||
if (op == 0 || op == 1 || op == 4) y += l;
|
||||
char *p, *q, *r, *rg_line = 0;
|
||||
memset(bwa_rg_id, 0, 256);
|
||||
if (strstr(s, "@RG") != s) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
|
||||
goto err_set_rg;
|
||||
}
|
||||
rg_line = xstrdup(s);
|
||||
bwa_escape(rg_line);
|
||||
if ((p = strstr(rg_line, "\tID:")) == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__);
|
||||
goto err_set_rg;
|
||||
}
|
||||
p += 4;
|
||||
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
||||
if (q - p + 1 > 256) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
|
||||
goto err_set_rg;
|
||||
}
|
||||
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
||||
*r++ = *q;
|
||||
return rg_line;
|
||||
|
||||
err_set_rg:
|
||||
free(rg_line);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Convert one suffix-array hit into a base-pair alignment stored in *aln.
// *aln is zeroed first; on return it holds the SAM flag bits set here
// (16 for reverse strand, 4 when the hit runs past the end of its reference
// sequence), a freshly allocated 32-bit CIGAR (len<<4|op), the reference id
// and offset, and the ambiguous-base / mismatch / gap counts.
// buf->buf is grown as needed and used as scratch space for the encoded read
// and its reverse complement.
void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln)
{
	extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
	extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct);
	int strand, seq_len, i, n_gap, n_mm;
	uint64_t pos3, pac_pos;
	uint8_t *s[2];

	memset(aln, 0, sizeof(bwa_aln_t));
	seq_len = strlen(seq);
	if (seq_len<<1 > buf->max_buf) {
		buf->max_buf = seq_len<<1;
		kroundup32(buf->max_buf);
		buf->buf = xrealloc(buf->buf, buf->max_buf);
	}
	s[0] = buf->buf;
	s[1] = s[0] + seq_len;
	for (i = 0; i < seq_len; ++i) // index as unsigned: plain char may be negative for bytes >= 0x80
		s[0][i] = s[1][i] = nst_nt4_table[(unsigned char)seq[i]];
	seq_reverse(seq_len, s[1], 1);
	pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand);
	if (strand) aln->flag |= 16;
	if (n_gaps) { // only for gapped alignment
		int n_cigar;
		bwa_cigar_t *cigar16;
		cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1);
		aln->n_cigar = n_cigar;
		aln->cigar = xmalloc(n_cigar * sizeof(uint32_t));
		for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) {
			int op = cigar16[i]>>14;
			int len = cigar16[i]&0x3fff;
			if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR
			aln->cigar[i] = len<<4 | op;
			if (op == 0 || op == 2) pos3 += len; // only these ops advance on the reference
		}
		free(cigar16);
	} else { // ungapped
		aln->n_cigar = 1;
		aln->cigar = xmalloc(sizeof(uint32_t));
		aln->cigar[0] = seq_len<<4 | 0;
		pos3 = pac_pos + seq_len;
	}
	aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id);
	aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset;
	if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence
		aln->flag |= 4; // read unmapped
	compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap);
	aln->n_mm = n_mm;
	aln->n_gap = n_gap;
}
|
||||
|
||||
/************************
|
||||
* Single-end alignment *
|
||||
************************/
|
||||
|
||||
// Align one read in single-end mode. Returns a zero-initialised, heap
// allocated bwa_one_t holding the SA intervals, one randomly chosen hit among
// the intervals sharing the first interval's score, the hit counts c1 (those
// intervals) and c2 (all remaining intervals), and an estimated mapping
// quality. When gen_cigar is non-zero the chosen hit is also converted to a
// base-pair alignment in one->one. If no interval is found the object is
// returned with only `sai` filled in.
// NOTE(review): buf->diff_tab[seq_len] is indexed without a bound check --
// confirm callers never pass reads of BWA_MAX_QUERY_LEN or more bases.
bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar)
{
	int seq_len = strlen(seq);
	int top_score, total, i;
	bwa_one_t *one = xcalloc(1, sizeof(bwa_one_t));

	one->sai = bwa_sai(idx, buf, seq);
	if (one->sai.n == 0)
		return one;

	// count number of hits; randomly select one alignment
	top_score = one->sai.sai[0].score;
	total = 0;
	for (i = 0; i < one->sai.n; ++i) {
		bwa_sai1_t *p = &one->sai.sai[i];
		if (p->score > top_score)
			break;
		if (drand48() * (p->l - p->k + 1 + total) > (double)total) {
			one->which = p;
			one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
		}
		total += p->l - p->k + 1;
	}
	one->c1 = total;
	while (i < one->sai.n) {
		total += one->sai.sai[i].l - one->sai.sai[i].k + 1;
		++i;
	}
	one->c2 = total - one->c1;

	// estimate single-end mapping quality
	one->mapQs = -1;
	if (one->c1 == 0) {
		one->mapQs = 23; // FIXME: is it possible?
	} else if (one->c1 > 1) {
		one->mapQs = 0;
	} else {
		int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape;
		if (diff >= buf->diff_tab[seq_len])
			one->mapQs = 25;
		else if (one->c2 == 0)
			one->mapQs = 37;
	}
	if (one->mapQs < 0) { // not decided above: scale by the number of other hits
		total = (one->c2 >= 255)? 255 : one->c2;
		if (23 < buf->logn[total])
			one->mapQs = 0;
		else
			one->mapQs = 23 - buf->logn[total];
	}
	one->mapQ = one->mapQs;

	// compute CIGAR on request
	one->one.ref_id = -1;
	if (gen_cigar)
		bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one);
	return one;
}
|
||||
|
||||
// Free a bwa_one_t together with the SA-interval array and the CIGAR it owns.
void bwa_one_destroy(bwa_one_t *one)
{
	free(one->one.cigar); // CIGAR of the base-pair alignment
	free(one->sai.sai);   // SA interval hits
	free(one);
}
|
||||
|
||||
/************************
|
||||
* Paired-end alignment *
|
||||
************************/
|
||||
|
||||
// Placeholder: the body is empty, so none of the arguments are read or modified.
void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2])
{
}
|
||||
|
||||
// Placeholder: the body is empty, so no paired-end alignment is performed.
void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2])
{
}
|
||||
|
|
|
|||
108
bwa.h
108
bwa.h
|
|
@ -2,103 +2,45 @@
|
|||
#define BWA_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwt.h"
|
||||
|
||||
#define BWA_DEF_MAX_SCORE 2048
|
||||
#define BWA_MAX_QUERY_LEN 1024
|
||||
|
||||
// BWA index
|
||||
struct bwa_idx_t;
|
||||
typedef struct bwa_idx_t bwa_idx_t;
|
||||
|
||||
// Buffer for BWA alignment
|
||||
struct bwa_buf_t;
|
||||
typedef struct bwa_buf_t bwa_buf_t;
|
||||
|
||||
// BWA alignment options
|
||||
typedef struct {
|
||||
int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3
|
||||
int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions
|
||||
int seed_len, max_seed_diff; // seed length and max differences allowed in the seed
|
||||
float fnr; // parameter for automatic length-adjusted max differences
|
||||
} bwa_opt_t;
|
||||
|
||||
// default BWA alignment options
|
||||
extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 }
|
||||
|
||||
// an interval hit in the SA coordinate; basic unit in .sai files
|
||||
typedef struct {
|
||||
uint32_t n_mm:16, n_gapo:8, n_gape:8;
|
||||
int score;
|
||||
uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits
|
||||
} bwa_sai1_t;
|
||||
|
||||
// all interval hits in the SA coordinate
|
||||
typedef struct {
|
||||
int n; // number of interval hits
|
||||
bwa_sai1_t *sai;
|
||||
} bwa_sai_t;
|
||||
|
||||
// an alignment
|
||||
typedef struct {
|
||||
uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment
|
||||
int32_t ref_id; // referece sequence index (the first seq is indexed by 0)
|
||||
uint32_t offset; // coordinate on the reference; zero-based
|
||||
uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag
|
||||
uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations
|
||||
} bwa_aln_t;
|
||||
#define BWA_IDX_BWT 0x1
|
||||
#define BWA_IDX_BNS 0x2
|
||||
#define BWA_IDX_PAC 0x4
|
||||
#define BWA_IDX_ALL 0x7
|
||||
|
||||
typedef struct {
|
||||
int mapQs, mapQ, c1, c2;
|
||||
uint64_t sa;
|
||||
bwa_sai1_t *which;
|
||||
bwa_sai_t sai;
|
||||
bwa_aln_t one;
|
||||
} bwa_one_t;
|
||||
bwt_t *bwt; // FM-index
|
||||
bntseq_t *bns; // information on the reference sequences
|
||||
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
|
||||
} bwaidx_t;
|
||||
|
||||
typedef struct {
|
||||
double avg, std, ap_prior;
|
||||
uint64_t low, high, high_bayesian;
|
||||
} bwa_pestat_t;
|
||||
int l_seq;
|
||||
char *name, *comment, *seq, *qual, *sam;
|
||||
} bseq1_t;
|
||||
|
||||
extern int bwa_verbose;
|
||||
extern char bwa_rg_id[256];
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// load a BWA index
|
||||
bwa_idx_t *bwa_idx_load(const char *prefix);
|
||||
void bwa_idx_destroy(bwa_idx_t *p);
|
||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
|
||||
|
||||
// allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE
|
||||
bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score);
|
||||
void bwa_buf_destroy(bwa_buf_t *p);
|
||||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
|
||||
|
||||
/**
|
||||
* Find all the SA intervals
|
||||
*
|
||||
* @param idx BWA index; multiple threads can share the same index
|
||||
* @param buf BWA alignment buffer; each thread should have its own buffer
|
||||
* @param seq NULL terminated C string, consisting of A/C/G/T/N only
|
||||
*
|
||||
* @return SA intervals seq is matched to
|
||||
*/
|
||||
bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq);
|
||||
char *bwa_idx_infer_prefix(const char *hint);
|
||||
bwt_t *bwa_idx_load_bwt(const char *hint);
|
||||
|
||||
/**
|
||||
* Construct an alignment in the base-pair coordinate
|
||||
*
|
||||
* @param idx BWA index
|
||||
* @param buf BWA alignment buffer
|
||||
 * @param seq NULL terminated C string
|
||||
* @param sa Suffix array value
|
||||
 * @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape)
|
||||
*
|
||||
* @return An alignment
|
||||
*/
|
||||
void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln);
|
||||
bwaidx_t *bwa_idx_load(const char *hint, int which);
|
||||
void bwa_idx_destroy(bwaidx_t *idx);
|
||||
|
||||
bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar);
|
||||
|
||||
void bwa_one_destroy(bwa_one_t *one);
|
||||
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
|
||||
char *bwa_set_rg(const char *s);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,791 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#ifdef HAVE_PTHREAD
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
|
||||
#include "kstring.h"
|
||||
#include "bwamem.h"
|
||||
#include "bntseq.h"
|
||||
#include "ksw.h"
|
||||
#include "kvec.h"
|
||||
#include "ksort.h"
|
||||
#include "utils.h"
|
||||
|
||||
/* Theory on probability and scoring *ungapped* alignment
|
||||
*
|
||||
* s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution
|
||||
* s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate
|
||||
*
|
||||
* Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x
|
||||
*
|
||||
* If the matching score is x and mismatch penalty is -y, we can compute error rate e:
|
||||
* e = .75 * exp[-log(4) * y/x]
|
||||
*
|
||||
* log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)}
|
||||
* = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l)
|
||||
*
|
||||
* where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale:
|
||||
* Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x)
|
||||
*
|
||||
*
|
||||
* Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1)
|
||||
* Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4)
|
||||
*
|
||||
* When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR)
|
||||
*/
|
||||
|
||||
mem_opt_t *mem_opt_init()
|
||||
{
|
||||
mem_opt_t *o;
|
||||
o = xcalloc(1, sizeof(mem_opt_t));
|
||||
o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100;
|
||||
o->flag = 0;
|
||||
o->min_seed_len = 19;
|
||||
o->split_width = 10;
|
||||
o->max_occ = 10000;
|
||||
o->max_chain_gap = 10000;
|
||||
o->max_ins = 10000;
|
||||
o->mask_level = 0.50;
|
||||
o->chain_drop_ratio = 0.50;
|
||||
o->split_factor = 1.5;
|
||||
o->chunk_size = 10000000;
|
||||
o->n_threads = 1;
|
||||
o->pen_unpaired = 9;
|
||||
o->max_matesw = 100;
|
||||
mem_fill_scmat(o->a, o->b, o->mat);
|
||||
return o;
|
||||
}
|
||||
|
||||
/**
 * Fill a 5x5 row-major scoring matrix over the alphabet {A,C,G,T,N}.
 *
 * @param a    score written on the diagonal of the 4x4 nucleotide part
 * @param b    penalty; -b is written off the diagonal of the 4x4 part
 * @param mat  output, 25 entries; the fifth row and fifth column (the
 *             ambiguous base) are all zero
 */
void mem_fill_scmat(int a, int b, int8_t mat[25])
{
	int row, col;
	for (row = 0; row < 5; ++row) {
		for (col = 0; col < 5; ++col) {
			int8_t v = 0; // anything involving the ambiguous base scores zero
			if (row < 4 && col < 4)
				v = row == col? a : -b;
			mat[row * 5 + col] = v;
		}
	}
}
|
||||
|
||||
/***************************
|
||||
* SMEM iterator interface *
|
||||
***************************/
|
||||
|
||||
struct __smem_i {
|
||||
const bwt_t *bwt;
|
||||
const uint8_t *query;
|
||||
int start, len;
|
||||
bwtintv_v *matches; // matches; to be returned by smem_next()
|
||||
bwtintv_v *sub; // sub-matches inside the longest match; temporary
|
||||
bwtintv_v *tmpvec[2]; // temporary arrays
|
||||
};
|
||||
|
||||
smem_i *smem_itr_init(const bwt_t *bwt)
|
||||
{
|
||||
smem_i *itr;
|
||||
itr = xcalloc(1, sizeof(smem_i));
|
||||
itr->bwt = bwt;
|
||||
itr->tmpvec[0] = xcalloc(1, sizeof(bwtintv_v));
|
||||
itr->tmpvec[1] = xcalloc(1, sizeof(bwtintv_v));
|
||||
itr->matches = xcalloc(1, sizeof(bwtintv_v));
|
||||
itr->sub = xcalloc(1, sizeof(bwtintv_v));
|
||||
return itr;
|
||||
}
|
||||
|
||||
/**
 * Free an iterator created by smem_itr_init(), including the four interval
 * vectors it owns and their element arrays.
 *
 * @param itr  iterator to destroy; must not be NULL
 */
void smem_itr_destroy(smem_i *itr)
{
	bwtintv_v *owned[4];
	int k;
	owned[0] = itr->tmpvec[0];
	owned[1] = itr->tmpvec[1];
	owned[2] = itr->matches;
	owned[3] = itr->sub;
	for (k = 0; k < 4; ++k) {
		free(owned[k]->a); // element array first, then the vector header
		free(owned[k]);
	}
	free(itr);
}
|
||||
|
||||
/**
 * Attach a query to the iterator and rewind it to the first position.
 *
 * @param itr    iterator from smem_itr_init()
 * @param len    query length
 * @param query  query sequence; the pointer is stored, not copied
 */
void smem_set_query(smem_i *itr, int len, const uint8_t *query)
{
	itr->len = len;
	itr->query = query;
	itr->start = 0; // restart scanning from the beginning of the new query
}
|
||||
|
||||
/**
 * Advance the iterator and return the next batch of matches.
 *
 * As used in this file, each interval's `info` holds the query start in its
 * high 32 bits and the query end in its low 32 bits, and x[2] is the number
 * of occurrences. If the longest match found is at least split_len long and
 * occurs at most split_width times, a second search is started from its
 * middle and the qualifying sub-matches are merged into the result.
 *
 * @param itr          iterator with a query attached
 * @param split_len    minimum length of the longest match that triggers the
 *                     second search; values <= 0 disable it
 * @param split_width  maximum occurrence count that triggers the second search
 * @return             itr->matches (possibly empty), or 0 when the query is
 *                     exhausted
 */
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
{
	bwtintv_v *found = itr->matches, *inner = itr->sub, *merged;
	const bwtintv_t *longest;
	int i, j, max, max_i, ori_start;

	// clear every buffer left over from the previous call
	itr->tmpvec[0]->n = 0;
	itr->tmpvec[1]->n = 0;
	found->n = 0;
	inner->n = 0;

	if (itr->start >= itr->len || itr->start < 0) return 0;
	while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
	if (itr->start == itr->len) return 0;

	ori_start = itr->start;
	itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, found, itr->tmpvec); // search for SMEM
	if (found->n == 0) return found; // well, in theory, we should never come here

	// look for the longest match
	max = 0; max_i = 0;
	for (i = 0; i < found->n; ++i) {
		const bwtintv_t *p = &found->a[i];
		int len = (uint32_t)p->info - (p->info>>32);
		if (max < len) { max = len; max_i = i; }
	}

	// only split when the longest SMEM is long and (nearly) unique
	if (split_len <= 0 || max < split_len || found->a[max_i].x[2] > split_width)
		return found;

	merged = itr->tmpvec[0]; // reuse tmpvec[0] for merging
	longest = &found->a[max_i];
	// restart from the middle of the longest MEM, asking for more occurrences than it has
	bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)longest->info + (longest->info>>32))>>1, longest->x[2]+1, inner, itr->tmpvec);
	merged->n = 0;
	i = j = 0;
	while (i < found->n && j < inner->n) { // ordered merge
		const bwtintv_t *pi = &found->a[i], *pj = &inner->a[j];
		int64_t xi = pi->info>>32<<32 | (itr->len - (uint32_t)pi->info);
		int64_t xj = pj->info>>32<<32 | (itr->len - (uint32_t)pj->info);
		if (xi < xj) {
			kv_push(bwtintv_t, *merged, *pi);
			++i;
		} else {
			// keep a sub-match only if it is at least half as long as the longest
			// match and ends after the position this call started from
			if ((uint32_t)pj->info - (pj->info>>32) >= max>>1 && (uint32_t)pj->info > ori_start) {
				kv_push(bwtintv_t, *merged, *pj);
			}
			++j;
		}
	}
	while (i < found->n) {
		kv_push(bwtintv_t, *merged, found->a[i]);
		++i;
	}
	while (j < inner->n) {
		const bwtintv_t *pj = &inner->a[j];
		if ((uint32_t)pj->info - (pj->info>>32) >= max>>1 && (uint32_t)pj->info > ori_start) {
			kv_push(bwtintv_t, *merged, *pj);
		}
		++j;
	}
	kv_copy(bwtintv_t, *found, *merged);
	return found;
}
|
||||
|
||||
/********************************
|
||||
* Chaining while finding SMEMs *
|
||||
********************************/
|
||||
|
||||
/* A seed: an exact match of `len` bases between query and reference. */
typedef struct {
	int64_t rbeg; // start on the reference
	int32_t qbeg; // start on the query
	int32_t len;  // match length
} mem_seed_t;

/* A chain: a growable array of seeds, keyed by `pos` in the B-tree. */
typedef struct {
	int n;             // number of seeds in use
	int m;             // number of seeds allocated
	int64_t pos;       // B-tree key
	mem_seed_t *seeds; // seed array
} mem_chain_t;

/* A vector of chains. */
typedef struct {
	size_t n, m;
	mem_chain_t *a;
} mem_chain_v;

#include "kbtree.h"

// three-way comparison of two chains by pos: negative, zero or positive
#define chain_cmp(a, b) (((a).pos > (b).pos) - ((a).pos < (b).pos))
KBTREE_INIT(chn, mem_chain_t, chain_cmp)
|
||||
|
||||
/**
 * Try to absorb seed `p` into chain `c`.
 *
 * @param opt  options; uses w (maximum diagonal drift) and max_chain_gap
 * @param c    candidate chain; must hold at least one seed
 * @param p    seed to place
 * @return 1 if the seed is already covered by the chain or was appended to
 *         it; 0 if the caller should start a new chain for it
 */
static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p)
{
	const mem_seed_t *first = &c->seeds[0];
	const mem_seed_t *last = &c->seeds[c->n-1];
	int64_t qend = last->qbeg + last->len;
	int64_t rend = last->rbeg + last->len;
	int64_t dq, dr;

	if (p->qbeg >= first->qbeg && p->qbeg + p->len <= qend && p->rbeg >= first->rbeg && p->rbeg + p->len <= rend)
		return 1; // contained seed; do nothing

	dq = p->qbeg - last->qbeg; // always non-negative
	dr = p->rbeg - last->rbeg;
	if (dr < 0) return 0; // seed lies before the chain end on the reference
	if (dq - dr > opt->w || dr - dq > opt->w) return 0; // too far off the diagonal
	if (dq - last->len >= opt->max_chain_gap || dr - last->len >= opt->max_chain_gap) return 0; // gap too long

	// grow the chain, doubling the seed array when it is full
	if (c->n == c->m) {
		c->m <<= 1;
		c->seeds = xrealloc(c->seeds, c->m * sizeof(mem_seed_t));
	}
	c->seeds[c->n++] = *p;
	return 1;
}
|
||||
|
||||
/**
 * Drain the SMEM iterator and insert every usable seed into the chain tree.
 *
 * Matches shorter than opt->min_seed_len or occurring more than opt->max_occ
 * times are ignored. Every occurrence of a kept match becomes a seed, which
 * either joins the closest chain at or below it in the tree (via
 * test_and_merge()) or starts a new chain.
 *
 * @param opt   alignment options
 * @param tree  B-tree of chains, updated in place
 * @param itr   iterator with the query already attached
 */
static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
{
	const bwtintv_v *a;
	int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
	if (split_len > itr->len) split_len = itr->len;

	while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM
		int i;
		for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
			bwtintv_t *p = &a->a[i];
			int slen = (uint32_t)p->info - (p->info>>32); // seed length
			int64_t k;
			if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
			for (k = 0; k < p->x[2]; ++k) {
				mem_chain_t tmp, *lower, *upper;
				mem_seed_t s;
				int to_add = 1;

				s.len = slen;
				s.qbeg = p->info>>32;
				s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference

				if (kb_size(tree)) {
					kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
					if (lower && test_and_merge(opt, lower, &s)) to_add = 0;
				}
				if (!to_add) continue;

				// add the seed as a new chain
				tmp.n = 1;
				tmp.m = 4;
				tmp.seeds = xcalloc(tmp.m, sizeof(mem_seed_t));
				tmp.seeds[0] = s;
				kb_putp(chn, tree, &tmp);
			}
		}
	}
}
|
||||
|
||||
/**
 * Debug dump of chains, one chain per line.
 *
 * Each line starts with the seed count, followed by one tab-separated entry
 * per seed: length, query start, raw reference start, then the sequence
 * name, strand and 1-based position in parentheses.
 *
 * @param bns  reference annotations used to translate coordinates
 * @param chn  chains to print
 */
void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
{
	int ci, si;
	for (ci = 0; ci < chn->n; ++ci) {
		const mem_chain_t *c = &chn->a[ci];
		err_printf("%d", c->n);
		for (si = 0; si < c->n; ++si) {
			const mem_seed_t *sd = &c->seeds[si];
			int is_rev, ref_id;
			bwtint_t pos = bns_depos(bns, sd->rbeg, &is_rev);
			if (is_rev) pos -= sd->len - 1;
			bns_cnt_ambi(bns, pos, sd->len, &ref_id);
			err_printf("\t%d,%d,%ld(%s:%c%ld)", sd->len, sd->qbeg, (long)sd->rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
		}
		err_putchar('\n');
	}
}
|
||||
|
||||
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
|
||||
{
|
||||
mem_chain_v chain;
|
||||
smem_i *itr;
|
||||
kbtree_t(chn) *tree;
|
||||
|
||||
kv_init(chain);
|
||||
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
|
||||
tree = kb_init(chn, KB_DEFAULT_SIZE);
|
||||
itr = smem_itr_init(bwt);
|
||||
smem_set_query(itr, len, seq);
|
||||
mem_insert_seed(opt, tree, itr);
|
||||
|
||||
kv_resize(mem_chain_t, chain, kb_size(tree));
|
||||
|
||||
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
|
||||
__kb_traverse(mem_chain_t, tree, traverse_func);
|
||||
#undef traverse_func
|
||||
|
||||
smem_itr_destroy(itr);
|
||||
kb_destroy(chn, tree);
|
||||
return chain;
|
||||
}
|
||||
|
||||
/********************
|
||||
* Filtering chains *
|
||||
********************/
|
||||
|
||||
typedef struct {
|
||||
int beg, end, w;
|
||||
void *p, *p2;
|
||||
} flt_aux_t;
|
||||
|
||||
#define flt_lt(a, b) ((a).w > (b).w)
|
||||
KSORT_INIT(mem_flt, flt_aux_t, flt_lt)
|
||||
|
||||
int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
|
||||
{
|
||||
flt_aux_t *a;
|
||||
int i, j, n;
|
||||
if (n_chn <= 1) return n_chn; // no need to filter
|
||||
a = xmalloc(sizeof(flt_aux_t) * n_chn);
|
||||
for (i = 0; i < n_chn; ++i) {
|
||||
mem_chain_t *c = &chains[i];
|
||||
int64_t end;
|
||||
int w = 0, tmp;
|
||||
for (j = 0, end = 0; j < c->n; ++j) {
|
||||
const mem_seed_t *s = &c->seeds[j];
|
||||
if (s->qbeg >= end) w += s->len;
|
||||
else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end;
|
||||
end = end > s->qbeg + s->len? end : s->qbeg + s->len;
|
||||
}
|
||||
tmp = w;
|
||||
for (j = 0, end = 0; j < c->n; ++j) {
|
||||
const mem_seed_t *s = &c->seeds[j];
|
||||
if (s->rbeg >= end) w += s->len;
|
||||
else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
|
||||
end = end > s->qbeg + s->len? end : s->qbeg + s->len;
|
||||
}
|
||||
w = w < tmp? w : tmp;
|
||||
a[i].beg = c->seeds[0].qbeg;
|
||||
a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len;
|
||||
a[i].w = w; a[i].p = c; a[i].p2 = 0;
|
||||
}
|
||||
ks_introsort(mem_flt, n_chn, a);
|
||||
{ // reorder chains such that the best chain appears first
|
||||
mem_chain_t *swap;
|
||||
swap = xmalloc(sizeof(mem_chain_t) * n_chn);
|
||||
for (i = 0; i < n_chn; ++i) {
|
||||
swap[i] = *((mem_chain_t*)a[i].p);
|
||||
a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
|
||||
}
|
||||
memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
|
||||
free(swap);
|
||||
}
|
||||
for (i = 1, n = 1; i < n_chn; ++i) {
|
||||
for (j = 0; j < n; ++j) {
|
||||
int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg;
|
||||
int e_min = a[j].end < a[i].end? a[j].end : a[i].end;
|
||||
if (e_min > b_max) { // have overlap
|
||||
int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg;
|
||||
if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
|
||||
if (a[j].p2 == 0) a[j].p2 = a[i].p;
|
||||
if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
|
||||
}
|
||||
for (i = 0; i < n; ++i) { // mark chains to be kept
|
||||
mem_chain_t *c = (mem_chain_t*)a[i].p;
|
||||
if (c->n > 0) c->n = -c->n;
|
||||
c = (mem_chain_t*)a[i].p2;
|
||||
if (c && c->n > 0) c->n = -c->n;
|
||||
}
|
||||
free(a);
|
||||
for (i = 0; i < n_chn; ++i) { // free discarded chains
|
||||
mem_chain_t *c = &chains[i];
|
||||
if (c->n >= 0) {
|
||||
free(c->seeds);
|
||||
c->n = c->m = 0;
|
||||
} else c->n = -c->n;
|
||||
}
|
||||
for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains
|
||||
if (chains[i].n > 0) {
|
||||
if (n != i) chains[n++] = chains[i];
|
||||
else ++n;
|
||||
}
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/******************************
|
||||
* De-overlap single-end hits *
|
||||
******************************/
|
||||
|
||||
#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
|
||||
KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
|
||||
|
||||
int mem_sort_and_dedup(int n, mem_alnreg_t *a)
|
||||
{
|
||||
int m, i;
|
||||
if (n <= 1) return n;
|
||||
ks_introsort(mem_ars, n, a);
|
||||
for (i = 1; i < n; ++i) { // mark identical hits
|
||||
if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb)
|
||||
a[i].qe = a[i].qb;
|
||||
}
|
||||
for (i = 1, m = 1; i < n; ++i) // exclude identical hits
|
||||
if (a[i].qe > a[i].qb) {
|
||||
if (m != i) a[m++] = a[i];
|
||||
else ++m;
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
/**
 * Classify each region as primary or secondary (similar to the filtering
 * loop in mem_chain_flt()).
 *
 * IMPORTANT: must run mem_sort_and_dedup() before calling this function.
 *
 * A region that significantly overlaps an earlier primary on the query
 * becomes secondary to it: its `secondary` field is set to that primary's
 * index. Primaries keep `secondary == -1`. For each primary, `sub` receives
 * the score of the first region found secondary to it, and `sub_n` is
 * incremented for each secondary whose score is within
 * max(a+b, q+r) of the primary's. `sub_n` is not reset here.
 *
 * @param opt  alignment options
 * @param n    number of regions
 * @param a    regions, updated in place
 */
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a)
{
	int i, k, tmp;
	kvec_t(int) z; // indices of the primaries found so far
	if (n == 0) return;
	kv_init(z);
	for (i = 0; i < n; ++i) {
		a[i].sub = 0;
		a[i].secondary = -1;
	}
	tmp = opt->a + opt->b;
	if (tmp <= opt->q + opt->r) tmp = opt->q + opt->r;
	kv_push(int, z, 0);
	for (i = 1; i < n; ++i) {
		for (k = 0; k < z.n; ++k) {
			int j = z.a[k];
			int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
			int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
			int min_l;
			if (e_min <= b_max) continue; // no overlap
			min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
			if (!(e_min - b_max >= min_l * opt->mask_level)) continue; // overlap not significant
			if (a[j].sub == 0) a[j].sub = a[i].score;
			if (a[j].score - a[i].score <= tmp) ++a[j].sub_n;
			break;
		}
		if (k == z.n) {
			kv_push(int, z, i);
		} else {
			a[i].secondary = z.a[k];
		}
	}
	free(z.a);
}
|
||||
|
||||
/****************************************
|
||||
* Construct the alignment from a chain *
|
||||
****************************************/
|
||||
|
||||
static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
|
||||
{
|
||||
int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
|
||||
l = l > 1? l : 1;
|
||||
return l < opt->w<<1? l : opt->w<<1;
|
||||
}
|
||||
|
||||
/**
 * Extend the seeds of one chain into alignment regions.
 *
 * The reference window that any seed could reach is fetched once. Seeds are
 * then visited from longest to shortest; a seed that already lies inside a
 * previously produced region and close to its diagonal is skipped, otherwise
 * it is extended to the left and to the right with ksw_extend() and the
 * resulting region is appended to `av`.
 *
 * FIXME: in general, we SHOULD check funny seed patterns such as contained
 * seeds. When that happens, we should use a SW or extend more seeds.
 *
 * @param opt      alignment options
 * @param l_pac    length of the forward reference; coordinates run over
 *                 [0, 2*l_pac) to cover both strands
 * @param pac      packed reference
 * @param l_query  query length
 * @param query    query sequence
 * @param c        chain to extend
 * @param av       output vector; new regions are appended
 */
void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
{
	int i, k;
	int64_t rlen, rmax[2], tmp;
	const mem_seed_t *s;
	uint8_t *rseq = 0;
	uint64_t *srt;

	if (c->n == 0) return;
	// get the max possible span
	rmax[0] = l_pac<<1; rmax[1] = 0;
	for (i = 0; i < c->n; ++i) {
		int64_t b, e;
		const mem_seed_t *t = &c->seeds[i];
		b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg));
		e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len));
		rmax[0] = rmax[0] < b? rmax[0] : b;
		rmax[1] = rmax[1] > e? rmax[1] : e;
	}
	rmax[0] = rmax[0] > 0? rmax[0] : 0;
	rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
	if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
		if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac;
		else rmax[0] = l_pac;
	}
	// retrieve the reference sequence
	rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
	if (rlen != rmax[1] - rmax[0]) {
		free(rseq); // do not leak whatever was returned; free(NULL) is a no-op
		return;
	}

	// sort seed indices by seed length (length in the high word, index in the low word)
	srt = xmalloc(c->n * 8);
	for (i = 0; i < c->n; ++i)
		srt[i] = (uint64_t)c->seeds[i].len<<32 | i;
	ks_introsort_64(c->n, srt);

	for (k = c->n - 1; k >= 0; --k) { // longest seed first
		mem_alnreg_t *a;
		s = &c->seeds[(uint32_t)srt[k]];

		for (i = 0; i < av->n; ++i) { // test whether extension has been made before
			mem_alnreg_t *p = &av->a[i];
			int64_t rd;
			int qd, w, max_gap;
			if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained
			// qd: distance ahead of the seed on query; rd: on reference
			qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
			max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
			w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width
			if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
			// similar to the previous four lines, but this time we look at the region behind
			qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
			max_gap = cal_max_gap(opt, qd < rd? qd : rd);
			w = max_gap < opt->w? max_gap : opt->w;
			if (qd - rd < w && rd - qd < w) break;
		}
		if (i < av->n) continue; // covered by an earlier region; skip this seed

		a = kv_pushp(mem_alnreg_t, *av);
		memset(a, 0, sizeof(mem_alnreg_t));

		if (s->qbeg) { // left extension
			uint8_t *rs, *qs;
			int qle, tle;
			// both sequences are reversed so that the extension runs away from the seed
			qs = xmalloc(s->qbeg);
			for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
			tmp = s->rbeg - rmax[0];
			rs = xmalloc(tmp);
			for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
			a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle);
			a->qb = s->qbeg - qle; a->rb = s->rbeg - tle;
			free(qs); free(rs);
		} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;

		if (s->qbeg + s->len != l_query) { // right extension
			int qle, tle, qe, re;
			qe = s->qbeg + s->len;
			re = s->rbeg + s->len - rmax[0];
			a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle);
			a->qe = qe + qle; a->re = rmax[0] + re + tle;
		} else a->qe = l_query, a->re = s->rbeg + s->len;
		if (bwa_verbose >= 4) err_printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);

		// compute seedcov
		for (i = 0, a->seedcov = 0; i < c->n; ++i) {
			const mem_seed_t *t = &c->seeds[i];
			if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
				a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
		}
	}
	free(srt); free(rseq);
}
|
||||
|
||||
/*****************************
|
||||
* Basic hit->SAM conversion *
|
||||
*****************************/
|
||||
|
||||
/**
 * Append one SAM record for a hit to `str`.
 *
 * @param str      output buffer; the record, ending in a newline, is appended
 * @param mat      scoring matrix passed to bwa_gen_cigar()
 * @param q        gap open penalty passed to bwa_gen_cigar()
 * @param r        gap extension penalty passed to bwa_gen_cigar()
 * @param w        band width passed to bwa_gen_cigar()
 * @param bns      reference annotations
 * @param pac      packed reference
 * @param s        the read; s->seq is indexed into "ACGTN"/"TGCAN"
 * @param p_       the hit, or NULL to write an unmapped record
 * @param is_hard  non-zero to clip with H instead of S and to print only the
 *                 aligned part of SEQ/QUAL
 * @param m        the mate's hit, or NULL when the read is not paired
 */
void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
{
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
	int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1;
	uint32_t *cigar = 0;
	int64_t pos;
	bwahit_t hit, *p = &hit; // work on a private copy; the caller's hit is never modified

	if (p_) {
		hit = *p_;
	} else { // in this case, generate an unmapped alignment
		memset(&hit, 0, sizeof(bwahit_t));
		hit.rb = hit.re = -1;
	}

	// SAM flag bits
	if (m) p->flag |= 1;                   // is paired in sequencing
	if (!is_mapped(p)) p->flag |= 4;       // is unmapped
	if (m && !is_mapped(m)) p->flag |= 8;  // mate is unmapped
	if (m && !is_mapped(p) && is_mapped(m)) { // unmapped read borrows the mate's coordinate
		p->rb = m->rb; p->re = m->re; p->qb = 0; p->qe = s->l_seq;
		copy_mate = 1;
	}
	if (p->rb >= bns->l_pac) p->flag |= 0x10;      // is reverse strand
	if (m && m->rb >= bns->l_pac) p->flag |= 0x20; // is mate on reverse strand

	// QNAME
	kputs(s->name, str); kputc('\t', str);

	// FLAG, RNAME, POS, MAPQ, CIGAR
	if (!is_mapped(p)) { // no coordinate
		kputw(p->flag, str);
		kputs("\t*\t0\t0\t*", str);
		rid = -1;
	} else { // has a coordinate, no matter whether it is mapped or copied from the mate
		int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag
		if (p->flag&0x10000) sam_flag |= 0x100;
		if (copy_mate) {
			n_cigar = 0;
			cigar = 0;
		} else {
			cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
			if (n_cigar == 0) p->flag |= 4; // FIXME: check why this may happen (this has already happened)
		}
		pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev);
		bns_cnt_ambi(bns, pos, p->re - p->rb, &rid);
		kputw(sam_flag, str); kputc('\t', str);
		kputs(bns->anns[rid].name, str); kputc('\t', str);
		kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str);
		kputw(p->qual, str); kputc('\t', str);
		if (n_cigar == 0) {
			kputc('*', str);
		} else {
			int i, clip5, clip3;
			const char clip_op = "SH"[(is_hard!=0)];
			if (is_rev) {
				clip5 = s->l_seq - p->qe;
				clip3 = p->qb;
			} else {
				clip5 = p->qb;
				clip3 = s->l_seq - p->qe;
			}
			if (clip5) { kputw(clip5, str); kputc(clip_op, str); }
			for (i = 0; i < n_cigar; ++i) {
				kputw(cigar[i]>>4, str); kputc("MIDSH"[cigar[i]&0xf], str);
			}
			if (clip3) { kputw(clip3, str); kputc(clip_op, str); }
		}
	}

	// RNEXT, PNEXT, TLEN
	if (m && is_mapped(m)) { // then print mate pos and isize
		pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev);
		bns_cnt_ambi(bns, pos, m->re - m->rb, &mid);
		kputc('\t', str);
		if (mid == rid) kputc('=', str);
		else kputs(bns->anns[mid].name, str);
		kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str);
		kputc('\t', str);
		if (mid == rid) {
			int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb;
			int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb;
			kputw(p0 - p1 + (p0 > p1? 1 : -1), str);
		} else kputw(0, str);
		kputc('\t', str);
	} else kputsn("\t*\t0\t0\t", 7, str);

	// SEQ and QUAL
	if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL
		kputsn("*\t*", 3, str);
	} else {
		const int on_rev = (p->flag&0x10) != 0;
		int i, qb = 0, qe = s->l_seq;
		if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe;
		ks_resize(str, str->l + (qe - qb) + 1);
		if (!on_rev) { // the forward strand: bases as they are
			for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
		} else { // the reverse strand: reverse complement
			for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]];
		}
		kputc('\t', str);
		if (s->qual) { // print qual, reversed on the reverse strand
			ks_resize(str, str->l + (qe - qb) + 1);
			if (!on_rev) {
				for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
			} else {
				for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i];
			}
			str->s[str->l] = 0;
		} else kputc('*', str);
	}

	// optional tags
	if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); }
	if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
	if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
	if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
	if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
	kputc('\n', str);
	free(cigar);
#undef is_mapped
}
|
||||
|
||||
/************************
|
||||
* Integrated interface *
|
||||
************************/
|
||||
|
||||
int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
|
||||
{
|
||||
int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a;
|
||||
double identity;
|
||||
sub = a->csub > sub? a->csub : sub;
|
||||
if (sub >= a->score) return 0;
|
||||
l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb;
|
||||
mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0;
|
||||
identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l;
|
||||
mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq;
|
||||
if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
|
||||
if (mapq > 60) mapq = 60;
|
||||
if (mapq < 0) mapq = 0;
|
||||
return mapq;
|
||||
}
|
||||
|
||||
/**
 * Convert an alignment region into the hit record consumed by bwa_hit2sam().
 *
 * Coordinates and score are copied. For a secondary region the sub-optimal
 * score is -1 and the 0x100 flag bit is set; otherwise the sub-optimal score
 * is the larger of a->sub and a->csub and the flag is zero. The mapping
 * quality is left at zero.
 *
 * @param a  source region
 * @param h  destination hit
 */
void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h)
{
	const int is_secondary = a->secondary >= 0;

	h->rb = a->rb;
	h->re = a->re;
	h->qb = a->qb;
	h->qe = a->qe;
	h->score = a->score;
	if (is_secondary) h->sub = -1;
	else h->sub = a->sub > a->csub? a->sub : a->csub;
	h->qual = 0; // quality unset
	h->flag = is_secondary? 0x100 : 0; // only the "secondary" bit is set
}
|
||||
|
||||
/**
 * Build the SAM text for one read and store it in s->sam.
 *
 * With no regions, a single unmapped record is written. Otherwise one record
 * is written per region, except that secondary regions are skipped unless
 * MEM_F_ALL is set, and skipped when they score below half of their primary.
 * The mapping quality of later records is capped by that of region 0.
 *
 * @param opt         alignment options
 * @param bns         reference annotations
 * @param pac         packed reference
 * @param s           the read; s->sam receives the generated text
 * @param a           alignment regions of the read
 * @param extra_flag  bits OR-ed into every hit's flag
 * @param m           mate hit forwarded to bwa_hit2sam(), or NULL
 */
void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m)
{
	kstring_t str;
	int k, mapq0 = -1;

	str.l = str.m = 0; str.s = 0;
	if (a->n == 0) {
		// no region at all: emit an unmapped record
		bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m);
		s->sam = str.s;
		return;
	}
	for (k = 0; k < a->n; ++k) {
		bwahit_t h;
		mem_alnreg_t *reg = &a->a[k];
		if (reg->secondary >= 0) {
			if (!(opt->flag&MEM_F_ALL)) continue;
			if (reg->score < a->a[reg->secondary].score * .5) continue;
		}
		mem_alnreg2hit(reg, &h);
		bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re);
		h.flag |= extra_flag;
		if ((opt->flag&MEM_F_NO_MULTI) && k && reg->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard)
		if (reg->secondary >= 0) h.qual = 0;
		else h.qual = mem_approx_mapq_se(opt, reg);
		if (k == 0) mapq0 = h.qual;
		else if (h.qual > mapq0) h.qual = mapq0;
		bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
	}
	s->sam = str.s;
}
|
||||
|
||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
|
||||
{
|
||||
int i;
|
||||
mem_chain_v chn;
|
||||
mem_alnreg_v regs;
|
||||
|
||||
for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
|
||||
seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
|
||||
|
||||
chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq);
|
||||
chn.n = mem_chain_flt(opt, chn.n, chn.a);
|
||||
if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
|
||||
|
||||
kv_init(regs);
|
||||
for (i = 0; i < chn.n; ++i) {
|
||||
mem_chain_t *p = &chn.a[i];
|
||||
mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s);
|
||||
free(chn.a[i].seeds);
|
||||
}
|
||||
free(chn.a);
|
||||
regs.n = mem_sort_and_dedup(regs.n, regs.a);
|
||||
return regs;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
int start, step, n;
|
||||
const mem_opt_t *opt;
|
||||
const bwt_t *bwt;
|
||||
const bntseq_t *bns;
|
||||
const uint8_t *pac;
|
||||
const mem_pestat_t *pes;
|
||||
bseq1_t *seqs;
|
||||
mem_alnreg_v *regs;
|
||||
} worker_t;
|
||||
|
||||
static void *worker1(void *data)
|
||||
{
|
||||
worker_t *w = (worker_t*)data;
|
||||
int i;
|
||||
if (!(w->opt->flag&MEM_F_PE)) {
|
||||
for (i = w->start; i < w->n; i += w->step)
|
||||
w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
|
||||
} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
|
||||
for (i = w->start; i < w->n>>1; i += w->step) {
|
||||
w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
|
||||
w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *worker2(void *data)
|
||||
{
|
||||
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
|
||||
worker_t *w = (worker_t*)data;
|
||||
int i;
|
||||
if (!(w->opt->flag&MEM_F_PE)) {
|
||||
for (i = w->start; i < w->n; i += w->step) {
|
||||
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a);
|
||||
mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
|
||||
free(w->regs[i].a);
|
||||
}
|
||||
} else {
|
||||
int n = 0;
|
||||
for (i = w->start; i < w->n>>1; i += w->step) { // not implemented yet
|
||||
n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]);
|
||||
free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
|
||||
}
|
||||
fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Align a batch of sequences, print the SAM records to stdout and free
 * the sequences.
 *
 * The batch is processed in two passes (worker1(), then worker2()); in
 * paired-end mode mem_pestat() runs between the passes, once all regions
 * are available. With HAVE_PTHREAD and more than one thread, each pass is
 * spread over opt->n_threads threads that are joined before the next step.
 *
 * @param opt   alignment parameters
 * @param bwt   FM-index of the reference sequence
 * @param bns   information of the reference
 * @param pac   2-bit encoded reference
 * @param n     number of query sequences
 * @param seqs  query sequences; name, comment, seq, qual and sam of every
 *              entry are freed before returning
 */
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
{
	int i;
	worker_t *w;
	mem_alnreg_v *regs;
	mem_pestat_t pes[4];

	w = xcalloc(opt->n_threads, sizeof(worker_t));
	regs = xmalloc(n * sizeof(mem_alnreg_v));
	for (i = 0; i < opt->n_threads; ++i) { // worker i handles items i, i+n_threads, ...
		w[i].start = i;
		w[i].step = opt->n_threads;
		w[i].n = n;
		w[i].opt = opt;
		w[i].bwt = bwt;
		w[i].bns = bns;
		w[i].pac = pac;
		w[i].seqs = seqs;
		w[i].regs = regs;
		w[i].pes = &pes[0];
	}
#ifdef HAVE_PTHREAD
	if (opt->n_threads != 1) {
		pthread_t *tid;
		tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t));
		for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]);
		for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
		if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
		for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]);
		for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
		free(tid);
	} else
#endif
	{ // single worker: run both passes in the calling thread
		worker1(w);
		if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
		worker2(w);
	}
	for (i = 0; i < n; ++i) { // output in input order, then release the record
		bseq1_t *rec = &seqs[i];
		err_fputs(rec->sam, stdout);
		free(rec->name);
		free(rec->comment);
		free(rec->seq);
		free(rec->qual);
		free(rec->sam);
	}
	free(regs);
	free(w);
}
|
||||
|
|
@ -0,0 +1,133 @@
|
|||
#ifndef BWAMEM_H_
|
||||
#define BWAMEM_H_
|
||||
|
||||
#include "bwt.h"
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
|
||||
#define MEM_MAPQ_COEF 30.0
|
||||
#define MEM_MAPQ_MAX 60
|
||||
|
||||
struct __smem_i;
|
||||
typedef struct __smem_i smem_i;
|
||||
|
||||
#define MEM_F_HARDCLIP 0x1
|
||||
#define MEM_F_PE 0x2
|
||||
#define MEM_F_NOPAIRING 0x4
|
||||
#define MEM_F_ALL 0x8
|
||||
#define MEM_F_NO_MULTI 0x10
|
||||
|
||||
/* Alignment parameters. */
typedef struct {
	int a;                  // match score
	int b;                  // mismatch penalty
	int q;                  // gap open penalty
	int r;                  // gap extension penalty; a gap of size k costs q+k*r
	int w;                  // band width
	int flag;               // see MEM_F_* macros
	int min_seed_len;       // minimum seed length
	float split_factor;     // split into a seed if MEM is longer than min_seed_len*split_factor
	int split_width;        // split into a seed if its occurrence is smaller than this value
	int max_occ;            // skip a seed if its occurrence is larger than this value
	int max_chain_gap;      // do not chain seed if it is max_chain_gap-bp away from the closest seed
	int n_threads;          // number of threads
	int chunk_size;         // process chunk_size-bp sequences in a batch
	float mask_level;       // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
	float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
	int pen_unpaired;       // phred-scaled penalty for unpaired reads
	int max_ins;            // when estimating insert size distribution, skip pairs with insert longer than this value
	int max_matesw;         // perform maximally max_matesw rounds of mate-SW for each end
	int8_t mat[25];         // scoring matrix; mat[0] == 0 if unset
} mem_opt_t;
|
||||
|
||||
/* One aligned region of a query sequence. */
typedef struct {
	int64_t rb;    // [rb,re): reference sequence in the alignment
	int64_t re;
	int qb;        // [qb,qe): query sequence in the alignment
	int qe;
	int score;     // best SW score
	int sub;       // 2nd best SW score
	int csub;      // SW score of a tandem hit
	int sub_n;     // approximate number of suboptimal hits
	int seedcov;   // length of regions covered by seeds
	int secondary; // index of the parent hit shadowing the current hit; <0 if primary
} mem_alnreg_t;
|
||||
|
||||
/* Insert size statistics of one pair orientation, as filled in by mem_pestat(). */
typedef struct {
	int low;    // lower insert size boundary for proper pairs
	int high;   // upper insert size boundary for proper pairs
	int failed; // non-zero if the orientation is to be skipped
	double avg; // mean insert size
	double std; // standard deviation of the insert size
} mem_pestat_t;
|
||||
|
||||
/* A hit in the form consumed by the SAM writer. */
typedef struct {
	int64_t rb;  // reference begin
	int64_t re;  // reference end
	int qb;      // query begin
	int qe;      // query end
	int flag;    // flag bits
	int qual;    // mapping quality
	// optional info
	int score;
	int sub;
} bwahit_t;
|
||||
|
||||
/* Growable array of alignment regions, used with the kv_* macros. */
typedef struct {
	size_t n;        // number of elements in use
	size_t m;        // number of elements allocated
	mem_alnreg_t *a; // element storage
} mem_alnreg_v;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
smem_i *smem_itr_init(const bwt_t *bwt);
|
||||
void smem_itr_destroy(smem_i *itr);
|
||||
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
|
||||
const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width);
|
||||
|
||||
mem_opt_t *mem_opt_init(void);
|
||||
void mem_fill_scmat(int a, int b, int8_t mat[25]);
|
||||
|
||||
/**
|
||||
* Align a batch of sequences and generate the alignments in the SAM format
|
||||
*
|
||||
 * This routine requires $seqs[i].{l_seq,seq,name} and writes $seqs[i].sam.
|
||||
* Note that $seqs[i].sam may consist of several SAM lines if the
|
||||
* corresponding sequence has multiple primary hits.
|
||||
*
|
||||
* In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
|
||||
* sequences must be interleaved: $n must be an even number and the 2i-th
|
||||
* sequence and the (2i+1)-th sequence constitute a read pair. In this
|
||||
* mode, there should be enough (typically >50) unique pairs for the
|
||||
* routine to infer the orientation and insert size.
|
||||
*
|
||||
* @param opt alignment parameters
|
||||
* @param bwt FM-index of the reference sequence
|
||||
* @param bns Information of the reference
|
||||
* @param pac 2-bit encoded reference
|
||||
* @param n number of query sequences
|
||||
* @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
|
||||
*/
|
||||
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
|
||||
|
||||
/**
|
||||
* Find the aligned regions for one query sequence
|
||||
*
|
||||
* Note that this routine does not generate CIGAR. CIGAR should be
|
||||
* generated later by bwa_gen_cigar() defined in bwa.c.
|
||||
*
|
||||
* @param opt alignment parameters
|
||||
* @param bwt FM-index of the reference sequence
|
||||
* @param bns Information of the reference
|
||||
* @param pac 2-bit encoded reference
|
||||
* @param l_seq length of query sequence
|
||||
* @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied
|
||||
*
|
||||
* @return list of aligned regions.
|
||||
*/
|
||||
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
|
||||
|
||||
/**
|
||||
* Infer the insert size distribution from interleaved alignment regions
|
||||
*
|
||||
* This function can be called after mem_align1(), as long as paired-end
|
||||
* reads are properly interleaved.
|
||||
*
|
||||
* @param opt alignment parameters
|
||||
* @param l_pac length of concatenated reference sequence
|
||||
* @param n number of query sequences; must be an even number
|
||||
* @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
|
||||
* @param pes inferred insert size distribution (output)
|
||||
*/
|
||||
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,314 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include "kstring.h"
|
||||
#include "bwamem.h"
|
||||
#include "kvec.h"
|
||||
#include "utils.h"
|
||||
#include "ksw.h"
|
||||
|
||||
#define MIN_RATIO 0.8
|
||||
#define MIN_DIR_CNT 10
|
||||
#define MIN_DIR_RATIO 0.05
|
||||
#define OUTLIER_BOUND 2.0
|
||||
#define MAPPING_BOUND 3.0
|
||||
#define MAX_STDDEV 4.0
|
||||
|
||||
/*
 * Infer the relative orientation of two positions.
 *
 * Positions >= l_pac lie on the reverse strand. When the strands differ,
 * b2 is first projected onto the strand of b1.
 *
 * @param l_pac  length of the forward reference
 * @param b1     position of read 1
 * @param b2     position of read 2
 * @param dist   (out) absolute distance between b1 and the projected b2
 *
 * @return bit 0 is set when the strands differ; both bits are flipped
 *         when the projected b2 does not lie beyond b1
 */
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
{
	int same_strand = ((b1 >= l_pac) == (b2 >= l_pac));
	// coordinate of read 2 on the read 1 strand
	int64_t mate = same_strand? b2 : (l_pac<<1) - 1 - b2;
	int dir = same_strand? 0 : 1;

	if (mate > b1) {
		*dist = mate - b1;
	} else {
		*dist = b1 - mate;
		dir ^= 3;
	}
	return dir;
}
|
||||
|
||||
/*
 * Score of the first region (in array order) that significantly overlaps
 * the top region r->a[0] on the query, i.e. whose overlap is at least
 * opt->mask_level times the length of the shorter of the two regions.
 * If there is none, opt->min_seed_len * opt->a is returned.
 */
static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
{
	int j;
	for (j = 1; j < r->n; ++j) { // choose unique alignment
		const mem_alnreg_t *top = &r->a[0];
		const mem_alnreg_t *cur = &r->a[j];
		int ov_beg = cur->qb > top->qb? cur->qb : top->qb;
		int ov_end = cur->qe < top->qe? cur->qe : top->qe;
		int len_cur, len_top, shorter;

		if (ov_end <= ov_beg) continue; // no overlap on the query
		len_cur = cur->qe - cur->qb;
		len_top = top->qe - top->qb;
		shorter = len_cur < len_top? len_cur : len_top;
		if (ov_end - ov_beg >= shorter * opt->mask_level)
			return cur->score; // significant overlap
	}
	return opt->min_seed_len * opt->a;
}
|
||||
|
||||
/**
 * Infer the insert size distribution from interleaved alignment regions.
 *
 * For each of the four orientations (FF, FR, RF, RR) the insert sizes of
 * pairs whose both ends look unique are collected. An orientation with
 * fewer than MIN_DIR_CNT such pairs, or with fewer than MIN_DIR_RATIO times
 * the pairs of the most populated orientation, is marked as failed. For the
 * others the mean, the standard deviation and the boundaries of proper
 * pairs are estimated. Progress is reported on stderr.
 *
 * @param opt    alignment parameters
 * @param l_pac  length of concatenated reference sequence
 * @param n      number of query sequences; must be an even number
 * @param regs   region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
 * @param pes    inferred insert size distribution (output)
 */
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
{
	int i, d, max;
	uint64_v isize[4];
	memset(pes, 0, 4 * sizeof(mem_pestat_t));
	memset(isize, 0, sizeof(isize)); // sized from the array itself rather than from an unrelated vector type
	for (i = 0; i < n>>1; ++i) {
		int dir;
		int64_t is;
		mem_alnreg_v *r[2];
		r[0] = (mem_alnreg_v*)&regs[i<<1|0];
		r[1] = (mem_alnreg_v*)&regs[i<<1|1];
		if (r[0]->n == 0 || r[1]->n == 0) continue; // both ends must have a hit
		// skip the pair if either end has a competing hit scoring above MIN_RATIO of its best hit
		if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
		if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
		dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
		if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
	}
	// the counters are size_t; cast them to match the %ld conversions
	if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, (long)isize[0].n, (long)isize[1].n, (long)isize[2].n, (long)isize[3].n);
	for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
		mem_pestat_t *r = &pes[d];
		uint64_v *q = &isize[d];
		int p25, p50, p75, x;
		if (q->n < MIN_DIR_CNT) {
			fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
			r->failed = 1;
			free(q->a); // the few collected insert sizes would otherwise leak
			continue;
		} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
		ks_introsort_64(q->n, q->a);
		p25 = q->a[(int)(.25 * q->n + .499)];
		p50 = q->a[(int)(.50 * q->n + .499)];
		p75 = q->a[(int)(.75 * q->n + .499)];
		// first pass: boundaries used to exclude outliers from the mean and std.dev
		r->low  = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
		if (r->low < 1) r->low = 1;
		r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
		fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
		fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
		for (i = x = 0, r->avg = 0; i < q->n; ++i)
			if (q->a[i] >= r->low && q->a[i] <= r->high)
				r->avg += q->a[i], ++x;
		r->avg /= x;
		for (i = 0, r->std = 0; i < q->n; ++i)
			if (q->a[i] >= r->low && q->a[i] <= r->high)
				r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
		r->std = sqrt(r->std / x);
		fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
		// second pass: boundaries for proper pairs, widened to at least avg +/- MAX_STDDEV * std
		r->low  = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
		r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
		if (r->low  > r->avg - MAX_STDDEV * r->std) r->low  = (int)(r->avg - MAX_STDDEV * r->std + .499);
		// compare against avg + MAX_STDDEV * std: testing the lower limit here would never widen the upper bound
		if (r->high < r->avg + MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
		if (r->low < 1) r->low = 1;
		fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
		free(q->a);
	}
	// only the counts isize[d].n are read below; the arrays have been freed
	for (d = 0, max = 0; d < 4; ++d)
		max = max > isize[d].n? max : isize[d].n;
	for (d = 0; d < 4; ++d)
		if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
			pes[d].failed = 1;
			fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
		}
}
|
||||
|
||||
/**
 * Mate rescue: try to align the mate by Smith-Waterman in the windows
 * implied by one alignment and the insert size statistics.
 *
 * An orientation is skipped when it has failed in $pes or when $ma already
 * holds a hit at a consistent distance in that orientation. A hit found by
 * SW with a score of at least opt->min_seed_len is inserted into $ma,
 * keeping the array ordered by decreasing score.
 *
 * @param opt    alignment parameters
 * @param l_pac  length of the forward reference
 * @param pac    2-bit encoded reference
 * @param pes    insert size statistics for the four orientations
 * @param a      the alignment used as the anchor
 * @param l_ms   length of the mate sequence
 * @param ms     mate sequence in numeric encoding
 * @param ma     alignment regions of the mate; may grow
 *
 * @return number of windows in which SW has been performed
 */
int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
{
	int i, dir, n_sw = 0;
	int done[4]; // done[dir] != 0: no SW needed for this orientation

	for (dir = 0; dir < 4; ++dir)
		done[dir] = pes[dir].failed? 1 : 0;
	for (i = 0; i < ma->n; ++i) { // check which orientation has been found
		int64_t dist;
		dir = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
		if (dist >= pes[dir].low && dist <= pes[dir].high)
			done[dir] = 1;
	}
	if (done[0] + done[1] + done[2] + done[3] == 4) return 0; // consistent pair exist; no need to perform SW

	for (dir = 0; dir < 4; ++dir) {
		int is_rev, is_larger;
		uint8_t *seq, *rev = 0, *ref;
		int64_t win_lo, win_hi, rb, re, len;

		if (done[dir]) continue;
		is_rev = (dir>>1 != (dir&1)); // whether to reverse complement the mate
		is_larger = !(dir>>1);        // whether the mate has larger coordinate
		if (is_rev) {
			rev = xmalloc(l_ms); // this is the reverse complement of $ms
			for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4;
			seq = rev;
		} else {
			seq = (uint8_t*)ms;
		}
		// range of admissible mate positions relative to the anchor
		if (is_larger) {
			win_lo = a->rb + pes[dir].low;
			win_hi = a->rb + pes[dir].high;
		} else {
			win_lo = a->rb - pes[dir].high;
			win_hi = a->rb - pes[dir].low;
		}
		// make room for the mate length: at the end if on the same strand, at the start otherwise
		if (is_rev) {
			rb = win_lo - l_ms;
			re = win_hi;
		} else {
			rb = win_lo;
			re = win_hi + l_ms;
		}
		if (rb < 0) rb = 0;
		if (re > l_pac<<1) re = l_pac<<1;
		ref = bns_get_seq(l_pac, pac, rb, re, &len);
		if (len == re - rb) { // no funny things happening
			kswr_t aln;
			mem_alnreg_t b;
			int xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len;
			aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0);
			memset(&b, 0, sizeof(mem_alnreg_t));
			if (aln.score >= opt->min_seed_len) {
				int pos;
				if (is_rev) { // map the coordinates back to the original mate and strand
					b.qb = l_ms - (aln.qe + 1);
					b.qe = l_ms - aln.qb;
					b.rb = (l_pac<<1) - (rb + aln.te + 1);
					b.re = (l_pac<<1) - (rb + aln.tb);
				} else {
					b.qb = aln.qb;
					b.qe = aln.qe + 1;
					b.rb = rb + aln.tb;
					b.re = rb + aln.te + 1;
				}
				b.score = aln.score;
				b.csub = aln.score2;
				b.secondary = -1;
				b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1;
				kv_push(mem_alnreg_t, *ma, b); // make room for a new element
				// move b s.t. ma is sorted
				for (pos = 0; pos < ma->n - 1; ++pos) // find the insertion point
					if (ma->a[pos].score < b.score) break;
				for (i = ma->n - 1; i > pos; --i) ma->a[i] = ma->a[i-1];
				ma->a[pos] = b;
			}
			++n_sw;
		}
		free(rev);
		free(ref);
	}
	return n_sw;
}
|
||||
|
||||
/**
 * Find the best-scoring pair of hits at a consistent distance.
 *
 * Every hit of both reads is encoded as (forward position, packed info) and
 * the hits are sorted with ks_introsort_128(). Packed info, from the top:
 * score (upper 32 bits), index in a[] (from bit 2), strand (bit 1) and read
 * number (bit 0). Each hit is then compared with earlier hits of the other
 * read, and every combination whose distance lies within
 * [pes[].low, pes[].high] is scored and recorded.
 *
 * @param opt    alignment parameters
 * @param l_pac  length of the forward reference
 * @param pac    2-bit encoded reference (not used here)
 * @param pes    insert size statistics for the four orientations
 * @param s      the two reads (not used here)
 * @param a      alignment regions of the two reads
 * @param id     mixed into the hash stored in the low bits of the sort key
 * @param sub    (out) score of the second best pair; 0 if none
 * @param n_sub  (out) number of further pairs scoring within
 *               max(a+b, q+r) of $sub
 * @param z      (out) indices in a[0] and a[1] of the hits in the best pair;
 *               left untouched when no pair is found
 *
 * @return score of the best pair; 0 if no pair is found
 */
int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2])
{
	pair64_v hits, pairs;
	int r, i, k, ret;
	int last[4]; // last[t] keeps the last hit whose low two info bits equal t; -1 if none

	kv_init(hits); kv_init(pairs);
	for (r = 0; r < 2; ++r) { // loop through read number
		for (i = 0; i < a[r].n; ++i) {
			const mem_alnreg_t *e = &a[r].a[i];
			pair64_t key;
			if (e->rb < l_pac) key.x = e->rb;
			else key.x = (l_pac<<1) - 1 - e->rb; // forward position
			key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r;
			kv_push(pair64_t, hits, key);
		}
	}
	ks_introsort_128(hits.n, hits.a);
	for (r = 0; r < 4; ++r) last[r] = -1;

	for (i = 0; i < hits.n; ++i) {
		uint64_t cur = hits.a[i].y;
		for (r = 0; r < 2; ++r) { // loop through direction
			int dir = r<<1 | (cur>>1&1);
			int which;
			if (pes[dir].failed) continue; // invalid orientation
			which = r<<1 | ((cur&1)^1); // the other read, with bit 1 equal to r
			if (last[which] < 0) continue; // no previous hits
			for (k = last[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt)
				int64_t dist;
				int q;
				double ns;
				pair64_t *p;
				if ((hits.a[k].y&3) != which) continue;
				dist = (int64_t)hits.a[i].x - hits.a[k].x;
				if (dist > pes[dir].high) break; // earlier hits are even farther away
				if (dist < pes[dir].low) continue;
				ns = (dist - pes[dir].avg) / pes[dir].std;
				// sum of the two scores plus a term that decreases as the insert size deviates from the mean
				q = (int)((hits.a[i].y>>32) + (hits.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4)
				if (q < 0) q = 0;
				p = kv_pushp(pair64_t, pairs);
				p->y = (uint64_t)k<<32 | i;
				p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU);
			}
		}
		last[cur&3] = i;
	}

	if (pairs.n == 0) {
		ret = 0;
		*sub = 0;
		*n_sub = 0;
	} else { // found at least one proper pair
		int tol = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
		const pair64_t *best;
		ks_introsort_128(pairs.n, pairs.a);
		best = &pairs.a[pairs.n-1];
		i = best->y >> 32; k = best->y << 32 >> 32;
		z[hits.a[i].y&1] = hits.a[i].y<<32>>34; // index of the best pair
		z[hits.a[k].y&1] = hits.a[k].y<<32>>34;
		ret = best->x >> 32;
		*sub = pairs.n > 1? pairs.a[pairs.n-2].x>>32 : 0;
		for (i = (long)pairs.n - 2, *n_sub = 0; i >= 0; --i)
			if (*sub - (int)(pairs.a[i].x>>32) <= tol) ++*n_sub;
	}
	free(pairs.a); free(hits.a);
	return ret;
}
|
||||
|
||||
/**
 * Generate the SAM records of one read pair.
 *
 * Mate-SW is first attempted from the best hits of either end. The two
 * ends are then paired with mem_pair() unless MEM_F_NOPAIRING is set. If
 * no pair is found, or if an end still has more than one primary hit, the
 * two ends are written independently with mem_sam_se().
 *
 * @param opt  alignment parameters
 * @param bns  information of the reference
 * @param pac  2-bit encoded reference
 * @param pes  insert size statistics for the four orientations
 * @param id   identifier of the pair, forwarded to mem_pair()
 * @param s    the two reads; s[0].sam and s[1].sam are set
 * @param a    alignment regions of the two reads; may be extended by mate-SW
 *
 * @return number of mate-SW runs performed, as reported by mem_matesw()
 */
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
{
	extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a);
	extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m);
	extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
	extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h);
	extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m);

	int n = 0, i, j, z[2], o, subo, n_sub;
	int is_multi[2], q_pe, q_se[2], score_un, extra_flag = 1;
	kstring_t str;
	mem_alnreg_v b[2];
	bwahit_t h[2];

	str.l = str.m = 0; str.s = 0;

	// perform SW for the best alignment; work on copies as mem_matesw() may extend a[]
	kv_init(b[0]); kv_init(b[1]);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < a[i].n; ++j) {
			if (a[i].a[j].score < a[i].a[0].score - opt->pen_unpaired) continue;
			kv_push(mem_alnreg_t, b[i], a[i].a[j]);
		}
	}
	for (i = 0; i < 2; ++i) {
		int mate = !i;
		for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
			n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[mate].l_seq, (uint8_t*)s[mate].seq, &a[mate]);
	}
	free(b[0].a); free(b[1].a);
	mem_mark_primary_se(opt, a[0].n, a[0].a);
	mem_mark_primary_se(opt, a[1].n, a[1].a);

	// pairing single-end hits
	if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
	if (a[0].n == 0 || a[1].n == 0) goto no_pairing;
	o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z);
	if (o <= 0) goto no_pairing;

	// check if an end has multiple hits even after mate-SW
	for (i = 0; i < 2; ++i) {
		for (j = 1; j < a[i].n; ++j)
			if (a[i].a[j].secondary < 0) break;
		is_multi[i] = j < a[i].n? 1 : 0;
	}
	if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score

	// compute mapQ for the best SE hit
	score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
	if (subo < score_un) subo = score_un;
	q_pe = (o - subo) * 6;
	if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
	if (q_pe < 0) q_pe = 0;
	if (q_pe > 60) q_pe = 60;

	// the following assumes no split hits
	if (o > score_un) { // paired alignment is preferred
		extra_flag |= 2;
		for (i = 0; i < 2; ++i) {
			mem_alnreg_t *c = &a[i].a[z[i]];
			int cap;
			if (c->secondary >= 0) {
				c->sub = a[i].a[c->secondary].score;
				c->secondary = -2;
			}
			q_se[i] = mem_approx_mapq_se(opt, c);
			// raise towards the pair mapQ, by at most 40
			if (q_se[i] <= q_pe)
				q_se[i] = q_pe < q_se[i] + 40? q_pe : q_se[i] + 40;
			// cap at the tandem repeat score
			cap = (c->score - c->csub) * 6;
			if (q_se[i] >= cap) q_se[i] = cap;
		}
	} else { // the unpaired alignment is preferred
		z[0] = z[1] = 0;
		q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
		q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
	}
	for (i = 0; i < 2; ++i) {
		mem_alnreg2hit(&a[i].a[z[i]], &h[i]);
		h[i].qual = q_se[i];
		h[i].flag |= (i? 0x80 : 0x40) | extra_flag;
		bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
	}
	bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]);
	s[0].sam = xstrdup(str.s);
	str.l = 0; // reuse the buffer for the second read, which then takes ownership of it
	bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]);
	s[1].sam = str.s;
	return n;

no_pairing:
	for (i = 0; i < 2; ++i) {
		if (a[i].n == 0) {
			h[i].rb = h[i].re = -1;
			continue;
		}
		mem_alnreg2hit(&a[i].a[0], &h[i]);
		bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
	}
	mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]);
	mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]);
	return n;
}
|
||||
105
bwape.c
105
bwape.c
|
|
@ -10,6 +10,7 @@
|
|||
#include "utils.h"
|
||||
#include "stdaln.h"
|
||||
#include "bwase.h"
|
||||
#include "bwa.h"
|
||||
|
||||
typedef struct {
|
||||
int n;
|
||||
|
|
@ -21,24 +22,15 @@ typedef struct {
|
|||
bwtint_t low, high, high_bayesian;
|
||||
} isize_info_t;
|
||||
|
||||
/* A pair of 64-bit integers. */
typedef struct {
	uint64_t x;
	uint64_t y;
} b128_t;
|
||||
|
||||
#define b128_lt(a, b) ((a).x < (b).x)
|
||||
#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
|
||||
#define b128_hash(a) ((uint32_t)(a).x)
|
||||
|
||||
#include "khash.h"
|
||||
KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq)
|
||||
|
||||
#include "ksort.h"
|
||||
KSORT_INIT(b128, b128_t, b128_lt)
|
||||
KSORT_INIT_GENERIC(uint64_t)
|
||||
KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
|
||||
|
||||
typedef struct {
|
||||
kvec_t(b128_t) arr;
|
||||
kvec_t(b128_t) pos[2];
|
||||
pair64_v arr;
|
||||
pair64_v pos[2];
|
||||
kvec_t(bwt_aln1_t) aln[2];
|
||||
} pe_data_t;
|
||||
|
||||
|
|
@ -69,19 +61,6 @@ pe_opt_t *bwa_init_pe_opt()
|
|||
po->ap_prior = 1e-5;
|
||||
return po;
|
||||
}
|
||||
|
||||
/* Mix a 64-bit key with a fixed sequence of shifts, additions and xors. */
static inline uint64_t hash_64(uint64_t key)
{
	uint64_t h = key;
	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
	return h;
}
|
||||
/*
|
||||
static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
|
||||
{
|
||||
|
|
@ -120,7 +99,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double
|
|||
free(isizes);
|
||||
return -1;
|
||||
}
|
||||
ks_introsort(uint64_t, tot, isizes);
|
||||
ks_introsort_64(tot, isizes);
|
||||
p25 = isizes[(int)(tot*0.25 + 0.5)];
|
||||
p50 = isizes[(int)(tot*0.50 + 0.5)];
|
||||
p75 = isizes[(int)(tot*0.75 + 0.5)];
|
||||
|
|
@ -170,7 +149,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
|
|||
{
|
||||
int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
|
||||
uint64_t o_score, subo_score;
|
||||
b128_t last_pos[2][2], o_pos[2];
|
||||
pair64_t last_pos[2][2], o_pos[2];
|
||||
max_len = p[0]->full_len;
|
||||
if (max_len < p[1]->full_len) max_len = p[1]->full_len;
|
||||
if (low_bound < max_len) low_bound = max_len;
|
||||
|
|
@ -206,11 +185,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
|
|||
|
||||
o_score = subo_score = (uint64_t)-1;
|
||||
o_n = subo_n = 0;
|
||||
ks_introsort(b128, d->arr.n, d->arr.a);
|
||||
ks_introsort_128(d->arr.n, d->arr.a);
|
||||
for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
|
||||
if (opt->type == BWA_PET_STD) {
|
||||
for (i = 0; i < d->arr.n; ++i) {
|
||||
b128_t x = d->arr.a[i];
|
||||
pair64_t x = d->arr.a[i];
|
||||
int strand = x.y>>1&1;
|
||||
if (strand == 1) { // reverse strand, then check
|
||||
int y = 1 - (x.y&1);
|
||||
|
|
@ -221,19 +200,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
|
|||
last_pos[x.y&1][1] = x;
|
||||
}
|
||||
}
|
||||
} else if (opt->type == BWA_PET_SOLID) {
|
||||
for (i = 0; i < d->arr.n; ++i) {
|
||||
b128_t x = d->arr.a[i];
|
||||
int strand = x.y>>1&1;
|
||||
if ((strand^x.y)&1) { // push
|
||||
int y = 1 - (x.y&1);
|
||||
__pairing_aux(last_pos[y][1], x);
|
||||
__pairing_aux(last_pos[y][0], x);
|
||||
} else { // check
|
||||
last_pos[x.y&1][0] = last_pos[x.y&1][1];
|
||||
last_pos[x.y&1][1] = x;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "[paring] not implemented yet!\n");
|
||||
exit(1);
|
||||
|
|
@ -345,7 +311,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
|
|||
if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
|
||||
&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
|
||||
{ // only when both ends mapped
|
||||
b128_t x;
|
||||
pair64_t x;
|
||||
int j, k;
|
||||
long long n_occ[2];
|
||||
for (j = 0; j < 2; ++j) {
|
||||
|
|
@ -360,7 +326,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
|
|||
bwt_aln1_t *r = d->aln[j].a + k;
|
||||
bwtint_t l;
|
||||
if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
|
||||
b128_t key;
|
||||
pair64_t key;
|
||||
int ret;
|
||||
key.x = r->k; key.y = r->l;
|
||||
khint_t iter = kh_put(b128, g_hash, key, &ret);
|
||||
|
|
@ -377,14 +343,14 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
|
|||
for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
|
||||
x.x = kh_val(g_hash, iter).a[l]>>1;
|
||||
x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
|
||||
kv_push(b128_t, d->arr, x);
|
||||
kv_push(pair64_t, d->arr, x);
|
||||
}
|
||||
} else { // then calculate on the fly
|
||||
for (l = r->k; l <= r->l; ++l) {
|
||||
int strand;
|
||||
x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
|
||||
x.y = k<<2 | strand<<1 | j;
|
||||
kv_push(b128_t, d->arr, x);
|
||||
kv_push(pair64_t, d->arr, x);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -576,11 +542,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
|
|||
++n_tot[is_singleton];
|
||||
cigar[0] = cigar[1] = 0;
|
||||
n_cigar[0] = n_cigar[1] = 0;
|
||||
if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
|
||||
if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
|
||||
for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
|
||||
ubyte_t *seq;
|
||||
if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
|
||||
if (popt->type == BWA_PET_STD) {
|
||||
{ // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads
|
||||
if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
|
||||
__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
|
||||
seq = p[k]->rseq;
|
||||
|
|
@ -589,17 +555,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
|
|||
seq = p[k]->seq;
|
||||
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
|
||||
}
|
||||
} else { // BWA_PET_SOLID
|
||||
if (p[1-k]->strand == 0) { // R3-F3 pairing
|
||||
if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
|
||||
else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
|
||||
seq = p[k]->rseq;
|
||||
seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
|
||||
} else { // F3-R3 pairing
|
||||
if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
|
||||
else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
|
||||
seq = p[k]->seq;
|
||||
}
|
||||
}
|
||||
// perform SW alignment
|
||||
cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
|
||||
|
|
@ -656,14 +611,14 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
|
|||
return pacseq;
|
||||
}
|
||||
|
||||
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
|
||||
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
|
||||
{
|
||||
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
||||
int i, j, n_seqs, tot_seqs = 0;
|
||||
bwa_seq_t *seqs[2];
|
||||
bwa_seqio_t *ks[2];
|
||||
clock_t t;
|
||||
bntseq_t *bns, *ntbns = 0;
|
||||
bntseq_t *bns;
|
||||
FILE *fp_sa[2];
|
||||
gap_opt_t opt, opt0;
|
||||
khint_t iter;
|
||||
|
|
@ -688,10 +643,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
opt0 = opt;
|
||||
err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
|
||||
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
|
||||
if (!(opt.mode & BWA_MODE_COMPREAD)) {
|
||||
popt->type = BWA_PET_SOLID;
|
||||
ntbns = bwa_open_nt(prefix);
|
||||
} else { // for Illumina alignment only
|
||||
{ // for Illumina alignment only
|
||||
if (popt->is_preload) {
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
|
||||
|
|
@ -702,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
}
|
||||
|
||||
// core loop
|
||||
bwa_print_sam_SQ(bns);
|
||||
bwa_print_sam_hdr(bns, rg_line);
|
||||
bwa_print_sam_PG();
|
||||
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
|
||||
int cnt_chg;
|
||||
|
|
@ -724,7 +676,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
|
||||
fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
|
||||
for (j = 0; j < 2; ++j)
|
||||
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
|
||||
bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
if (pac == 0) free(pacseq);
|
||||
|
||||
|
|
@ -749,7 +701,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
|
||||
// destroy
|
||||
bns_destroy(bns);
|
||||
if (ntbns) bns_destroy(ntbns);
|
||||
for (i = 0; i < 2; ++i) {
|
||||
bwa_seq_close(ks[i]);
|
||||
err_fclose(fp_sa[i]);
|
||||
|
|
@ -764,21 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
|
||||
int bwa_sai2sam_pe(int argc, char *argv[])
|
||||
{
|
||||
extern char *bwa_rg_line, *bwa_rg_id;
|
||||
extern int bwa_set_rg(const char *s);
|
||||
extern char *bwa_infer_prefix(const char *hint);
|
||||
int c;
|
||||
pe_opt_t *popt;
|
||||
char *prefix;
|
||||
char *prefix, *rg_line = 0;
|
||||
|
||||
popt = bwa_init_pe_opt();
|
||||
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'r':
|
||||
if (bwa_set_rg(optarg) < 0) {
|
||||
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
|
||||
break;
|
||||
case 'a': popt->max_isize = atoi(optarg); break;
|
||||
case 'o': popt->max_occ = atoi(optarg); break;
|
||||
|
|
@ -812,13 +757,11 @@ int bwa_sai2sam_pe(int argc, char *argv[])
|
|||
fprintf(stderr, "\n");
|
||||
return 1;
|
||||
}
|
||||
if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
|
||||
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
|
||||
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
|
||||
free(bwa_rg_line); free(bwa_rg_id);
|
||||
return 0;
|
||||
}
|
||||
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt);
|
||||
free(bwa_rg_line); free(bwa_rg_id); free(prefix);
|
||||
free(popt);
|
||||
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
|
||||
free(prefix); free(popt);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
131
bwase.c
131
bwase.c
|
|
@ -10,9 +10,9 @@
|
|||
#include "bntseq.h"
|
||||
#include "utils.h"
|
||||
#include "kstring.h"
|
||||
#include "bwa.h"
|
||||
|
||||
int g_log_n[256];
|
||||
char *bwa_rg_line, *bwa_rg_id;
|
||||
|
||||
void bwa_print_sam_PG();
|
||||
|
||||
|
|
@ -71,8 +71,8 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
|
|||
}
|
||||
rest -= q->l - q->k + 1;
|
||||
} else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here.
|
||||
int j, i, k;
|
||||
for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
|
||||
int j, i;
|
||||
for (j = rest, i = q->l - q->k + 1; j > 0; --j) {
|
||||
double p = 1.0, x = drand48();
|
||||
while (x < p) p -= p * j / (i--);
|
||||
s->multi[z].pos = q->l - i;
|
||||
|
|
@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s)
|
|||
s->len = s->full_len;
|
||||
}
|
||||
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
|
||||
{
|
||||
ubyte_t *pacseq, *ntpac = 0;
|
||||
ubyte_t *pacseq;
|
||||
int i, j;
|
||||
kstring_t *str;
|
||||
|
||||
if (ntbns) { // in color space
|
||||
ntpac = (ubyte_t*)xcalloc(ntbns->l_pac/4+1, 1);
|
||||
err_rewind(ntbns->fp_pac);
|
||||
err_fread_noeof(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
|
||||
}
|
||||
|
||||
if (!_pacseq) {
|
||||
pacseq = (ubyte_t*)xcalloc(bns->l_pac/4+1, 1);
|
||||
err_rewind(bns->fp_pac);
|
||||
|
|
@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
|
|||
s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
|
||||
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
|
||||
}
|
||||
#if 0
|
||||
if (ntbns) { // in color space
|
||||
for (i = 0; i < n_seqs; ++i) {
|
||||
bwa_seq_t *s = seqs + i;
|
||||
bwa_cs2nt_core(s, bns->l_pac, ntpac);
|
||||
for (j = 0; j < s->n_multi; ++j) {
|
||||
bwt_multi1_t *q = s->multi + j;
|
||||
int n_cigar;
|
||||
if (q->gap == 0) continue;
|
||||
free(q->cigar);
|
||||
q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
|
||||
(q->strand? 1 : -1) * q->gap, &n_cigar, 0);
|
||||
q->n_cigar = n_cigar;
|
||||
}
|
||||
if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
|
||||
free(s->cigar);
|
||||
s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
|
||||
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// generate MD tag
|
||||
str = (kstring_t*)xcalloc(1, sizeof(kstring_t));
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
|
|
@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
|
|||
if (s->type != BWA_TYPE_NO_MATCH) {
|
||||
int nm;
|
||||
s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
|
||||
bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
|
||||
bns->l_pac, pacseq, str, &nm);
|
||||
s->nm = nm;
|
||||
}
|
||||
}
|
||||
free(str->s); free(str);
|
||||
|
||||
// correct for trimmed reads
|
||||
if (!ntbns) // trimming is only enabled for Illumina reads
|
||||
for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
|
||||
for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
|
||||
|
||||
if (!_pacseq) free(pacseq);
|
||||
free(ntpac);
|
||||
}
|
||||
|
||||
int64_t pos_end(const bwa_seq_t *p)
|
||||
|
|
@ -462,11 +432,11 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
|
|||
|
||||
// print mate coordinate
|
||||
if (mate && mate->type != BWA_TYPE_NO_MATCH) {
|
||||
int m_seqid, m_is_N;
|
||||
int m_seqid;
|
||||
long long isize;
|
||||
am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
|
||||
// redundant calculation here, but should not matter too much
|
||||
m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
|
||||
bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
|
||||
err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
|
||||
isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
|
||||
if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
|
||||
|
|
@ -482,7 +452,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
|
|||
err_printf("%s", p->qual);
|
||||
} else err_printf("*");
|
||||
|
||||
if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
|
||||
if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
|
||||
if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
|
||||
if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
|
||||
if (p->type != BWA_TYPE_NO_MATCH) {
|
||||
|
|
@ -532,74 +502,20 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
|
|||
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
|
||||
err_printf("%s", p->qual);
|
||||
} else err_printf("*");
|
||||
if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
|
||||
if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
|
||||
if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
|
||||
if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
|
||||
err_putchar('\n');
|
||||
}
|
||||
}
|
||||
|
||||
/* Restore the sequence annotation whose file prefix is "<prefix>.nt".
 * Returns whatever bns_restore() yields for that prefix; the temporary
 * path buffer is released before returning. */
bntseq_t *bwa_open_nt(const char *prefix)
{
	const size_t l_prefix = strlen(prefix);
	// zero-filled and 10 bytes longer than prefix, so ".nt" plus the terminator always fit
	char *nt_prefix = (char*)xcalloc(l_prefix + 10, 1);
	bntseq_t *nt;

	strcpy(nt_prefix, prefix);
	strcpy(nt_prefix + l_prefix, ".nt");
	nt = bns_restore(nt_prefix);
	free(nt_prefix);
	return nt;
}
|
||||
|
||||
void bwa_print_sam_SQ(const bntseq_t *bns)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < bns->n_seqs; ++i)
|
||||
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
||||
if (bwa_rg_line) err_printf("%s\n", bwa_rg_line);
|
||||
}
|
||||
|
||||
void bwase_initialize()
|
||||
{
|
||||
int i;
|
||||
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
||||
}
|
||||
|
||||
/**
 * Decode backslash escape sequences in place.
 *
 * "\t", "\n", "\r" and "\\" (backslash followed by that letter) are replaced
 * by TAB, LF, CR and a single backslash respectively. A backslash followed by
 * any other character is dropped together with that character. A lone
 * backslash at the very end of the string is dropped as well.
 *
 * The decoded text is never longer than the input, so it is written over the
 * same buffer.
 *
 * @param s  NUL-terminated, writable string; modified in place
 * @return   s
 */
char *bwa_escape(char *s)
{
	char *p, *q; // p: read cursor, q: write cursor (q <= p at all times)
	for (p = q = s; *p; ++p) {
		if (*p == '\\') {
			++p; // look at the escaped character
			// A trailing backslash leaves p on the terminator. Stop here:
			// otherwise the loop's ++p would step past the end of the string
			// and keep copying whatever bytes follow it in memory.
			if (*p == '\0') break;
			if (*p == 't') *q++ = '\t';
			else if (*p == 'n') *q++ = '\n';
			else if (*p == 'r') *q++ = '\r';
			else if (*p == '\\') *q++ = '\\';
			// any other escaped character is silently discarded
		} else *q++ = *p;
	}
	*q = '\0';
	return s;
}
|
||||
|
||||
int bwa_set_rg(const char *s)
|
||||
{
|
||||
char *p, *q, *r;
|
||||
if (strstr(s, "@RG") != s) return -1;
|
||||
if (bwa_rg_line) free(bwa_rg_line);
|
||||
if (bwa_rg_id) free(bwa_rg_id);
|
||||
bwa_rg_line = xstrdup(s);
|
||||
bwa_rg_id = 0;
|
||||
bwa_escape(bwa_rg_line);
|
||||
p = strstr(bwa_rg_line, "\tID:");
|
||||
if (p == 0) return -1;
|
||||
p += 4;
|
||||
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
||||
bwa_rg_id = xcalloc(q - p + 1, 1);
|
||||
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
||||
*r++ = *q;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
|
||||
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
|
||||
{
|
||||
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
||||
int i, n_seqs, tot_seqs = 0, m_aln;
|
||||
|
|
@ -607,7 +523,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
bwa_seq_t *seqs;
|
||||
bwa_seqio_t *ks;
|
||||
clock_t t;
|
||||
bntseq_t *bns, *ntbns = 0;
|
||||
bntseq_t *bns;
|
||||
FILE *fp_sa;
|
||||
gap_opt_t opt;
|
||||
|
||||
|
|
@ -619,9 +535,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
|
||||
m_aln = 0;
|
||||
err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
||||
if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
|
||||
ntbns = bwa_open_nt(prefix);
|
||||
bwa_print_sam_SQ(bns);
|
||||
bwa_print_sam_hdr(bns, rg_line);
|
||||
//bwa_print_sam_PG();
|
||||
// set ks
|
||||
ks = bwa_open_reads(opt.mode, fn_fa);
|
||||
|
|
@ -648,7 +562,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
|
||||
bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
|
||||
bwa_refine_gapped(bns, n_seqs, seqs, 0);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] print alignments... ");
|
||||
|
|
@ -662,7 +576,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
|
||||
// destroy
|
||||
bwa_seq_close(ks);
|
||||
if (ntbns) bns_destroy(ntbns);
|
||||
bns_destroy(bns);
|
||||
err_fclose(fp_sa);
|
||||
free(aln);
|
||||
|
|
@ -670,17 +583,13 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
|
||||
int bwa_sai2sam_se(int argc, char *argv[])
|
||||
{
|
||||
extern char *bwa_infer_prefix(const char *hint);
|
||||
int c, n_occ = 3;
|
||||
char *prefix;
|
||||
char *prefix, *rg_line = 0;
|
||||
while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'h': break;
|
||||
case 'r':
|
||||
if (bwa_set_rg(optarg) < 0) {
|
||||
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
|
||||
break;
|
||||
case 'n': n_occ = atoi(optarg); break;
|
||||
case 'f': xreopen(optarg, "w", stdout); break;
|
||||
|
|
@ -692,12 +601,10 @@ int bwa_sai2sam_se(int argc, char *argv[])
|
|||
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
|
||||
return 1;
|
||||
}
|
||||
if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
|
||||
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
|
||||
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
|
||||
free(bwa_rg_line); free(bwa_rg_id);
|
||||
return 0;
|
||||
}
|
||||
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ);
|
||||
free(bwa_rg_line); free(bwa_rg_id);
|
||||
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
2
bwase.h
2
bwase.h
|
|
@ -14,7 +14,7 @@ extern "C" {
|
|||
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
|
||||
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
|
||||
// Refine the approximate position of the sequence to an actual placement for the sequence.
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
|
||||
// Backfill certain alignment properties mainly centering around number of matches.
|
||||
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
||||
// Calculate the end position of a read given a certain sequence.
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
#include "bamlite.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT(gzFile, err_gzread)
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
extern unsigned char nst_nt4_table[256];
|
||||
static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
|
||||
|
|
|
|||
180
bwt.c
180
bwt.c
|
|
@ -45,6 +45,14 @@ void bwt_gen_cnt_table(bwt_t *bwt)
|
|||
}
|
||||
}
|
||||
|
||||
// Compute the inverse CSA (inverse Psi) of row k: 0 when k is the primary
// row, otherwise L2[c] + Occ(c, k), where c is the BWT symbol of row k.
static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k)
{
	// rows after the primary are stored one cell earlier in the BWT array
	const bwtint_t cell = k - (k > bwt->primary);
	const bwtint_t c = bwt_B0(bwt, cell);
	const bwtint_t mapped = bwt->L2[c] + bwt_occ(bwt, k, c);

	// the value is computed unconditionally and only selected here
	if (k == bwt->primary) return 0;
	return mapped;
}
|
||||
|
||||
// bwt->bwt and bwt->occ must be precalculated
|
||||
void bwt_cal_sa(bwt_t *bwt, int intv)
|
||||
{
|
||||
|
|
@ -93,21 +101,20 @@ static inline int __occ_aux(uint64_t y, int c)
|
|||
|
||||
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
|
||||
{
|
||||
bwtint_t n, l, j;
|
||||
uint32_t *p;
|
||||
bwtint_t n;
|
||||
uint32_t *p, *end;
|
||||
|
||||
if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
|
||||
if (k == (bwtint_t)(-1)) return 0;
|
||||
if (k >= bwt->primary) --k; // because $ is not in bwt
|
||||
k -= (k >= bwt->primary); // because $ is not in bwt
|
||||
|
||||
// retrieve Occ at k/OCC_INTERVAL
|
||||
n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
|
||||
p += sizeof(bwtint_t); // jump to the start of the first BWT cell
|
||||
|
||||
// calculate Occ up to the last k/32
|
||||
j = k >> 5 << 5;
|
||||
for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2)
|
||||
n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
|
||||
end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1);
|
||||
for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
|
||||
|
||||
// calculate Occ
|
||||
n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
|
||||
|
|
@ -156,20 +163,20 @@ void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok,
|
|||
|
||||
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
|
||||
{
|
||||
bwtint_t l, j, x;
|
||||
uint32_t *p;
|
||||
bwtint_t x;
|
||||
uint32_t *p, tmp, *end;
|
||||
if (k == (bwtint_t)(-1)) {
|
||||
memset(cnt, 0, 4 * sizeof(bwtint_t));
|
||||
return;
|
||||
}
|
||||
if (k >= bwt->primary) --k; // because $ is not in bwt
|
||||
k -= (k >= bwt->primary); // because $ is not in bwt
|
||||
p = bwt_occ_intv(bwt, k);
|
||||
memcpy(cnt, p, 4 * sizeof(bwtint_t));
|
||||
p += sizeof(bwtint_t);
|
||||
j = k >> 4 << 4;
|
||||
for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
|
||||
x += __occ_aux4(bwt, *p);
|
||||
x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
|
||||
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
|
||||
end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop
|
||||
for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p);
|
||||
tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
|
||||
x += __occ_aux4(bwt, tmp) - (~k&15);
|
||||
cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
|
||||
}
|
||||
|
||||
|
|
@ -177,29 +184,30 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
|
|||
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
|
||||
{
|
||||
bwtint_t _k, _l;
|
||||
_k = (k >= bwt->primary)? k-1 : k;
|
||||
_l = (l >= bwt->primary)? l-1 : l;
|
||||
if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
|
||||
_k = k - (k >= bwt->primary);
|
||||
_l = l - (l >= bwt->primary);
|
||||
if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
|
||||
bwt_occ4(bwt, k, cntk);
|
||||
bwt_occ4(bwt, l, cntl);
|
||||
} else {
|
||||
bwtint_t i, j, x, y;
|
||||
uint32_t *p;
|
||||
if (k >= bwt->primary) --k; // because $ is not in bwt
|
||||
if (l >= bwt->primary) --l;
|
||||
bwtint_t x, y;
|
||||
uint32_t *p, tmp, *endk, *endl;
|
||||
k -= (k >= bwt->primary); // because $ is not in bwt
|
||||
l -= (l >= bwt->primary);
|
||||
p = bwt_occ_intv(bwt, k);
|
||||
memcpy(cntk, p, 4 * sizeof(bwtint_t));
|
||||
p += sizeof(bwtint_t);
|
||||
p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
|
||||
// prepare cntk[]
|
||||
j = k >> 4 << 4;
|
||||
for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
|
||||
x += __occ_aux4(bwt, *p);
|
||||
endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4));
|
||||
endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4));
|
||||
for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p);
|
||||
y = x;
|
||||
x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
|
||||
tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
|
||||
x += __occ_aux4(bwt, tmp) - (~k&15);
|
||||
// calculate cntl[] and finalize cntk[]
|
||||
j = l >> 4 << 4;
|
||||
for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
|
||||
y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
|
||||
for (; p < endl; ++p) y += __occ_aux4(bwt, *p);
|
||||
tmp = *p & ~((1U<<((~l&15)<<1)) - 1);
|
||||
y += __occ_aux4(bwt, tmp) - (~l&15);
|
||||
memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
|
||||
cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
|
||||
cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
|
||||
|
|
@ -273,7 +281,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
|
|||
}
|
||||
}
|
||||
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
{
|
||||
int i, j, c, ret;
|
||||
bwtintv_t ik, ok[4];
|
||||
|
|
@ -281,45 +289,45 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
|
|||
|
||||
mem->n = 0;
|
||||
if (q[x] > 3) return x + 1;
|
||||
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
|
||||
kv_init(a[0]); kv_init(a[1]);
|
||||
prev = tmpvec[0]? tmpvec[0] : &a[0];
|
||||
curr = tmpvec[1]? tmpvec[1] : &a[1];
|
||||
bwt_set_intv(bwt, q[x], ik);
|
||||
prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided
|
||||
curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1];
|
||||
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
||||
ik.info = x + 1;
|
||||
|
||||
for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
|
||||
if (q[i] < 4) {
|
||||
c = 3 - q[i];
|
||||
if (q[i] < 4) { // an A/C/G/T base
|
||||
c = 3 - q[i]; // complement of q[i]
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
if (ok[c].x[2] != ik.x[2]) // change of the interval size
|
||||
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
|
||||
kv_push(bwtintv_t, *curr, ik);
|
||||
if (ok[c].x[2] == 0) break; // cannot be extended
|
||||
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
|
||||
}
|
||||
ik = ok[c]; ik.info = i + 1;
|
||||
} else { // an ambiguous base
|
||||
kv_push(bwtintv_t, *curr, ik);
|
||||
break; // cannot be extended; in this case, i<len always stands
|
||||
break; // always terminate extension at an ambiguous base; in this case, i<len always stands
|
||||
}
|
||||
}
|
||||
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
|
||||
bwt_reverse_intvs(curr); // s.t. smaller intervals visited first
|
||||
bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
|
||||
ret = curr->a[0].info; // this will be the returned value
|
||||
swap = curr; curr = prev; prev = swap;
|
||||
|
||||
for (i = x - 1; i >= -1; --i) { // backward search for MEMs
|
||||
if (q[i] > 3) break;
|
||||
c = i < 0? 0 : q[i];
|
||||
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
|
||||
for (j = 0, curr->n = 0; j < prev->n; ++j) {
|
||||
bwtintv_t *p = &prev->a[j];
|
||||
bwt_extend(bwt, p, ok, 1);
|
||||
if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further
|
||||
if (curr->n == 0) { // curr->n to make sure there is no longer matches
|
||||
if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
||||
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
|
||||
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
|
||||
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
|
||||
kv_push(bwtintv_t, *mem, ik);
|
||||
}
|
||||
} // otherwise the match is contained in another longer match
|
||||
}
|
||||
if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) {
|
||||
} else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) {
|
||||
ok[c].info = p->info;
|
||||
kv_push(bwtintv_t, *curr, ok[c]);
|
||||
}
|
||||
|
|
@ -329,7 +337,85 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
|
|||
}
|
||||
bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
||||
|
||||
if (tmpvec[0] == 0) free(a[0].a);
|
||||
if (tmpvec[1] == 0) free(a[1].a);
|
||||
if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a);
|
||||
if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*************************
|
||||
* Read/write BWT and SA *
|
||||
*************************/
|
||||
|
||||
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
void bwt_dump_sa(const char *fn, const bwt_t *bwt)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
/* Load the sampled suffix array from file fn into an already restored bwt.
 *
 * The primary index and sequence length stored in the file are checked
 * against bwt via xassert(); the four stored L2 values are read and ignored.
 * On return bwt->sa_intv, bwt->n_sa and bwt->sa are set, with sa[0] == -1. */
void bwt_restore_sa(const char *fn, bwt_t *bwt)
{
	bwtint_t l2_ignored[4]; // scratch space for the header fields we do not need
	bwtint_t stored;
	FILE *in = xopen(fn, "rb");

	err_fread_noeof(&stored, sizeof(bwtint_t), 1, in);
	xassert(stored == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
	err_fread_noeof(l2_ignored, sizeof(bwtint_t), 4, in);
	err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, in);
	err_fread_noeof(&stored, sizeof(bwtint_t), 1, in);
	xassert(stored == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");

	// one sample every sa_intv positions, plus the entry at index 0
	bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
	bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t));
	bwt->sa[0] = -1; // not present in the file

	err_fread_noeof(&bwt->sa[1], sizeof(bwtint_t), bwt->n_sa - 1, in);
	err_fclose(in);
}
|
||||
|
||||
bwt_t *bwt_restore_bwt(const char *fn)
|
||||
{
|
||||
bwt_t *bwt;
|
||||
FILE *fp;
|
||||
|
||||
bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t));
|
||||
fp = xopen(fn, "rb");
|
||||
err_fseek(fp, 0, SEEK_END);
|
||||
bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
|
||||
bwt->bwt = (uint32_t*)xcalloc(bwt->bwt_size, 4);
|
||||
err_fseek(fp, 0, SEEK_SET);
|
||||
err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fread_noeof(bwt->bwt, 4, bwt->bwt_size, fp);
|
||||
bwt->seq_len = bwt->L2[4];
|
||||
err_fclose(fp);
|
||||
bwt_gen_cnt_table(bwt);
|
||||
|
||||
return bwt;
|
||||
}
|
||||
|
||||
/* Release a bwt_t together with its suffix-array and BWT buffers.
 * A null pointer is accepted and ignored. */
void bwt_destroy(bwt_t *bwt)
{
	if (bwt != 0) {
		free(bwt->sa);
		free(bwt->bwt);
		free(bwt);
	}
}
|
||||
|
|
|
|||
17
bwt.h
17
bwt.h
|
|
@ -30,8 +30,10 @@
|
|||
|
||||
#include <stdint.h>
|
||||
|
||||
// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line
|
||||
#define OCC_INTERVAL 0x80
|
||||
// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80
|
||||
#define OCC_INTV_SHIFT 7
|
||||
#define OCC_INTERVAL (1LL<<OCC_INTV_SHIFT)
|
||||
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
|
||||
|
||||
#ifndef BWA_UBYTE
|
||||
#define BWA_UBYTE
|
||||
|
|
@ -74,13 +76,6 @@ typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
|
|||
* called bwt_B0 instead of bwt_B */
|
||||
#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
|
||||
|
||||
// inverse Psi function
|
||||
#define bwt_invPsi(bwt, k) \
|
||||
(((k) == (bwt)->primary)? 0 : \
|
||||
((k) < (bwt)->primary)? \
|
||||
(bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \
|
||||
: (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1)))
|
||||
|
||||
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
@ -121,7 +116,9 @@ extern "C" {
|
|||
* Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
|
||||
* Return the end of the longest exact match starting from _x_.
|
||||
*/
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
|
||||
// SMEM iterator interface
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1449,7 +1449,7 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB
|
|||
}
|
||||
|
||||
err_fseek(packedFile, -1, SEEK_END);
|
||||
packedFileLen = ftell(packedFile);
|
||||
packedFileLen = err_ftell(packedFile);
|
||||
err_fread_noeof(&lastByteLength, sizeof(unsigned char), 1, packedFile);
|
||||
totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength);
|
||||
|
||||
|
|
|
|||
33
bwtaln.c
33
bwtaln.c
|
|
@ -11,6 +11,7 @@
|
|||
#include "bwtaln.h"
|
||||
#include "bwtgap.h"
|
||||
#include "utils.h"
|
||||
#include "bwa.h"
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
#include <pthread.h>
|
||||
|
|
@ -219,32 +220,6 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
|||
bwa_seq_close(ks);
|
||||
}
|
||||
|
||||
/* Work out the index prefix to use for hint by probing for index files.
 *
 * If "<hint>.64.bwt" can be opened the result is "<hint>.64"; otherwise, if
 * "<hint>.bwt" can be opened the result is "<hint>"; otherwise 0.
 * A non-null result is heap-allocated and must be freed by the caller. */
char *bwa_infer_prefix(const char *hint)
{
	const int l_hint = strlen(hint);
	char *path = xmalloc(l_hint + 3 + 4 + 1); // room for hint + ".64" + ".bwt" + NUL
	FILE *probe;

	strcpy(path, hint);

	// first choice: the ".64" flavour of the index
	strcpy(path + l_hint, ".64.bwt");
	probe = fopen(path, "rb");
	if (probe != 0) {
		fclose(probe);
		path[l_hint + 3] = 0; // cut back to "<hint>.64"
		return path;
	}

	// fall back to the plain index
	strcpy(path + l_hint, ".bwt");
	probe = fopen(path, "rb");
	if (probe == 0) {
		free(path);
		return 0;
	}
	fclose(probe);
	path[l_hint] = 0; // cut back to "<hint>"
	return path;
}
|
||||
|
||||
int bwa_aln(int argc, char *argv[])
|
||||
{
|
||||
int c, opte = -1;
|
||||
|
|
@ -252,7 +227,7 @@ int bwa_aln(int argc, char *argv[])
|
|||
char *prefix;
|
||||
|
||||
opt = gap_init_opt();
|
||||
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
|
||||
|
|
@ -272,7 +247,6 @@ int bwa_aln(int argc, char *argv[])
|
|||
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
|
||||
case 'R': opt->max_top2 = atoi(optarg); break;
|
||||
case 'q': opt->trim_qual = atoi(optarg); break;
|
||||
case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
|
||||
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
|
||||
case 'f': xreopen(optarg, "wb", stdout); break;
|
||||
case 'b': opt->mode |= BWA_MODE_BAM; break;
|
||||
|
|
@ -310,7 +284,6 @@ int bwa_aln(int argc, char *argv[])
|
|||
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
|
||||
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
|
||||
fprintf(stderr, " -B INT length of barcode\n");
|
||||
// fprintf(stderr, " -c input sequences are in the color space\n");
|
||||
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
|
||||
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
|
||||
fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
|
||||
|
|
@ -330,7 +303,7 @@ int bwa_aln(int argc, char *argv[])
|
|||
k = l;
|
||||
}
|
||||
}
|
||||
if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
|
||||
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
|
||||
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
|
||||
free(opt);
|
||||
return 0;
|
||||
|
|
|
|||
1
bwtaln.h
1
bwtaln.h
|
|
@ -107,7 +107,6 @@ typedef struct {
|
|||
} gap_opt_t;
|
||||
|
||||
#define BWA_PET_STD 1
|
||||
#define BWA_PET_SOLID 2
|
||||
|
||||
typedef struct {
|
||||
int max_isize, force_isize;
|
||||
|
|
|
|||
173
bwtindex.c
173
bwtindex.c
|
|
@ -36,17 +36,160 @@
|
|||
#include "main.h"
|
||||
#include "utils.h"
|
||||
|
||||
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
|
||||
void bwa_pac_rev_core(const char *fn, const char *fn_rev);
|
||||
#ifdef _DIVBWT
|
||||
#include "divsufsort.h"
|
||||
#endif
|
||||
|
||||
int bwa_index(int argc, char *argv[])
|
||||
int is_bwt(ubyte_t *T, int n);
|
||||
|
||||
int64_t bwa_seq_len(const char *fn_pac)
|
||||
{
|
||||
FILE *fp;
|
||||
int64_t pac_len;
|
||||
ubyte_t c;
|
||||
fp = xopen(fn_pac, "rb");
|
||||
err_fseek(fp, -1, SEEK_END);
|
||||
pac_len = err_ftell(fp);
|
||||
err_fread_noeof(&c, 1, 1, fp);
|
||||
err_fclose(fp);
|
||||
return (pac_len - 1) * 4 + (int)c;
|
||||
}
|
||||
|
||||
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
||||
{
|
||||
bwt_t *bwt;
|
||||
ubyte_t *buf, *buf2;
|
||||
int i, pac_size;
|
||||
FILE *fp;
|
||||
|
||||
// initialization
|
||||
bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t));
|
||||
bwt->seq_len = bwa_seq_len(fn_pac);
|
||||
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
|
||||
fp = xopen(fn_pac, "rb");
|
||||
|
||||
// prepare sequence
|
||||
pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
|
||||
buf2 = (ubyte_t*)xcalloc(pac_size, 1);
|
||||
err_fread_noeof(buf2, 1, pac_size, fp);
|
||||
err_fclose(fp);
|
||||
memset(bwt->L2, 0, 5 * 4);
|
||||
buf = (ubyte_t*)xcalloc(bwt->seq_len + 1, 1);
|
||||
for (i = 0; i < bwt->seq_len; ++i) {
|
||||
buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
|
||||
++bwt->L2[1+buf[i]];
|
||||
}
|
||||
for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
|
||||
free(buf2);
|
||||
|
||||
// Burrows-Wheeler Transform
|
||||
if (use_is) {
|
||||
bwt->primary = is_bwt(buf, bwt->seq_len);
|
||||
} else {
|
||||
#ifdef _DIVBWT
|
||||
bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
|
||||
#else
|
||||
err_fatal_simple("libdivsufsort is not compiled in.");
|
||||
#endif
|
||||
}
|
||||
bwt->bwt = (u_int32_t*)xcalloc(bwt->bwt_size, 4);
|
||||
for (i = 0; i < bwt->seq_len; ++i)
|
||||
bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
|
||||
free(buf);
|
||||
return bwt;
|
||||
}
|
||||
|
||||
int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required!
|
||||
{
|
||||
bwt_t *bwt;
|
||||
int c, use_is = 1;
|
||||
while ((c = getopt(argc, argv, "d")) >= 0) {
|
||||
switch (c) {
|
||||
case 'd': use_is = 0; break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_pac2bwt(argv[optind], use_is);
|
||||
bwt_dump_bwt(argv[optind+1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
||||
|
||||
/*
 * Re-pack bwt->bwt so that running counts of the four symbols are stored
 * inline: a block of four bwtint_t counters is written before every
 * OCC_INTERVAL symbols and one more block after the last symbol.
 * bwt->bwt_size is enlarged to match and the old array is freed.
 */
void bwt_bwtupdate_core(bwt_t *bwt)
{
	bwtint_t pos, w = 0, n_blocks;
	bwtint_t occ[4] = { 0, 0, 0, 0 }; // symbols seen so far, one counter per symbol
	uint32_t *merged;                 // will be the new bwt

	n_blocks = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
	// four counters of sizeof(bwtint_t) bytes occupy sizeof(bwtint_t) 32-bit words
	bwt->bwt_size += n_blocks * sizeof(bwtint_t);
	merged = (uint32_t*)xcalloc(bwt->bwt_size, 4);
	for (pos = 0; pos < bwt->seq_len; ++pos) {
		if (pos % OCC_INTERVAL == 0) { // a counter block goes in front of this symbol
			memcpy(merged + w, occ, sizeof(bwtint_t) * 4);
			w += sizeof(bwtint_t);
		}
		if (pos % 16 == 0) merged[w++] = bwt->bwt[pos/16]; // one word holds 16 two-bit symbols
		++occ[bwt_B00(bwt, pos)];
	}
	// trailing counter block
	memcpy(merged + w, occ, sizeof(bwtint_t) * 4);
	xassert(w + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
	// swap the new array in
	free(bwt->bwt);
	bwt->bwt = merged;
}
|
||||
|
||||
int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
|
||||
{
|
||||
bwt_t *bwt;
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[1]);
|
||||
bwt_bwtupdate_core(bwt);
|
||||
bwt_dump_bwt(argv[1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
|
||||
{
|
||||
bwt_t *bwt;
|
||||
int c, sa_intv = 32;
|
||||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'i': sa_intv = atoi(optarg); break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
bwt_cal_sa(bwt, sa_intv);
|
||||
bwt_dump_sa(argv[optind+1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_index(int argc, char *argv[]) // the "index" command
|
||||
{
|
||||
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
|
||||
|
||||
char *prefix = 0, *str, *str2, *str3;
|
||||
int c, algo_type = 0, is_color = 0, is_64 = 0;
|
||||
int c, algo_type = 0, is_64 = 0;
|
||||
clock_t t;
|
||||
int64_t l_pac;
|
||||
|
||||
while ((c = getopt(argc, argv, "6ca:p:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "6a:p:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'a': // if -a is not set, algo_type will be determined later
|
||||
if (strcmp(optarg, "div") == 0) algo_type = 1;
|
||||
|
|
@ -55,7 +198,6 @@ int bwa_index(int argc, char *argv[])
|
|||
else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
|
||||
break;
|
||||
case 'p': prefix = xstrdup(optarg); break;
|
||||
case 'c': is_color = 1; break;
|
||||
case '6': is_64 = 1; break;
|
||||
default: return 1;
|
||||
}
|
||||
|
|
@ -67,7 +209,6 @@ int bwa_index(int argc, char *argv[])
|
|||
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n");
|
||||
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
|
||||
fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
|
||||
// fprintf(stderr, " -c build color-space index\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
|
||||
fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n");
|
||||
|
|
@ -83,29 +224,13 @@ int bwa_index(int argc, char *argv[])
|
|||
str2 = (char*)xcalloc(strlen(prefix) + 10, 1);
|
||||
str3 = (char*)xcalloc(strlen(prefix) + 10, 1);
|
||||
|
||||
if (is_color == 0) { // nucleotide indexing
|
||||
{ // nucleotide indexing
|
||||
gzFile fp = xzopen(argv[optind], "r");
|
||||
t = clock();
|
||||
fprintf(stderr, "[bwa_index] Pack FASTA... ");
|
||||
l_pac = bns_fasta2bntseq(fp, prefix, 0);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
err_gzclose(fp);
|
||||
} else { // color indexing
|
||||
gzFile fp = xzopen(argv[optind], "r");
|
||||
strcat(strcpy(str, prefix), ".nt");
|
||||
t = clock();
|
||||
fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
|
||||
l_pac = bns_fasta2bntseq(fp, str, 0);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
err_gzclose(fp);
|
||||
{
|
||||
char *tmp_argv[3];
|
||||
tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
|
||||
t = clock();
|
||||
fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
|
||||
bwa_pac2cspac(3, tmp_argv);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
}
|
||||
}
|
||||
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
|
||||
{
|
||||
|
|
|
|||
79
bwtio.c
79
bwtio.c
|
|
@ -1,79 +0,0 @@
|
|||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "bwt.h"
|
||||
#include "utils.h"
|
||||
|
||||
void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
|
||||
{
|
||||
FILE *fp = NULL;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
void bwt_dump_sa(const char *fn, const bwt_t *bwt)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
/*
 * Load the sampled suffix array stored in file fn into bwt.
 *
 * The header of the file (primary, four skipped values, sa_intv, seq_len) is
 * checked against the already loaded BWT with xassert(); bwt->sa is then
 * allocated with n_sa entries, sa[0] is set to -1 and sa[1..n_sa-1] are read
 * from the file.
 */
void bwt_restore_sa(const char *fn, bwt_t *bwt)
{
	char skipped[256];
	FILE *fp;
	bwtint_t primary;

	fp = xopen(fn, "rb");
	err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
	xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
	err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
	err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
	// sa_intv is used as a divisor below; reject a zero or negative value from a damaged file
	// NOTE(review): relies on xassert() not returning on failure - confirm in utils.h
	xassert(bwt->sa_intv > 0, "SA file is corrupt: the sampling interval is not positive.");
	err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); // 'primary' is reused to hold seq_len
	xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");

	bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
	bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t));
	bwt->sa[0] = -1;

	err_fread_noeof(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
	err_fclose(fp);
}
|
||||
|
||||
bwt_t *bwt_restore_bwt(const char *fn)
|
||||
{
|
||||
bwt_t *bwt;
|
||||
FILE *fp;
|
||||
|
||||
bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t));
|
||||
fp = xopen(fn, "rb");
|
||||
err_fseek(fp, 0, SEEK_END);
|
||||
bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
|
||||
bwt->bwt = (uint32_t*)xcalloc(bwt->bwt_size, 4);
|
||||
err_fseek(fp, 0, SEEK_SET);
|
||||
err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fread_noeof(bwt->bwt, 4, bwt->bwt_size, fp);
|
||||
bwt->seq_len = bwt->L2[4];
|
||||
err_fclose(fp);
|
||||
bwt_gen_cnt_table(bwt);
|
||||
|
||||
return bwt;
|
||||
}
|
||||
|
||||
/* Free bwt together with its sa and bwt arrays; a null pointer is ignored. */
void bwt_destroy(bwt_t *bwt)
{
	if (bwt != NULL) {
		free(bwt->sa);
		free(bwt->bwt);
		free(bwt);
	}
}
|
||||
231
bwtmisc.c
231
bwtmisc.c
|
|
@ -1,231 +0,0 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008 Genome Research Ltd (GRL).
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include "bntseq.h"
|
||||
#include "utils.h"
|
||||
#include "main.h"
|
||||
#include "bwt.h"
|
||||
|
||||
#ifdef _DIVBWT
|
||||
#include "divsufsort.h"
|
||||
#endif
|
||||
|
||||
int is_bwt(ubyte_t *T, int n);
|
||||
|
||||
int64_t bwa_seq_len(const char *fn_pac)
|
||||
{
|
||||
FILE *fp;
|
||||
int64_t pac_len;
|
||||
ubyte_t c;
|
||||
fp = xopen(fn_pac, "rb");
|
||||
err_fseek(fp, -1, SEEK_END);
|
||||
pac_len = err_ftell(fp);
|
||||
err_fread_noeof(&c, 1, 1, fp);
|
||||
err_fclose(fp);
|
||||
return (pac_len - 1) * 4 + (int)c;
|
||||
}
|
||||
|
||||
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
||||
{
|
||||
bwt_t *bwt;
|
||||
ubyte_t *buf, *buf2;
|
||||
int i, pac_size;
|
||||
FILE *fp;
|
||||
|
||||
// initialization
|
||||
bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t));
|
||||
bwt->seq_len = bwa_seq_len(fn_pac);
|
||||
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
|
||||
fp = xopen(fn_pac, "rb");
|
||||
|
||||
// prepare sequence
|
||||
pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
|
||||
buf2 = (ubyte_t*)xcalloc(pac_size, 1);
|
||||
err_fread_noeof(buf2, 1, pac_size, fp);
|
||||
err_fclose(fp);
|
||||
memset(bwt->L2, 0, 5 * 4);
|
||||
buf = (ubyte_t*)xcalloc(bwt->seq_len + 1, 1);
|
||||
for (i = 0; i < bwt->seq_len; ++i) {
|
||||
buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
|
||||
++bwt->L2[1+buf[i]];
|
||||
}
|
||||
for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
|
||||
free(buf2);
|
||||
|
||||
// Burrows-Wheeler Transform
|
||||
if (use_is) {
|
||||
bwt->primary = is_bwt(buf, bwt->seq_len);
|
||||
} else {
|
||||
#ifdef _DIVBWT
|
||||
bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
|
||||
#else
|
||||
err_fatal_simple("libdivsufsort is not compiled in.");
|
||||
#endif
|
||||
}
|
||||
bwt->bwt = (u_int32_t*)xcalloc(bwt->bwt_size, 4);
|
||||
for (i = 0; i < bwt->seq_len; ++i)
|
||||
bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
|
||||
free(buf);
|
||||
return bwt;
|
||||
}
|
||||
|
||||
int bwa_pac2bwt(int argc, char *argv[])
|
||||
{
|
||||
bwt_t *bwt;
|
||||
int c, use_is = 1;
|
||||
while ((c = getopt(argc, argv, "d")) >= 0) {
|
||||
switch (c) {
|
||||
case 'd': use_is = 0; break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_pac2bwt(argv[optind], use_is);
|
||||
bwt_dump_bwt(argv[optind+1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
||||
|
||||
/*
 * Re-pack bwt->bwt so that running counts of the four symbols are stored
 * inline: a block of four bwtint_t counters is written before every
 * OCC_INTERVAL symbols and one more block after the last symbol.
 * bwt->bwt_size is enlarged to match and the old array is freed.
 */
void bwt_bwtupdate_core(bwt_t *bwt)
{
	bwtint_t pos, w = 0, n_blocks;
	bwtint_t occ[4] = { 0, 0, 0, 0 }; // symbols seen so far, one counter per symbol
	uint32_t *merged;                 // will be the new bwt

	n_blocks = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
	// four counters of sizeof(bwtint_t) bytes occupy sizeof(bwtint_t) 32-bit words
	bwt->bwt_size += n_blocks * sizeof(bwtint_t);
	merged = (uint32_t*)xcalloc(bwt->bwt_size, 4);
	for (pos = 0; pos < bwt->seq_len; ++pos) {
		if (pos % OCC_INTERVAL == 0) { // a counter block goes in front of this symbol
			memcpy(merged + w, occ, sizeof(bwtint_t) * 4);
			w += sizeof(bwtint_t);
		}
		if (pos % 16 == 0) merged[w++] = bwt->bwt[pos/16]; // one word holds 16 two-bit symbols
		++occ[bwt_B00(bwt, pos)];
	}
	// trailing counter block
	memcpy(merged + w, occ, sizeof(bwtint_t) * 4);
	xassert(w + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
	// swap the new array in
	free(bwt->bwt);
	bwt->bwt = merged;
}
|
||||
|
||||
int bwa_bwtupdate(int argc, char *argv[])
|
||||
{
|
||||
bwt_t *bwt;
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[1]);
|
||||
bwt_bwtupdate_core(bwt);
|
||||
bwt_dump_bwt(argv[1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
|
||||
|
||||
/* this function is not memory efficient, but this will make life easier
|
||||
Ideally we should also change .amb files as one 'N' in the nucleotide
|
||||
sequence leads to two ambiguous colors. I may do this later... */
|
||||
uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
|
||||
{
|
||||
uint8_t *pac, *cspac;
|
||||
bwtint_t i;
|
||||
int c1, c2;
|
||||
pac = (uint8_t*)xcalloc(bns->l_pac/4 + 1, 1);
|
||||
cspac = (uint8_t*)xcalloc(bns->l_pac/4 + 1, 1);
|
||||
err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
|
||||
err_rewind(bns->fp_pac);
|
||||
c1 = pac[0]>>6; cspac[0] = c1<<6;
|
||||
for (i = 1; i < bns->l_pac; ++i) {
|
||||
c2 = pac[i>>2] >> (~i&3)*2 & 3;
|
||||
cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2;
|
||||
c1 = c2;
|
||||
}
|
||||
free(pac);
|
||||
return cspac;
|
||||
}
|
||||
|
||||
int bwa_pac2cspac(int argc, char *argv[])
|
||||
{
|
||||
bntseq_t *bns;
|
||||
uint8_t *cspac, ct;
|
||||
char *str;
|
||||
FILE *fp;
|
||||
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
|
||||
return 1;
|
||||
}
|
||||
bns = bns_restore(argv[1]);
|
||||
cspac = bwa_pac2cspac_core(bns);
|
||||
bns_dump(bns, argv[2]);
|
||||
// now write cspac
|
||||
str = (char*)xcalloc(strlen(argv[2]) + 5, 1);
|
||||
strcat(strcpy(str, argv[2]), ".pac");
|
||||
fp = xopen(str, "wb");
|
||||
err_fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
|
||||
ct = bns->l_pac % 4;
|
||||
err_fwrite(&ct, 1, 1, fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
bns_destroy(bns);
|
||||
free(cspac);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_bwt2sa(int argc, char *argv[])
|
||||
{
|
||||
bwt_t *bwt;
|
||||
int c, sa_intv = 32;
|
||||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'i': sa_intv = atoi(optarg); break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
bwt_cal_sa(bwt, sa_intv);
|
||||
bwt_dump_sa(argv[optind+1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
64
bwtsw2_aux.c
64
bwtsw2_aux.c
|
|
@ -13,9 +13,10 @@
|
|||
#include "bwtsw2.h"
|
||||
#include "stdaln.h"
|
||||
#include "kstring.h"
|
||||
#include "bwa.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT(gzFile, err_gzread)
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
#include "ksort.h"
|
||||
#define __left_lt(a, b) ((a).end > (b).end)
|
||||
|
|
@ -186,14 +187,14 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8
|
|||
bsw2aux_t *q = b->aux + i;
|
||||
uint8_t *query;
|
||||
bwtint_t k;
|
||||
int score, path_len, beg, end;
|
||||
int path_len, beg, end;
|
||||
if (p->l) continue;
|
||||
beg = (p->flag & 0x10)? lq - p->end : p->beg;
|
||||
end = (p->flag & 0x10)? lq - p->beg : p->end;
|
||||
query = seq[(p->flag & 0x10)? 1 : 0] + beg;
|
||||
for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here
|
||||
target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3;
|
||||
score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
|
||||
aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
|
||||
q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar);
|
||||
#if 0
|
||||
if (name && score != p->G) { // debugging only
|
||||
|
|
@ -747,7 +748,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
|
|||
// print and reset
|
||||
for (i = 0; i < _seq->n; ++i) {
|
||||
bsw2seq1_t *p = _seq->seq + i;
|
||||
if (p->sam) printf("%s", p->sam);
|
||||
if (p->sam) err_printf("%s", p->sam);
|
||||
free(p->name); free(p->seq); free(p->qual); free(p->sam);
|
||||
p->tid = -1; p->l = 0;
|
||||
p->name = p->seq = p->qual = p->sam = 0;
|
||||
|
|
@ -756,28 +757,18 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
|
|||
_seq->n = 0;
|
||||
}
|
||||
|
||||
/*
 * Fill *p from the record in ks: name and sequence are duplicated with
 * xstrdup(), quality and comment are duplicated only when non-empty (and set
 * to 0 otherwise), tid is reset to -1 and sam to 0.
 */
static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p)
{
	p->tid = -1;
	p->sam = 0;
	p->l = ks->seq.l;
	p->name = xstrdup(ks->name.s);
	p->seq = xstrdup(ks->seq.s);
	if (ks->qual.l) p->qual = xstrdup(ks->qual.s);
	else p->qual = 0;
	if (ks->comment.l) p->comment = xstrdup(ks->comment.s);
	else p->comment = 0;
}
|
||||
|
||||
void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2)
|
||||
{
|
||||
gzFile fp, fp2;
|
||||
kseq_t *ks, *ks2;
|
||||
int l, size = 0, is_pe = 0;
|
||||
int l, is_pe = 0, i, n;
|
||||
uint8_t *pac;
|
||||
bsw2seq_t *_seq;
|
||||
bseq1_t *bseq;
|
||||
|
||||
pac = xcalloc(bns->l_pac/4+1, 1);
|
||||
for (l = 0; l < bns->n_seqs; ++l)
|
||||
printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
|
||||
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
|
||||
err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
|
||||
fp = xzopen(fn, "r");
|
||||
ks = kseq_init(fp);
|
||||
|
|
@ -787,34 +778,25 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
|
|||
ks2 = kseq_init(fp2);
|
||||
is_pe = 1;
|
||||
} else fp2 = 0, ks2 = 0, is_pe = 0;
|
||||
while (kseq_read(ks) >= 0) {
|
||||
if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/')
|
||||
ks->name.l -= 2, ks->name.s[ks->name.l] = 0;
|
||||
if (_seq->n == _seq->max) {
|
||||
_seq->max = _seq->max? _seq->max<<1 : 1024;
|
||||
while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
|
||||
int size = 0;
|
||||
if (n > _seq->max) {
|
||||
_seq->max = n;
|
||||
kroundup32(_seq->max);
|
||||
_seq->seq = xrealloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
|
||||
}
|
||||
kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]);
|
||||
size += ks->seq.l;
|
||||
if (ks2) {
|
||||
if (kseq_read(ks2) >= 0) {
|
||||
if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/')
|
||||
ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0;
|
||||
kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge
|
||||
size += ks->seq.l;
|
||||
} else {
|
||||
fprintf(stderr, "[%s] The second query file has fewer reads. Switched to the single-end mode for the following batches.\n", __func__);
|
||||
is_pe = 0;
|
||||
}
|
||||
}
|
||||
if (size > opt->chunk_size * opt->n_threads) {
|
||||
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
|
||||
process_seqs(_seq, opt, bns, pac, target, is_pe);
|
||||
size = 0;
|
||||
_seq->n = n;
|
||||
for (i = 0; i < n; ++i) {
|
||||
bseq1_t *b = &bseq[i];
|
||||
bsw2seq1_t *p = &_seq->seq[i];
|
||||
p->tid = -1; p->l = b->l_seq;
|
||||
p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0;
|
||||
size += p->l;
|
||||
}
|
||||
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size);
|
||||
free(bseq);
|
||||
process_seqs(_seq, opt, bns, pac, target, is_pe);
|
||||
}
|
||||
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
|
||||
process_seqs(_seq, opt, bns, pac, target, is_pe);
|
||||
// free
|
||||
free(pac);
|
||||
free(_seq->seq); free(_seq);
|
||||
|
|
|
|||
|
|
@ -6,14 +6,12 @@
|
|||
#include "bwt.h"
|
||||
#include "bwtsw2.h"
|
||||
#include "utils.h"
|
||||
#include "bwa.h"
|
||||
|
||||
int bwa_bwtsw2(int argc, char *argv[])
|
||||
{
|
||||
extern char *bwa_infer_prefix(const char *hint);
|
||||
bsw2opt_t *opt;
|
||||
bwt_t *target;
|
||||
char buf[1024], *prefix;
|
||||
bntseq_t *bns;
|
||||
bwaidx_t *idx;
|
||||
int c;
|
||||
|
||||
opt = bsw2_init_opt();
|
||||
|
|
@ -81,19 +79,10 @@ int bwa_bwtsw2(int argc, char *argv[])
|
|||
opt->t *= opt->a;
|
||||
opt->coef *= opt->a;
|
||||
|
||||
if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
|
||||
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
strcpy(buf, prefix); target = bwt_restore_bwt(strcat(buf, ".bwt"));
|
||||
strcpy(buf, prefix); bwt_restore_sa(strcat(buf, ".sa"), target);
|
||||
bns = bns_restore(prefix);
|
||||
|
||||
bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
|
||||
|
||||
bns_destroy(bns);
|
||||
bwt_destroy(target);
|
||||
free(opt); free(prefix);
|
||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 0;
|
||||
bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
|
||||
bwa_idx_destroy(idx);
|
||||
free(opt);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include "bntseq.h"
|
||||
#include "bwtsw2.h"
|
||||
#include "kstring.h"
|
||||
#include "utils.h"
|
||||
#ifndef _NO_SSE2
|
||||
#include "ksw.h"
|
||||
#else
|
||||
|
|
@ -25,7 +26,6 @@ typedef struct {
|
|||
|
||||
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
|
||||
{
|
||||
extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
|
||||
int i, k, x, p25, p50, p75, tmp, max_len = 0;
|
||||
uint64_t *isize;
|
||||
bsw2pestat_t r;
|
||||
|
|
@ -45,7 +45,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
|
|||
max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
|
||||
isize[k++] = l;
|
||||
}
|
||||
ks_introsort_uint64_t(k, isize);
|
||||
ks_introsort_64(k, isize);
|
||||
p25 = isize[(int)(.25 * k + .499)];
|
||||
p50 = isize[(int)(.50 * k + .499)];
|
||||
p75 = isize[(int)(.75 * k + .499)];
|
||||
|
|
@ -75,9 +75,9 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
|
|||
r.low = tmp > max_len? tmp : max_len;
|
||||
if (r.low < 1) r.low = 1;
|
||||
r.high = (int)(p75 + 3. * (p75 - p25) + .499);
|
||||
if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499);
|
||||
if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499);
|
||||
r.low = tmp > max_len? tmp : max_len;
|
||||
if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499);
|
||||
if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499);
|
||||
ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);
|
||||
free(isize);
|
||||
return r;
|
||||
|
|
@ -128,35 +128,24 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
|
|||
seq[i] = nst_nt4_table[(int)mseq[i]];
|
||||
}
|
||||
#ifndef _NO_SSE2
|
||||
{
|
||||
ksw_query_t *q;
|
||||
ksw_aux_t aux[2];
|
||||
// forward Smith-Waterman
|
||||
aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0];
|
||||
q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat);
|
||||
ksw_sse2(q, end - beg, ref, &aux[0]);
|
||||
free(q);
|
||||
if (aux[0].score < opt->t) {
|
||||
free(seq);
|
||||
return;
|
||||
}
|
||||
++aux[0].qe; ++aux[0].te;
|
||||
// reverse Smith-Waterman
|
||||
seq_reverse(aux[0].qe, seq, 0);
|
||||
seq_reverse(aux[0].te, ref, 0);
|
||||
q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat);
|
||||
ksw_sse2(q, aux[0].te, ref, &aux[1]);
|
||||
free(q);
|
||||
++aux[1].qe; ++aux[1].te;
|
||||
// write output
|
||||
a->G = aux[0].score;
|
||||
a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2;
|
||||
{ // FIXME!!! The following block has not been tested since the update of the ksw library
|
||||
int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
|
||||
kswr_t aln;
|
||||
aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
|
||||
a->G = aln.score;
|
||||
a->G2 = aln.score2;
|
||||
if (a->G < opt->t) a->G = 0;
|
||||
if (a->G2 < opt->t) a->G2 = 0;
|
||||
if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
|
||||
a->k = beg + (aux[0].te - aux[1].te);
|
||||
a->len = aux[1].te;
|
||||
a->beg = aux[0].qe - aux[1].qe;
|
||||
a->end = aux[0].qe;
|
||||
a->k = beg + aln.tb;
|
||||
a->len = aln.te - aln.tb + 1;
|
||||
a->beg = aln.qb;
|
||||
a->end = aln.qe + 1;
|
||||
/*
|
||||
printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n');
|
||||
printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n');
|
||||
printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len);
|
||||
*/
|
||||
}
|
||||
#else
|
||||
{
|
||||
|
|
@ -169,6 +158,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
|
|||
a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2);
|
||||
if (a->G < opt->t) a->G = 0;
|
||||
if (a->G2 < opt->t) a->G2 = 0;
|
||||
if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
|
||||
a->k = beg + path[0].i - 1;
|
||||
a->len = path[1].i - path[0].i + 1;
|
||||
a->beg = path[0].j - 1;
|
||||
|
|
|
|||
192
cs2nt.c
192
cs2nt.c
|
|
@ -1,192 +0,0 @@
|
|||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include "bwtaln.h"
|
||||
#include "stdaln.h"
|
||||
#include "utils.h"
|
||||
|
||||
/*
|
||||
Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
|
||||
decode as ATTGAC(RBGOG), there are one color change and one nt change;
|
||||
if we decode as ATTAAC(RBRBG), there are two color changes.
|
||||
|
||||
In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
|
||||
as the penalty; otherwise, we will use color quality as the
|
||||
penalty. This means we always prefer two consistent color changes over
|
||||
a nt change, but if a color has high quality, we may prefer one nt
|
||||
change.
|
||||
|
||||
In the above example, the penalties of the two types of decoding are
|
||||
q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
|
||||
otherwise the second. Note that no matter what we choose, the fourth
|
||||
base will get a low nt quality.
|
||||
*/
|
||||
|
||||
#define COLOR_MM 19
|
||||
#define NUCL_MM 25
|
||||
|
||||
static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };
|
||||
|
||||
/*
|
||||
{A,C,G,T,N} -> {0,1,2,3,4}
|
||||
nt_ref[0..size]: nucleotide reference: 0/1/2/3/4
|
||||
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
|
||||
nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned)
|
||||
btarray[0..4*size+3]: backtrack array (working space)
|
||||
*/
|
||||
void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
|
||||
{
|
||||
int h[8], curr, last;
|
||||
int x, y, xmin, hmin, k;
|
||||
|
||||
// h[0..3] and h[4..7] are the current and last best score array, depending on curr and last
|
||||
|
||||
// recursion: initial value
|
||||
if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2);
|
||||
else {
|
||||
for (x = 0; x != 4; ++x) h[x] = NUCL_MM;
|
||||
h[nt_ref[0]] = 0;
|
||||
}
|
||||
// recursion: main loop
|
||||
curr = 1; last = 0;
|
||||
for (k = 1; k <= size; ++k) {
|
||||
for (x = 0; x != 4; ++x) {
|
||||
int min = 0x7fffffff, ymin = 0;
|
||||
for (y = 0; y != 4; ++y) {
|
||||
int s = h[last<<2|y];
|
||||
if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<<x|1<<y])
|
||||
s += ((cs_read[k-1]&0x3f) < COLOR_MM)? COLOR_MM : (cs_read[k-1]&0x3f); // color mismatch
|
||||
if (nt_ref[k] < 4 && nt_ref[k] != x) s += NUCL_MM; // nt mismatch
|
||||
if (s < min) {
|
||||
min = s; ymin = y;
|
||||
}
|
||||
}
|
||||
h[curr<<2|x] = min; btarray[k<<2|x] = ymin;
|
||||
}
|
||||
last = curr; curr = 1 - curr; // swap
|
||||
}
|
||||
// back trace
|
||||
hmin = 0x7fffffff; xmin = 0;
|
||||
for (x = 0; x != 4; ++x) {
|
||||
if (h[last<<2|x] < hmin) {
|
||||
hmin = h[last<<2|x]; xmin = x;
|
||||
}
|
||||
}
|
||||
nt_read[size] = xmin;
|
||||
for (k = size - 1; k >= 0; --k)
|
||||
nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]];
|
||||
}
|
||||
/*
|
||||
nt_read[0..size]: nucleotide read sequence: 0/1/2/3
|
||||
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
|
||||
tarray[0..size*2-1]: temporary array
|
||||
*/
|
||||
uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
|
||||
{
|
||||
int k, c1, c2;
|
||||
uint8_t *t2array = tarray + size;
|
||||
// get the color sequence of nt_read
|
||||
c1 = nt_read[0];
|
||||
for (k = 1; k <= size; ++k) {
|
||||
c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
|
||||
tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
|
||||
c1 = c2;
|
||||
}
|
||||
for (k = 1; k != size; ++k) {
|
||||
int q = 0;
|
||||
if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
|
||||
q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
|
||||
} else if (tarray[k-1] == cs_read[k-1]>>6) {
|
||||
q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
|
||||
} else if (tarray[k] == cs_read[k]>>6) {
|
||||
q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
|
||||
} // else, q = 0
|
||||
if (q < 0) q = 0;
|
||||
if (q > 60) q = 60;
|
||||
t2array[k] = nt_read[k]<<6 | q;
|
||||
if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
|
||||
}
|
||||
return t2array + 1; // of size-2
|
||||
}
|
||||
|
||||
// this function will be called when p->seq has been reversed by refine_gapped()
|
||||
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac)
|
||||
{
|
||||
uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
|
||||
int i, len;
|
||||
uint8_t *seq;
|
||||
|
||||
// set temporary arrays
|
||||
if (p->type == BWA_TYPE_NO_MATCH) return;
|
||||
len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
|
||||
ta = (uint8_t*)xmalloc(len * 7);
|
||||
nt_ref = ta;
|
||||
cs_read = nt_ref + len;
|
||||
nt_read = cs_read + len;
|
||||
btarray = nt_read + len;
|
||||
tarray = nt_read + len;
|
||||
|
||||
#define __gen_csbase(_cs, _i, _seq) do { \
|
||||
int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \
|
||||
if (q > 60) q = 60; \
|
||||
if (_seq[_i] > 3) q = 63; \
|
||||
(_cs) = _seq[_i]<<6 | q; \
|
||||
} while (0)
|
||||
|
||||
// generate len, nt_ref[] and cs_read
|
||||
seq = p->strand? p->rseq : p->seq;
|
||||
nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4;
|
||||
if (p->cigar == 0) { // no gap or clipping
|
||||
len = p->len;
|
||||
for (i = 0; i < p->len; ++i) {
|
||||
__gen_csbase(cs_read[i], i, seq);
|
||||
nt_ref[i+1] = bns_pac(pac, p->pos + i);
|
||||
}
|
||||
} else {
|
||||
int k, z;
|
||||
bwtint_t x, y;
|
||||
x = p->pos; y = 0;
|
||||
for (k = z = 0; k < p->n_cigar; ++k) {
|
||||
int l = __cigar_len(p->cigar[k]);
|
||||
if (__cigar_op(p->cigar[k]) == FROM_M) {
|
||||
for (i = 0; i < l; ++i, ++x, ++y) {
|
||||
__gen_csbase(cs_read[z], y, seq);
|
||||
nt_ref[z+1] = bns_pac(pac, x);
|
||||
++z;
|
||||
}
|
||||
} else if (__cigar_op(p->cigar[k]) == FROM_I) {
|
||||
for (i = 0; i < l; ++i, ++y) {
|
||||
__gen_csbase(cs_read[z], y, seq);
|
||||
nt_ref[z+1] = 4;
|
||||
++z;
|
||||
}
|
||||
} else if (__cigar_op(p->cigar[k]) == FROM_S) y += l;
|
||||
else x += l;
|
||||
}
|
||||
len = z;
|
||||
}
|
||||
|
||||
cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
|
||||
new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
|
||||
|
||||
// update p
|
||||
p->len = p->full_len = len - 1;
|
||||
for (i = 0; i < p->len; ++i) {
|
||||
if ((new_nt_read[i]&0x3f) == 63) {
|
||||
p->qual[i] = 33; seq[i] = 4;
|
||||
} else {
|
||||
p->qual[i] = (new_nt_read[i]&0x3f) + 33;
|
||||
seq[i] = new_nt_read[i]>>6;
|
||||
}
|
||||
}
|
||||
p->qual[p->len] = seq[p->len] = 0;
|
||||
if (p->strand) {
|
||||
memcpy(p->seq, seq, p->len);
|
||||
seq_reverse(p->len, p->seq, 1);
|
||||
seq_reverse(p->len, p->qual, 0);
|
||||
} else {
|
||||
memcpy(p->rseq, seq, p->len);
|
||||
seq_reverse(p->len, p->rseq, 1);
|
||||
}
|
||||
free(ta);
|
||||
}
|
||||
194
fastmap.c
194
fastmap.c
|
|
@ -2,115 +2,174 @@
|
|||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwt.h"
|
||||
#include "bwa.h"
|
||||
#include "bwamem.h"
|
||||
#include "kvec.h"
|
||||
#include "utils.h"
|
||||
#include "kseq.h"
|
||||
#include "utils.h"
|
||||
KSEQ_INIT(gzFile, err_gzread)
|
||||
KSEQ_DECLARE(gzFile)
|
||||
|
||||
extern unsigned char nst_nt4_table[256];
|
||||
|
||||
typedef struct {
|
||||
const bwt_t *bwt;
|
||||
const uint8_t *query;
|
||||
int start, len;
|
||||
bwtintv_v *tmpvec[2], *matches;
|
||||
} smem_i;
|
||||
void *kopen(const char *fn, int *_fd);
|
||||
int kclose(void *a);
|
||||
|
||||
smem_i *smem_iter_init(const bwt_t *bwt)
|
||||
int main_mem(int argc, char *argv[])
|
||||
{
|
||||
smem_i *iter;
|
||||
iter = xcalloc(1, sizeof(smem_i));
|
||||
iter->bwt = bwt;
|
||||
iter->tmpvec[0] = xcalloc(1, sizeof(bwtintv_v));
|
||||
iter->tmpvec[1] = xcalloc(1, sizeof(bwtintv_v));
|
||||
iter->matches = xcalloc(1, sizeof(bwtintv_v));
|
||||
return iter;
|
||||
}
|
||||
mem_opt_t *opt;
|
||||
int fd, fd2, i, c, n, copy_comment = 0;
|
||||
gzFile fp, fp2 = 0;
|
||||
kseq_t *ks, *ks2 = 0;
|
||||
bseq1_t *seqs;
|
||||
bwaidx_t *idx;
|
||||
char *rg_line = 0;
|
||||
void *ko = 0, *ko2 = 0;
|
||||
|
||||
void smem_iter_destroy(smem_i *iter)
|
||||
{
|
||||
free(iter->tmpvec[0]->a);
|
||||
free(iter->tmpvec[1]->a);
|
||||
free(iter->matches->a);
|
||||
free(iter);
|
||||
}
|
||||
opt = mem_opt_init();
|
||||
while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
||||
else if (c == 'w') opt->w = atoi(optarg);
|
||||
else if (c == 'A') opt->a = atoi(optarg);
|
||||
else if (c == 'B') opt->b = atoi(optarg);
|
||||
else if (c == 'O') opt->q = atoi(optarg);
|
||||
else if (c == 'E') opt->r = atoi(optarg);
|
||||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
||||
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
||||
else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
|
||||
else if (c == 'a') opt->flag |= MEM_F_ALL;
|
||||
else if (c == 'p') opt->flag |= MEM_F_PE;
|
||||
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
|
||||
else if (c == 'c') opt->max_occ = atoi(optarg);
|
||||
else if (c == 'v') bwa_verbose = atoi(optarg);
|
||||
else if (c == 'r') opt->split_factor = atof(optarg);
|
||||
else if (c == 'C') copy_comment = 1;
|
||||
else if (c == 'R') {
|
||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
|
||||
} else if (c == 's') opt->split_width = atoi(optarg);
|
||||
}
|
||||
if (opt->n_threads < 1) opt->n_threads = 1;
|
||||
if (optind + 1 >= argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
|
||||
fprintf(stderr, "Algorithm options:\n\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
|
||||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
fprintf(stderr, " -P skip pairing; perform mate SW only\n");
|
||||
fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
|
||||
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
|
||||
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
|
||||
fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
|
||||
fprintf(stderr, "\nInput/output options:\n\n");
|
||||
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
|
||||
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
|
||||
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
|
||||
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
|
||||
fprintf(stderr, " -H hard clipping\n");
|
||||
fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
|
||||
fprintf(stderr, "\n");
|
||||
free(opt);
|
||||
return 1;
|
||||
}
|
||||
|
||||
void smem_set_query(smem_i *iter, int len, const uint8_t *query)
|
||||
{
|
||||
iter->query = query;
|
||||
iter->start = 0;
|
||||
iter->len = len;
|
||||
}
|
||||
mem_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
||||
bwa_print_sam_hdr(idx->bns, rg_line);
|
||||
|
||||
int smem_next(smem_i *iter)
|
||||
{
|
||||
iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0;
|
||||
if (iter->start >= iter->len || iter->start < 0) return -1;
|
||||
while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases
|
||||
if (iter->start == iter->len) return -1;
|
||||
iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec);
|
||||
return iter->start;
|
||||
ko = kopen(argv[optind + 1], &fd);
|
||||
fp = gzdopen(fd, "r");
|
||||
ks = kseq_init(fp);
|
||||
if (optind + 2 < argc) {
|
||||
if (opt->flag&MEM_F_PE) {
|
||||
if (bwa_verbose >= 2)
|
||||
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
|
||||
} else {
|
||||
ko2 = kopen(argv[optind + 2], &fd2);
|
||||
fp2 = gzdopen(fd2, "r");
|
||||
ks2 = kseq_init(fp2);
|
||||
opt->flag |= MEM_F_PE;
|
||||
}
|
||||
}
|
||||
while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
|
||||
int64_t size = 0;
|
||||
if (!copy_comment)
|
||||
for (i = 0; i < n; ++i) {
|
||||
free(seqs[i].comment); seqs[i].comment = 0;
|
||||
}
|
||||
for (i = 0; i < n; ++i) size += seqs[i].l_seq;
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
|
||||
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs);
|
||||
free(seqs);
|
||||
}
|
||||
|
||||
free(opt);
|
||||
bwa_idx_destroy(idx);
|
||||
kseq_destroy(ks);
|
||||
err_gzclose(fp); kclose(ko);
|
||||
if (ks2) {
|
||||
kseq_destroy(ks2);
|
||||
err_gzclose(fp2); kclose(ko2);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main_fastmap(int argc, char *argv[])
|
||||
{
|
||||
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
|
||||
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0;
|
||||
kseq_t *seq;
|
||||
bwtint_t k;
|
||||
gzFile fp;
|
||||
bwt_t *bwt;
|
||||
bntseq_t *bns;
|
||||
smem_i *iter;
|
||||
smem_i *itr;
|
||||
const bwtintv_v *a;
|
||||
bwaidx_t *idx;
|
||||
|
||||
while ((c = getopt(argc, argv, "w:l:s")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) {
|
||||
switch (c) {
|
||||
case 's': print_seq = 1; break;
|
||||
case 's': split_width = atoi(optarg); break;
|
||||
case 'p': print_seq = 1; break;
|
||||
case 'w': min_iwidth = atoi(optarg); break;
|
||||
case 'l': min_len = atoi(optarg); break;
|
||||
}
|
||||
}
|
||||
if (optind + 1 >= argc) {
|
||||
fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
|
||||
fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", split_width, min_len, min_iwidth);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fp = xzopen(argv[optind + 1], "r");
|
||||
seq = kseq_init(fp);
|
||||
{ // load the packed sequences, BWT and SA
|
||||
char *tmp = xcalloc(strlen(argv[optind]) + 5, 1);
|
||||
strcat(strcpy(tmp, argv[optind]), ".bwt");
|
||||
bwt = bwt_restore_bwt(tmp);
|
||||
strcat(strcpy(tmp, argv[optind]), ".sa");
|
||||
bwt_restore_sa(tmp, bwt);
|
||||
free(tmp);
|
||||
bns = bns_restore(argv[optind]);
|
||||
}
|
||||
iter = smem_iter_init(bwt);
|
||||
idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS);
|
||||
itr = smem_itr_init(idx->bwt);
|
||||
while (kseq_read(seq) >= 0) {
|
||||
printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
|
||||
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
|
||||
if (print_seq) {
|
||||
err_putchar('\t');
|
||||
err_puts(seq->seq.s);
|
||||
} else err_putchar('\n');
|
||||
for (i = 0; i < seq->seq.l; ++i)
|
||||
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
||||
smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s);
|
||||
while (smem_next(iter) > 0) {
|
||||
for (i = 0; i < iter->matches->n; ++i) {
|
||||
bwtintv_t *p = &iter->matches->a[i];
|
||||
smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
|
||||
while ((a = smem_next(itr, min_len<<1, split_width)) != 0) {
|
||||
for (i = 0; i < a->n; ++i) {
|
||||
bwtintv_t *p = &a->a[i];
|
||||
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
|
||||
printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
|
||||
err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
|
||||
if (p->x[2] <= min_iwidth) {
|
||||
for (k = 0; k < p->x[2]; ++k) {
|
||||
bwtint_t pos;
|
||||
int len, is_rev, ref_id;
|
||||
len = (uint32_t)p->info - (p->info>>32);
|
||||
pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
|
||||
pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
|
||||
if (is_rev) pos -= len - 1;
|
||||
bns_cnt_ambi(bns, pos, len, &ref_id);
|
||||
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
|
||||
bns_cnt_ambi(idx->bns, pos, len, &ref_id);
|
||||
err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
|
||||
}
|
||||
} else err_puts("\t*");
|
||||
err_putchar('\n');
|
||||
|
|
@ -119,9 +178,8 @@ int main_fastmap(int argc, char *argv[])
|
|||
err_puts("//");
|
||||
}
|
||||
|
||||
smem_iter_destroy(iter);
|
||||
bns_destroy(bns);
|
||||
bwt_destroy(bwt);
|
||||
smem_itr_destroy(itr);
|
||||
bwa_idx_destroy(idx);
|
||||
kseq_destroy(seq);
|
||||
err_gzclose(fp);
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,385 @@
|
|||
/*-
|
||||
* Copyright 1997-1999, 2001, John-Mark Gurney.
|
||||
* 2008-2009, Attractive Chaos <attractor@live.co.uk>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef __AC_KBTREE_H
|
||||
#define __AC_KBTREE_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "utils.h"
|
||||
|
||||
/*
 * Header at the start of every node allocation. The key array follows it,
 * and internal nodes additionally carry a child-pointer array.
 */
typedef struct {
	int32_t is_internal:1, n:31; /* is_internal: non-zero when the node has children; n: number of keys stored */
} kbnode_t;

/* Key array of node x, viewed as type*: starts 4 bytes into the node (assumes sizeof(kbnode_t) == 4). */
#define	__KB_KEY(type, x)	((type*)((char*)(x) + 4))
/* Child-pointer array of node x: starts (btr)->off_ptr bytes into the node. */
#define __KB_PTR(btr, x)	((kbnode_t**)((char*)(x) + (btr)->off_ptr))

/*
 * Declares kbtree_<name>_t.
 *   root             root node
 *   off_ptr          byte offset of the child-pointer array inside a node
 *   ilen, elen       allocation size of an internal / external (leaf) node
 *   t, n             minimum degree and maximum key count per node (n = 2t-1)
 *   n_keys, n_nodes  running counts of stored keys and allocated nodes
 *   off_key          not assigned by the code in this header
 */
#define __KB_TREE_T(name)						\
	typedef struct {							\
		kbnode_t *root;							\
		int	off_key, off_ptr, ilen, elen;		\
		int	n, t;								\
		int	n_keys, n_nodes;					\
	} kbtree_##name##_t;
|
||||
|
||||
/*
 * Defines kb_init_<name>(size): allocate an empty tree whose nodes are sized
 * for roughly `size` bytes. Returns 0 when `size` is too small to hold a node
 * of minimum degree 2 (including negative or tiny values, which previously
 * wrapped around in the unsigned arithmetic below).
 */
#define __KB_INIT(name, key_t)											\
	kbtree_##name##_t *kb_init_##name(int size)							\
	{																	\
		kbtree_##name##_t *b;											\
		/* size - 4 - sizeof(void*) is evaluated as unsigned: reject anything that would underflow */ \
		if (size < 0 || (size_t)size < 4 + sizeof(void*)) return 0;		\
		b = (kbtree_##name##_t*)xcalloc(1, sizeof(kbtree_##name##_t));	\
		b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \
		if (b->t < 2) {													\
			free(b); return 0;											\
		}																\
		b->n = 2 * b->t - 1;											\
		b->off_ptr = 4 + b->n * sizeof(key_t);							\
		/* node sizes are rounded up to a multiple of 4 */				\
		b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \
		b->elen = (b->off_ptr + 3) >> 2 << 2;							\
		b->root = (kbnode_t*)xcalloc(1, b->ilen);						\
		++b->n_nodes;													\
		return b;														\
	}
|
||||
|
||||
/*
 * Free every node of tree b and then b itself; a null b is accepted.
 * Walks the tree with an explicit, growable stack of pending nodes.
 * Locals carry a __kd prefix so they cannot capture the caller's argument
 * (e.g. a tree variable called x or stack). b is evaluated more than once.
 */
#define __kb_destroy(b) do {											\
		int __kdi, __kdmax = 8;											\
		kbnode_t *__kdx, **__kdtop, **__kdstack = 0;					\
		if (b) {														\
			__kdtop = __kdstack = (kbnode_t**)xcalloc(__kdmax, sizeof(kbnode_t*)); \
			*__kdtop++ = (b)->root;										\
			while (__kdtop != __kdstack) {								\
				__kdx = *--__kdtop;										\
				if (__kdx->is_internal == 0) { free(__kdx); continue; }	\
				for (__kdi = 0; __kdi <= __kdx->n; ++__kdi)				\
					if (__KB_PTR((b), __kdx)[__kdi]) {					\
						if (__kdtop - __kdstack == __kdmax) { /* stack full: double it */ \
							__kdmax <<= 1;								\
							__kdstack = (kbnode_t**)xrealloc(__kdstack, __kdmax * sizeof(kbnode_t*)); \
							__kdtop = __kdstack + (__kdmax>>1);			\
						}												\
						*__kdtop++ = __KB_PTR((b), __kdx)[__kdi];		\
					}													\
				free(__kdx);											\
			}															\
		}																\
		free(b); free(__kdstack);										\
	} while (0)
|
||||
|
||||
/*
 * Store in ret the first key of the leftmost leaf of tree b.
 * Descends while the node is internal. Leaves are not asked for a child
 * pointer, because a leaf allocated with elen has no pointer array to read.
 */
#define __kb_get_first(key_t, b, ret) do {	\
		kbnode_t *__x = (b)->root;			\
		while (__x->is_internal)			\
			__x = __KB_PTR((b), __x)[0];	\
		(ret) = __KB_KEY(key_t, __x)[0];	\
	} while (0)
|
||||
|
||||
/*
 * Defines __kb_get_aux_<name>(x, k, r): linear scan of one node.
 * Returns the index of the last key that is not greater than *k, or -1 when
 * there is none or the node is empty. When r is non-null it receives the
 * result of the last comparison made (it is left untouched for an empty node).
 */
#define __KB_GET_AUX0(name, key_t, __cmp)								\
	static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
	{																	\
		int scratch, *res = r? r : &scratch;							\
		int lo, hi, pos;												\
		if (x->n == 0) return -1;										\
		/* one probe at the middle key picks the half to scan */		\
		pos = x->n >> 1;												\
		if (__cmp(*k, __KB_KEY(key_t, x)[pos]) < 0) {					\
			lo = 0; hi = pos;											\
		} else {														\
			lo = pos; hi = x->n - 1;									\
		}																\
		/* walk down from the top of that half until a key <= *k is met */ \
		for (pos = hi; pos >= lo; --pos)								\
			if ((*res = __cmp(*k, __KB_KEY(key_t, x)[pos])) >= 0) break; \
		return pos;														\
	}
|
||||
|
||||
/*
 * Defines __kb_getp_aux_<name>(x, k, r): binary search of one node.
 * Returns the index of the last key that is not greater than *k, or -1 when
 * there is none or the node is empty. When r is non-null it receives the
 * comparison of *k against the key that decided the result: 0 on an exact
 * hit, 1 when *k is greater than every key. It is left untouched for an
 * empty node.
 */
#define __KB_GET_AUX1(name, key_t, __cmp)								\
	static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
	{																	\
		int scratch, *res = r? r : &scratch;							\
		int lo = 0, hi = x->n;											\
		if (hi == 0) return -1;											\
		/* lower bound: first index whose key is not less than *k */	\
		while (lo < hi) {												\
			const int mid = (lo + hi) >> 1;								\
			if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) lo = mid + 1;	\
			else hi = mid;												\
		}																\
		if (lo == x->n) {												\
			*res = 1;													\
			return lo - 1;												\
		}																\
		*res = __cmp(*k, __KB_KEY(key_t, x)[lo]);						\
		return *res < 0? lo - 1 : lo;									\
	}
|
||||
|
||||
/*
 * Defines kb_getp_<name>(b, k) and kb_get_<name>(b, k): look a key up.
 * Both return a pointer to the stored key that compares equal to the query,
 * or 0 when the tree holds no such key. kb_get takes the key by value.
 */
#define __KB_GET(name, key_t)											\
	static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{																	\
		int order = 0; /* the node search leaves this untouched for an empty node */ \
		kbnode_t *node = b->root;										\
		while (node != 0) {												\
			const int pos = __kb_getp_aux_##name(node, k, &order);		\
			if (pos >= 0 && order == 0) return &__KB_KEY(key_t, node)[pos]; \
			if (!node->is_internal) break; /* reached a leaf without a hit */ \
			node = __KB_PTR(b, node)[pos + 1];							\
		}																\
		return 0;														\
	}																	\
	static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \
	{																	\
		return kb_getp_##name(b, &k);									\
	}
|
||||
|
||||
/*
 * Defines kb_intervalp_<name>(b, k, lower, upper) and the by-value wrapper
 * kb_interval_<name>: bracket a key.
 * On an exact hit *lower and *upper both point at the stored key. Otherwise
 * they are updated on the way down with the nearest smaller and nearest
 * larger key seen in each visited node; either stays 0 when none was seen.
 */
#define __KB_INTERVAL(name, key_t)										\
	static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper)	\
	{																	\
		int order = 0;													\
		kbnode_t *node;													\
		*lower = *upper = 0;											\
		for (node = b->root; node != 0; ) {								\
			const int pos = __kb_getp_aux_##name(node, k, &order);		\
			key_t *keys = __KB_KEY(key_t, node);						\
			if (pos >= 0) {												\
				if (order == 0) { /* exact match */						\
					*lower = *upper = &keys[pos];						\
					return;												\
				}														\
				*lower = &keys[pos];									\
			}															\
			if (pos < node->n - 1) *upper = &keys[pos + 1];				\
			if (!node->is_internal) return;								\
			node = __KB_PTR(b, node)[pos + 1];							\
		}																\
	}																	\
	static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \
	{																	\
		kb_intervalp_##name(b, &k, lower, upper);						\
	}
|
||||
|
||||
/*
 * Defines the insertion functions for a tree:
 *   __kb_split_<name>     split full child y (= child i of x) around its median key
 *   __kb_putp_aux_<name>  insert *k below x, which must not be full
 *   kb_putp_<name>        public insert by pointer; grows a new root when the root is full
 *   kb_put_<name>         public insert by value
 * No check for an already present equal key is made.
 */
#define __KB_PUT(name, key_t, __cmp)									\
	/* x must be an internal node */									\
	static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
	{																	\
		const int half = b->t - 1; /* keys left in each half after the split */ \
		kbnode_t *right = (kbnode_t*)xcalloc(1, y->is_internal? b->ilen : b->elen); \
		++b->n_nodes;													\
		right->is_internal = y->is_internal;							\
		right->n = half;												\
		/* upper keys (and children) of y move to the new right sibling */ \
		memcpy(__KB_KEY(key_t, right), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * half); \
		if (y->is_internal)												\
			memcpy(__KB_PTR(b, right), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
		y->n = half;													\
		/* open slot i+1 in x's children for the sibling and slot i in x's keys for the median */ \
		memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
		__KB_PTR(b, x)[i + 1] = right;									\
		memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
		__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[half];				\
		++x->n;															\
	}																	\
	static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
	{																	\
		int slot;														\
		/* descend to a leaf, splitting every full child before stepping into it */ \
		while (x->is_internal) {										\
			slot = __kb_getp_aux_##name(x, k, 0) + 1;					\
			if (__KB_PTR(b, x)[slot]->n == 2 * b->t - 1) {				\
				__kb_split_##name(b, x, slot, __KB_PTR(b, x)[slot]);	\
				if (__cmp(*k, __KB_KEY(key_t, x)[slot]) > 0) ++slot;	\
			}															\
			x = __KB_PTR(b, x)[slot];									\
		}																\
		/* leaf: shift the larger keys up by one and drop *k in after position slot */ \
		slot = __kb_getp_aux_##name(x, k, 0);							\
		if (slot != x->n - 1)											\
			memmove(__KB_KEY(key_t, x) + slot + 2, __KB_KEY(key_t, x) + slot + 1, (x->n - slot - 1) * sizeof(key_t)); \
		__KB_KEY(key_t, x)[slot + 1] = *k;								\
		++x->n;															\
	}																	\
	static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{																	\
		kbnode_t *top = b->root;										\
		++b->n_keys;													\
		if (top->n == 2 * b->t - 1) {									\
			/* full root: hang it under a fresh internal root and split it */ \
			kbnode_t *grown;											\
			++b->n_nodes;												\
			grown = (kbnode_t*)xcalloc(1, b->ilen);						\
			b->root = grown; grown->is_internal = 1; grown->n = 0;		\
			__KB_PTR(b, grown)[0] = top;								\
			__kb_split_##name(b, grown, 0, top);						\
			top = grown;												\
		}																\
		__kb_putp_aux_##name(b, top, k);								\
	}																	\
	static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \
	{																	\
		kb_putp_##name(b, &k);											\
	}
|
||||
|
||||
|
||||
/*
 * Defines the deletion functions for a tree:
 *   __kb_delp_aux_<name>(b, x, k, s)  remove a key from the subtree rooted at x and return it
 *       s == 0: remove the key equal to *k
 *       s == 1: remove the last key of the subtree (k is passed as 0)
 *       s == 2: remove the first key of the subtree (k is passed as 0)
 *   kb_delp_<name>(b, k)  public delete by pointer; drops an emptied internal root
 *   kb_del_<name>(b, k)   public delete by value
 * The key is assumed to be present; no "not found" result is reported.
 * n_nodes is decremented wherever a node is freed, including the three merge
 * paths, so that it keeps matching the number of allocated nodes.
 */
#define __KB_DEL(name, key_t)											\
	static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
	{																	\
		int yn, zn, i, r = 0;											\
		kbnode_t *xp, *y, *z;											\
		key_t kp;														\
		if (x == 0) return *k;											\
		if (s) { /* s can only be 0, 1 or 2 */							\
			r = x->is_internal == 0? 0 : s == 1? 1 : -1;				\
			i = s == 1? x->n - 1 : -1;									\
		} else i = __kb_getp_aux_##name(x, k, &r);						\
		if (x->is_internal == 0) {										\
			/* leaf: take the key out and close the gap */				\
			if (s == 2) ++i;											\
			kp = __KB_KEY(key_t, x)[i];									\
			memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
			--x->n;														\
			return kp;													\
		}																\
		if (r == 0) {													\
			/* the key sits in this internal node at position i */		\
			if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) {					\
				/* replace it with the last key of the left child's subtree */ \
				xp = __KB_PTR(b, x)[i];									\
				kp = __KB_KEY(key_t, x)[i];								\
				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
				return kp;												\
			} else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) {		\
				/* replace it with the first key of the right child's subtree */ \
				xp = __KB_PTR(b, x)[i + 1];								\
				kp = __KB_KEY(key_t, x)[i];								\
				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
				return kp;												\
			} else if (yn == b->t - 1 && zn == b->t - 1) {				\
				/* both children minimal: merge key and right child into the left child, then recurse */ \
				y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1];		\
				__KB_KEY(key_t, y)[y->n++] = *k;						\
				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
				y->n += z->n;											\
				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
				--x->n;													\
				free(z);												\
				--b->n_nodes;											\
				return __kb_delp_aux_##name(b, y, k, s);				\
			}															\
		}																\
		/* the key lies below child i+1: make sure that child has at least t keys before descending */ \
		++i;															\
		if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) {					\
			if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) {		\
				/* borrow the last key of the left sibling through the parent */ \
				memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
				if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
				__KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1];		\
				__KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
				if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
				--y->n; ++xp->n;										\
			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \
				/* borrow the first key of the right sibling through the parent */ \
				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i];	\
				__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0];			\
				if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
				--y->n;													\
				memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
				if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
			} else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \
				/* merge the child into its minimal left sibling */		\
				__KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1];	\
				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t));	\
				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
				y->n += xp->n;											\
				memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
				memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
				--x->n;													\
				free(xp);												\
				--b->n_nodes;											\
				xp = y;													\
			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \
				/* merge the minimal right sibling into the child */	\
				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i];	\
				memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t));	\
				if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
				xp->n += y->n;											\
				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
				--x->n;													\
				free(y);												\
				--b->n_nodes;											\
			}															\
		}																\
		return __kb_delp_aux_##name(b, xp, k, s);						\
	}																	\
	static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
	{																	\
		kbnode_t *x;													\
		key_t ret;														\
		ret = __kb_delp_aux_##name(b, b->root, k, 0);					\
		--b->n_keys;													\
		if (b->root->n == 0 && b->root->is_internal) {					\
			/* the root lost its last key: its only child becomes the root */ \
			--b->n_nodes;												\
			x = b->root;												\
			b->root = __KB_PTR(b, x)[0];								\
			free(x);													\
		}																\
		return ret;														\
	}																	\
	static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
	{																	\
		return kb_delp_##name(b, &k);									\
	}
|
||||
|
||||
typedef struct {
|
||||
kbnode_t *x;
|
||||
int i;
|
||||
} __kbstack_t;
|
||||
|
||||
#define __kb_traverse(key_t, b, __func) do { \
|
||||
int __kmax = 8; \
|
||||
__kbstack_t *__kstack, *__kp; \
|
||||
__kp = __kstack = (__kbstack_t*)xcalloc(__kmax, sizeof(__kbstack_t)); \
|
||||
__kp->x = (b)->root; __kp->i = 0; \
|
||||
for (;;) { \
|
||||
while (__kp->x && __kp->i <= __kp->x->n) { \
|
||||
if (__kp - __kstack == __kmax - 1) { \
|
||||
__kmax <<= 1; \
|
||||
__kstack = (__kbstack_t*)xrealloc(__kstack, __kmax * sizeof(__kbstack_t)); \
|
||||
__kp = __kstack + (__kmax>>1) - 1; \
|
||||
} \
|
||||
(__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \
|
||||
++__kp; \
|
||||
} \
|
||||
--__kp; \
|
||||
if (__kp >= __kstack) { \
|
||||
if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
|
||||
++__kp->i; \
|
||||
} else break; \
|
||||
} \
|
||||
free(__kstack); \
|
||||
} while (0)
|
||||
|
||||
/* Instantiate the tree type plus the init, search, interval, insert and delete functions for keys of type key_t ordered by __cmp. */
#define KBTREE_INIT(name, key_t, __cmp) \
|
||||
__KB_TREE_T(name) \
|
||||
__KB_INIT(name, key_t) \
|
||||
__KB_GET_AUX1(name, key_t, __cmp) \
|
||||
__KB_GET(name, key_t) \
|
||||
__KB_INTERVAL(name, key_t) \
|
||||
__KB_PUT(name, key_t, __cmp) \
|
||||
__KB_DEL(name, key_t)
|
||||
|
||||
/* A node size for kb_init -- presumably the intended default; not referenced elsewhere in this header. */
#define KB_DEFAULT_SIZE 512
|
||||
|
||||
/* Public names: each forwards to the function or macro generated for the instance called `name`. */
#define kbtree_t(name) kbtree_##name##_t
|
||||
#define kb_init(name, s) kb_init_##name(s)
|
||||
#define kb_destroy(name, b) __kb_destroy(b)
|
||||
#define kb_get(name, b, k) kb_get_##name(b, k)
|
||||
#define kb_put(name, b, k) kb_put_##name(b, k)
|
||||
#define kb_del(name, b, k) kb_del_##name(b, k)
|
||||
#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
|
||||
#define kb_getp(name, b, k) kb_getp_##name(b, k)
|
||||
#define kb_putp(name, b, k) kb_putp_##name(b, k)
|
||||
#define kb_delp(name, b, k) kb_delp_##name(b, k)
|
||||
#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)
|
||||
|
||||
/* Number of keys currently counted in tree b. */
#define kb_size(b) ((b)->n_keys)
|
||||
|
||||
/* Three-way comparison built from '<' only: evaluates to -1, 0 or 1. */
#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b)))
|
||||
/* Comparison for C-string keys; forwards to strcmp. */
#define kb_str_cmp(a, b) strcmp(a, b)
|
||||
|
||||
#endif
|
||||
282
khash.h
282
khash.h
|
|
@ -1,6 +1,6 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, 2009 by attractor <attractor@live.co.uk>
|
||||
Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
|
|
@ -33,7 +33,6 @@ int main() {
|
|||
khiter_t k;
|
||||
khash_t(32) *h = kh_init(32);
|
||||
k = kh_put(32, h, 5, &ret);
|
||||
if (!ret) kh_del(32, h, k);
|
||||
kh_value(h, k) = 10;
|
||||
k = kh_get(32, h, 10);
|
||||
is_missing = (k == kh_end(h));
|
||||
|
|
@ -47,6 +46,29 @@ int main() {
|
|||
*/
|
||||
|
||||
/*
|
||||
2011-12-29 (0.2.7):
|
||||
|
||||
* Minor code clean up; no actual effect.
|
||||
|
||||
2011-09-16 (0.2.6):
|
||||
|
||||
* The capacity is a power of 2. This seems to dramatically improve the
|
||||
speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
|
||||
|
||||
- http://code.google.com/p/ulib/
|
||||
- http://nothings.org/computer/judy/
|
||||
|
||||
* Allow to optionally use linear probing which usually has better
|
||||
performance for random input. Double hashing is still the default as it
|
||||
is more robust to certain non-random input.
|
||||
|
||||
* Added Wang's integer hash function (not used by default). This hash
|
||||
function is more robust to certain non-random input.
|
||||
|
||||
2011-02-14 (0.2.5):
|
||||
|
||||
* Allow to declare global functions.
|
||||
|
||||
2009-09-26 (0.2.4):
|
||||
|
||||
* Improve portability
|
||||
|
|
@ -86,11 +108,9 @@ int main() {
|
|||
@header
|
||||
|
||||
Generic hash table library.
|
||||
|
||||
@copyright Heng Li
|
||||
*/
|
||||
|
||||
#define AC_VERSION_KHASH_H "0.2.4"
|
||||
#define AC_VERSION_KHASH_H "0.2.6"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
|
@ -112,24 +132,14 @@ typedef unsigned long long khint64_t;
|
|||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define inline __inline
|
||||
#define kh_inline __inline
|
||||
#else
|
||||
#define kh_inline inline
|
||||
#endif
|
||||
|
||||
typedef khint32_t khint_t;
|
||||
typedef khint_t khiter_t;
|
||||
|
||||
#define __ac_HASH_PRIME_SIZE 32
|
||||
static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
|
||||
{
|
||||
0ul, 3ul, 11ul, 23ul, 53ul,
|
||||
97ul, 193ul, 389ul, 769ul, 1543ul,
|
||||
3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
|
||||
98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
|
||||
3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
|
||||
100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
|
||||
3221225473ul, 4294967291ul
|
||||
};
|
||||
|
||||
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
|
||||
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
|
||||
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
|
||||
|
|
@ -138,88 +148,128 @@ static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
|
|||
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
|
||||
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
|
||||
|
||||
/*
 * Probe step used when resolving a collision.
 *   k - hash value of the key
 *   m - bucket mask (n_buckets - 1)
 * With KHASH_LINEAR the step is always 1 (linear probing). Otherwise the step
 * is derived from the hash, forced odd by `| 1`, and reduced with the mask.
 * The expansion is fully parenthesized so it can be used safely inside a
 * larger expression.
 */
#ifdef KHASH_LINEAR
#define __ac_inc(k, m) 1
#else
#define __ac_inc(k, m) (((((k)>>3) ^ ((k)<<3)) | 1) & (m))
#endif
|
||||
|
||||
#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
|
||||
|
||||
#ifndef kroundup32
|
||||
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
|
||||
#endif
|
||||
|
||||
#ifndef kcalloc
|
||||
#define kcalloc(N,Z) xcalloc(N,Z)
|
||||
#endif
|
||||
#ifndef kmalloc
|
||||
#define kmalloc(Z) xmalloc(Z)
|
||||
#endif
|
||||
#ifndef krealloc
|
||||
#define krealloc(P,Z) xrealloc(P,Z)
|
||||
#endif
|
||||
#ifndef kfree
|
||||
#define kfree(P) free(P)
|
||||
#endif
|
||||
|
||||
static const double __ac_HASH_UPPER = 0.77;
|
||||
|
||||
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
typedef struct { \
|
||||
khint_t n_buckets, size, n_occupied, upper_bound; \
|
||||
khint32_t *flags; \
|
||||
khkey_t *keys; \
|
||||
khval_t *vals; \
|
||||
} kh_##name##_t; \
|
||||
static inline kh_##name##_t *kh_init_##name() { \
|
||||
return (kh_##name##_t*)xcalloc(1, sizeof(kh_##name##_t)); \
|
||||
#define __KHASH_TYPE(name, khkey_t, khval_t) \
|
||||
typedef struct { \
|
||||
khint_t n_buckets, size, n_occupied, upper_bound; \
|
||||
khint32_t *flags; \
|
||||
khkey_t *keys; \
|
||||
khval_t *vals; \
|
||||
} kh_##name##_t;
|
||||
|
||||
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
|
||||
extern kh_##name##_t *kh_init_##name(void); \
|
||||
extern void kh_destroy_##name(kh_##name##_t *h); \
|
||||
extern void kh_clear_##name(kh_##name##_t *h); \
|
||||
extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
|
||||
extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
|
||||
extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
|
||||
extern void kh_del_##name(kh_##name##_t *h, khint_t x);
|
||||
|
||||
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
SCOPE kh_##name##_t *kh_init_##name(void) { \
|
||||
return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
|
||||
} \
|
||||
static inline void kh_destroy_##name(kh_##name##_t *h) \
|
||||
SCOPE void kh_destroy_##name(kh_##name##_t *h) \
|
||||
{ \
|
||||
if (h) { \
|
||||
free(h->keys); free(h->flags); \
|
||||
free(h->vals); \
|
||||
free(h); \
|
||||
kfree((void *)h->keys); kfree(h->flags); \
|
||||
kfree((void *)h->vals); \
|
||||
kfree(h); \
|
||||
} \
|
||||
} \
|
||||
static inline void kh_clear_##name(kh_##name##_t *h) \
|
||||
SCOPE void kh_clear_##name(kh_##name##_t *h) \
|
||||
{ \
|
||||
if (h && h->flags) { \
|
||||
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
|
||||
memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
|
||||
h->size = h->n_occupied = 0; \
|
||||
} \
|
||||
} \
|
||||
static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
|
||||
SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
|
||||
{ \
|
||||
if (h->n_buckets) { \
|
||||
khint_t inc, k, i, last; \
|
||||
k = __hash_func(key); i = k % h->n_buckets; \
|
||||
inc = 1 + k % (h->n_buckets - 1); last = i; \
|
||||
khint_t inc, k, i, last, mask; \
|
||||
mask = h->n_buckets - 1; \
|
||||
k = __hash_func(key); i = k & mask; \
|
||||
inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
|
||||
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
|
||||
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
|
||||
else i += inc; \
|
||||
i = (i + inc) & mask; \
|
||||
if (i == last) return h->n_buckets; \
|
||||
} \
|
||||
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
|
||||
} else return 0; \
|
||||
} \
|
||||
static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
|
||||
{ \
|
||||
SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
|
||||
{ /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
|
||||
khint32_t *new_flags = 0; \
|
||||
khint_t j = 1; \
|
||||
{ \
|
||||
khint_t t = __ac_HASH_PRIME_SIZE - 1; \
|
||||
while (__ac_prime_list[t] > new_n_buckets) --t; \
|
||||
new_n_buckets = __ac_prime_list[t+1]; \
|
||||
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
|
||||
else { \
|
||||
new_flags = (khint32_t*)xmalloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
|
||||
memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
|
||||
if (h->n_buckets < new_n_buckets) { \
|
||||
h->keys = (khkey_t*)xrealloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
|
||||
if (kh_is_map) \
|
||||
h->vals = (khval_t*)xrealloc(h->vals, new_n_buckets * sizeof(khval_t)); \
|
||||
} \
|
||||
kroundup32(new_n_buckets); \
|
||||
if (new_n_buckets < 4) new_n_buckets = 4; \
|
||||
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
|
||||
else { /* hash table size to be changed (shrink or expand); rehash */ \
|
||||
new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
|
||||
if (!new_flags) return -1; \
|
||||
memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
|
||||
if (h->n_buckets < new_n_buckets) { /* expand */ \
|
||||
khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
|
||||
if (!new_keys) return -1; \
|
||||
h->keys = new_keys; \
|
||||
if (kh_is_map) { \
|
||||
khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
|
||||
if (!new_vals) return -1; \
|
||||
h->vals = new_vals; \
|
||||
} \
|
||||
} /* otherwise shrink */ \
|
||||
} \
|
||||
} \
|
||||
if (j) { \
|
||||
if (j) { /* rehashing is needed */ \
|
||||
for (j = 0; j != h->n_buckets; ++j) { \
|
||||
if (__ac_iseither(h->flags, j) == 0) { \
|
||||
khkey_t key = h->keys[j]; \
|
||||
khval_t val; \
|
||||
khint_t new_mask; \
|
||||
new_mask = new_n_buckets - 1; \
|
||||
if (kh_is_map) val = h->vals[j]; \
|
||||
__ac_set_isdel_true(h->flags, j); \
|
||||
while (1) { \
|
||||
while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
|
||||
khint_t inc, k, i; \
|
||||
k = __hash_func(key); \
|
||||
i = k % new_n_buckets; \
|
||||
inc = 1 + k % (new_n_buckets - 1); \
|
||||
while (!__ac_isempty(new_flags, i)) { \
|
||||
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
|
||||
else i += inc; \
|
||||
} \
|
||||
i = k & new_mask; \
|
||||
inc = __ac_inc(k, new_mask); \
|
||||
while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
|
||||
__ac_set_isempty_false(new_flags, i); \
|
||||
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
|
||||
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
|
||||
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
|
||||
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
|
||||
__ac_set_isdel_true(h->flags, i); \
|
||||
} else { \
|
||||
__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
|
||||
} else { /* write the element and jump out of the loop */ \
|
||||
h->keys[i] = key; \
|
||||
if (kh_is_map) h->vals[i] = val; \
|
||||
break; \
|
||||
|
|
@ -227,35 +277,39 @@ static const double __ac_HASH_UPPER = 0.77;
|
|||
} \
|
||||
} \
|
||||
} \
|
||||
if (h->n_buckets > new_n_buckets) { \
|
||||
h->keys = (khkey_t*)xrealloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
|
||||
if (kh_is_map) \
|
||||
h->vals = (khval_t*)xrealloc(h->vals, new_n_buckets * sizeof(khval_t)); \
|
||||
if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
|
||||
h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
|
||||
if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
|
||||
} \
|
||||
free(h->flags); \
|
||||
kfree(h->flags); /* free the working space */ \
|
||||
h->flags = new_flags; \
|
||||
h->n_buckets = new_n_buckets; \
|
||||
h->n_occupied = h->size; \
|
||||
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
|
||||
} \
|
||||
return 0; \
|
||||
} \
|
||||
static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
|
||||
SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
|
||||
{ \
|
||||
khint_t x; \
|
||||
if (h->n_occupied >= h->upper_bound) { \
|
||||
if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
|
||||
else kh_resize_##name(h, h->n_buckets + 1); \
|
||||
} \
|
||||
if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
|
||||
if (h->n_buckets > (h->size<<1)) { \
|
||||
if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
|
||||
*ret = -1; return h->n_buckets; \
|
||||
} \
|
||||
} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
|
||||
*ret = -1; return h->n_buckets; \
|
||||
} \
|
||||
} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
|
||||
{ \
|
||||
khint_t inc, k, i, site, last; \
|
||||
x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
|
||||
if (__ac_isempty(h->flags, i)) x = i; \
|
||||
khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
|
||||
x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
|
||||
if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
|
||||
else { \
|
||||
inc = 1 + k % (h->n_buckets - 1); last = i; \
|
||||
inc = __ac_inc(k, mask); last = i; \
|
||||
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
|
||||
if (__ac_isdel(h->flags, i)) site = i; \
|
||||
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
|
||||
else i += inc; \
|
||||
i = (i + inc) & mask; \
|
||||
if (i == last) { x = site; break; } \
|
||||
} \
|
||||
if (x == h->n_buckets) { \
|
||||
|
|
@ -264,20 +318,20 @@ static const double __ac_HASH_UPPER = 0.77;
|
|||
} \
|
||||
} \
|
||||
} \
|
||||
if (__ac_isempty(h->flags, x)) { \
|
||||
if (__ac_isempty(h->flags, x)) { /* not present at all */ \
|
||||
h->keys[x] = key; \
|
||||
__ac_set_isboth_false(h->flags, x); \
|
||||
++h->size; ++h->n_occupied; \
|
||||
*ret = 1; \
|
||||
} else if (__ac_isdel(h->flags, x)) { \
|
||||
} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
|
||||
h->keys[x] = key; \
|
||||
__ac_set_isboth_false(h->flags, x); \
|
||||
++h->size; \
|
||||
*ret = 2; \
|
||||
} else *ret = 0; \
|
||||
} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
|
||||
return x; \
|
||||
} \
|
||||
static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
|
||||
SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
|
||||
{ \
|
||||
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
|
||||
__ac_set_isdel_true(h->flags, x); \
|
||||
|
|
@ -285,6 +339,17 @@ static const double __ac_HASH_UPPER = 0.77;
|
|||
} \
|
||||
}
|
||||
|
||||
#define KHASH_DECLARE(name, khkey_t, khval_t) \
|
||||
__KHASH_TYPE(name, khkey_t, khval_t) \
|
||||
__KHASH_PROTOTYPES(name, khkey_t, khval_t)
|
||||
|
||||
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
__KHASH_TYPE(name, khkey_t, khval_t) \
|
||||
__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
|
||||
|
||||
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
|
||||
|
||||
/* --- BEGIN OF HASH FUNCTIONS --- */
|
||||
|
||||
/*! @function
|
||||
|
|
@ -312,10 +377,10 @@ static const double __ac_HASH_UPPER = 0.77;
|
|||
@param s Pointer to a null terminated string
|
||||
@return The hash value
|
||||
*/
|
||||
static inline khint_t __ac_X31_hash_string(const char *s)
|
||||
static kh_inline khint_t __ac_X31_hash_string(const char *s)
|
||||
{
|
||||
khint_t h = *s;
|
||||
if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
|
||||
khint_t h = (khint_t)*s;
|
||||
if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
|
||||
return h;
|
||||
}
|
||||
/*! @function
|
||||
|
|
@ -329,9 +394,21 @@ static inline khint_t __ac_X31_hash_string(const char *s)
|
|||
*/
|
||||
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
|
||||
|
||||
/* Wang's integer hash: scrambles `key` through a fixed sequence of
 * shift/add/xor steps and returns the mixed value. */
static kh_inline khint_t __ac_Wang_hash(khint_t key)
{
	key = key + ~(key << 15);
	key = key ^ (key >> 10);
	key = key + (key << 3);
	key = key ^ (key >> 6);
	key = key + ~(key << 11);
	key = key ^ (key >> 16);
	return key;
}
|
||||
/* Alternative integer hash built on __ac_Wang_hash(); hashes the macro
 * argument itself (previously it referred to an unrelated identifier `key`). */
#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k))
|
||||
|
||||
/* --- END OF HASH FUNCTIONS --- */
|
||||
|
||||
/* Other necessary macros... */
|
||||
/* Other convenient macros... */
|
||||
|
||||
/*!
|
||||
@abstract Type of the hash table.
|
||||
|
|
@ -397,7 +474,6 @@ static inline khint_t __ac_X31_hash_string(const char *s)
|
|||
*/
|
||||
#define kh_del(name, h, k) kh_del_##name(h, k)
|
||||
|
||||
|
||||
/*! @function
|
||||
@abstract Test whether a bucket contains data.
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
|
|
@ -456,6 +532,34 @@ static inline khint_t __ac_X31_hash_string(const char *s)
|
|||
*/
|
||||
#define kh_n_buckets(h) ((h)->n_buckets)
|
||||
|
||||
/*! @function
|
||||
@abstract Iterate over the entries in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param kvar Variable to which key will be assigned
|
||||
@param vvar Variable to which value will be assigned
|
||||
@param code Block of code to execute
|
||||
*/
|
||||
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
|
||||
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
|
||||
if (!kh_exist(h,__i)) continue; \
|
||||
(kvar) = kh_key(h,__i); \
|
||||
(vvar) = kh_val(h,__i); \
|
||||
code; \
|
||||
} }
|
||||
|
||||
/*! @function
|
||||
@abstract Iterate over the values in the hash table
|
||||
@param h Pointer to the hash table [khash_t(name)*]
|
||||
@param vvar Variable to which value will be assigned
|
||||
@param code Block of code to execute
|
||||
*/
|
||||
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
|
||||
for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
|
||||
if (!kh_exist(h,__i)) continue; \
|
||||
(vvar) = kh_val(h,__i); \
|
||||
code; \
|
||||
} }
|
||||
|
||||
/* More convenient interfaces */
|
||||
|
||||
/*! @function
|
||||
|
|
|
|||
|
|
@ -0,0 +1,372 @@
|
|||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <ctype.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <signal.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/types.h>
|
||||
#ifndef _WIN32
|
||||
#include <netdb.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <sys/socket.h>
|
||||
#endif
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#define _KO_NO_NET
|
||||
#endif
|
||||
|
||||
#ifndef _KO_NO_NET
|
||||
/*
 * Block for at most 5 seconds until `fd` is ready.
 *   is_read != 0 : wait for readability
 *   is_read == 0 : wait for writability
 * Returns the value of select(): >0 when ready, 0 on timeout, -1 on error
 * (in which case the error is reported with perror()).
 */
static int socket_wait(int fd, int is_read)
{
	fd_set watched;
	struct timeval timeout;
	int status;

	FD_ZERO(&watched);
	FD_SET(fd, &watched);
	timeout.tv_sec = 5; /* 5 seconds time out */
	timeout.tv_usec = 0;
	if (is_read)
		status = select(fd + 1, &watched, 0, 0, &timeout);
	else
		status = select(fd + 1, 0, &watched, 0, &timeout);
	if (status == -1)
		perror("select");
	return status;
}
|
||||
|
||||
/*
 * Open a TCP connection to host:port.
 *
 * The first address returned by getaddrinfo() is used. SO_REUSEADDR and a
 * zeroed SO_LINGER are set on the socket before connecting.
 *
 * Returns the connected descriptor, or -1 on failure after printing a
 * diagnostic to stderr. On failure no descriptor or address list is leaked.
 */
static int socket_connect(const char *host, const char *port)
{
	int on = 1, fd = -1, rc;
	struct linger lng = { 0, 0 };
	struct addrinfo hints, *res = 0;

	memset(&hints, 0, sizeof(struct addrinfo));
	hints.ai_family = AF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	/* getaddrinfo() reports errors through its return code, not errno */
	if ((rc = getaddrinfo(host, port, &hints, &res)) != 0) {
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
		return -1; /* nothing was allocated, so nothing to free */
	}
	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) {
		perror("socket");
		goto fail;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
		perror("setsockopt");
		goto fail;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) {
		perror("setsockopt");
		goto fail;
	}
	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) {
		perror("connect");
		goto fail;
	}
	freeaddrinfo(res);
	return fd;

fail:
	if (fd != -1) close(fd); /* do not leak the half-configured socket */
	freeaddrinfo(res);
	return -1;
}
|
||||
|
||||
/*
 * Write exactly `len` bytes from `buf` to `fd`.
 *
 * Short writes are continued from where the previous write stopped;
 * EAGAIN / EWOULDBLOCK / EINTR are retried.
 *
 * Returns 0 once everything has been written, -1 on any other write error.
 */
static int write_bytes(int fd, const char *buf, size_t len)
{
	ssize_t bytes;
	do {
		bytes = write(fd, buf, len);
		if (bytes >= 0) {
			buf += bytes; /* skip what was already sent so it is not resent */
			len -= bytes;
		} else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
			return -1;
		}
	} while (len > 0);

	return 0;
}
|
||||
|
||||
/*
 * Open an "http://" URL and return a descriptor positioned at the start of
 * the response body.
 *
 * The URL is split into host[:port] and path. If the environment variable
 * http_proxy is set, the connection goes to the proxy and the full URL is
 * used as the request path. A "GET ... HTTP/1.0" request is sent and the
 * response header is consumed byte by byte up to the blank line.
 *
 * Returns the socket descriptor on a 200 response; -1 if `fn` is not an
 * http:// URL, the connection or request fails, the header is truncated,
 * or the status code is not 200.
 */
static int http_open(const char *fn)
{
	char *p, *proxy, *q, *http_host, *host, *port, *path, *buf;
	int fd, ret, l;
	ssize_t bytes = 0, bufsz = 0x10000;

	/* parse URL; adapted from khttp_parse_url() in knetfile.c */
	if (strstr(fn, "http://") != fn) return -1; /* 0 would be a valid descriptor */
	// set ->http_host
	for (p = (char*)fn + 7; *p && *p != '/'; ++p);
	l = p - fn - 7;
	http_host = xcalloc(l + 1, 1);
	memcpy(http_host, fn + 7, l);
	http_host[l] = 0;
	for (q = http_host; *q && *q != ':'; ++q);
	if (*q == ':') *q++ = 0;
	// get http_proxy
	proxy = getenv("http_proxy");
	// set host, port and path
	if (proxy == 0) {
		host = xstrdup(http_host); // when there is no proxy, server name is identical to http_host name.
		port = xstrdup(*q? q : "80");
		path = xstrdup(*p? p : "/");
	} else {
		host = (strstr(proxy, "http://") == proxy)? xstrdup(proxy + 7) : xstrdup(proxy);
		for (q = host; *q && *q != ':'; ++q);
		if (*q == ':') *q++ = 0;
		port = xstrdup(*q? q : "80");
		path = xstrdup(fn);
	}

	/* connect; adapted from khttp_connect() in knetfile.c */
	buf = xcalloc(bufsz, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
	fd = socket_connect(host, port);
	if (fd == -1) goto out;
	l = snprintf(buf, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", path, http_host);
	/* snprintf() returns the untruncated length; refuse a request that did not fit */
	if (l < 0 || l >= bufsz || write_bytes(fd, buf, l) != 0) {
		close(fd);
		fd = -1;
		goto out;
	}
	l = 0;
retry:
	/* leave room for the terminating NUL written below */
	while (l < bufsz - 1 && (bytes = read(fd, buf + l, 1)) > 0) { // read HTTP header; FIXME: bad efficiency
		if (buf[l] == '\n' && l >= 3)
			if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
		++l;
	}
	if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) goto retry;

	buf[l] = 0;
	if (bytes < 0 || l < 14) { // prematured header
		close(fd);
		fd = -1;
		goto out;
	}
	ret = strtol(buf + 8, &p, 0); // HTTP return code
	if (ret != 200) {
		close(fd);
		fd = -1;
	}
out:
	free(buf); free(http_host); free(host); free(port); free(path);
	return fd;
}
|
||||
|
||||
/* State of one FTP control connection. */
typedef struct {
	int max_response; /* bytes currently allocated for `response` */
	int ctrl_fd;      /* descriptor of the control connection */
	char *response;   /* buffer holding the most recent reply line */
} ftpaux_t;
|
||||
|
||||
/*
 * Read one complete reply from the FTP control connection into
 * aux->response (grown on demand) and return its numeric code.
 *
 * Lines are read until one starts with three digits not followed by '-';
 * earlier lines are discarded. The trailing "\r\n" is stripped.
 *
 * Returns the reply code; 0 if the socket did not become readable in time;
 * -1 if fewer than two bytes of a final line were received.
 */
static int kftp_get_response(ftpaux_t *aux)
{
	unsigned char c;
	int n = 0;
	if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
	for (;;) { // FIXME: this is *VERY BAD* for unbuffered I/O
		ssize_t got = read(aux->ctrl_fd, &c, 1);
		if (got < 0 && errno == EINTR) continue; /* interrupted: try again */
		if (got <= 0) break; /* EOF or a real error: `c` holds no new data */
		if (n >= aux->max_response) {
			aux->max_response = aux->max_response? aux->max_response<<1 : 256;
			aux->response = xrealloc(aux->response, aux->max_response);
		}
		aux->response[n++] = c;
		if (c == '\n') {
			/* <ctype.h> functions require an unsigned char value */
			if (n >= 4 && isdigit((unsigned char)aux->response[0])
				&& isdigit((unsigned char)aux->response[1])
				&& isdigit((unsigned char)aux->response[2])
				&& aux->response[3] != '-') break;
			n = 0;
			continue;
		}
	}
	if (n < 2) return -1;
	aux->response[n-2] = 0;
	return strtol(aux->response, 0, 0);
}
|
||||
|
||||
/*
 * Send `cmd` over the FTP control connection.
 * When `is_get` is non-zero the server's reply is read and its code returned;
 * otherwise 0 is returned right after the command has been written.
 * Returns -1 if the socket is not writable or the write fails.
 */
static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get)
{
	if (socket_wait(aux->ctrl_fd, 0) <= 0)
		return -1; /* socket is not ready for writing */
	if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0)
		return -1;
	if (!is_get)
		return 0;
	return kftp_get_response(aux);
}
|
||||
|
||||
static int ftp_open(const char *fn)
|
||||
{
|
||||
char *p, *host = 0, *port = 0, *retr = 0;
|
||||
char host2[80], port2[10];
|
||||
int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4];
|
||||
ftpaux_t aux;
|
||||
|
||||
/* parse URL */
|
||||
if (strstr(fn, "ftp://") != fn) return 0;
|
||||
for (p = (char*)fn + 6; *p && *p != '/'; ++p);
|
||||
if (*p != '/') return 0;
|
||||
l = p - fn - 6;
|
||||
port = xstrdup("21");
|
||||
host = xcalloc(l + 1, 1);
|
||||
strncpy(host, fn + 6, l);
|
||||
retr = xcalloc(strlen(p) + 8, 1);
|
||||
sprintf(retr, "RETR %s\r\n", p);
|
||||
|
||||
/* connect to ctrl */
|
||||
memset(&aux, 0, sizeof(ftpaux_t));
|
||||
aux.ctrl_fd = socket_connect(host, port);
|
||||
if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */
|
||||
|
||||
/* connect to the data stream */
|
||||
kftp_get_response(&aux);
|
||||
kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
|
||||
kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
|
||||
kftp_send_cmd(&aux, "TYPE I\r\n", 1);
|
||||
kftp_send_cmd(&aux, "PASV\r\n", 1);
|
||||
for (p = aux.response; *p && *p != '('; ++p);
|
||||
if (*p != '(') goto ftp_open_end;
|
||||
++p;
|
||||
sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
|
||||
memcpy(pasv_ip, v, 4 * sizeof(int));
|
||||
pasv_port = (v[4]<<8&0xff00) + v[5];
|
||||
kftp_send_cmd(&aux, retr, 0);
|
||||
sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]);
|
||||
sprintf(port2, "%d", pasv_port);
|
||||
fd = socket_connect(host2, port2);
|
||||
if (fd == -1) goto ftp_open_end;
|
||||
ret = kftp_get_response(&aux);
|
||||
if (ret != 150) {
|
||||
close(fd);
|
||||
fd = -1;
|
||||
}
|
||||
close(aux.ctrl_fd);
|
||||
|
||||
ftp_open_end:
|
||||
free(host); free(port); free(retr); free(aux.response);
|
||||
return fd;
|
||||
}
|
||||
#endif /* !defined(_KO_NO_NET) */
|
||||
|
||||
/*
 * Split a command line on whitespace into a NULL-terminated argv array.
 *
 * Leading and trailing whitespace is ignored; no quoting or escaping is
 * interpreted. All arguments live in a single allocation pointed to by
 * argv[0], so the result is released with free(argv[0]); free(argv);
 *
 * Returns 0 if `cmd` is empty or contains only whitespace.
 */
static char **cmd2argv(const char *cmd)
{
	int i, beg, end, argc;
	char **argv, *str;

	/* <ctype.h> functions require an unsigned char value */
	end = strlen(cmd);
	for (i = end - 1; i >= 0; --i) {
		if (!isspace((unsigned char)cmd[i])) break;
	}
	end = i + 1;
	for (beg = 0; beg < end; ++beg) {
		if (!isspace((unsigned char)cmd[beg])) break;
	}
	if (beg == end) return 0;

	/* count word boundaries: the number of arguments is argc + 1 */
	for (i = beg + 1, argc = 0; i < end; ++i) {
		if (isspace((unsigned char)cmd[i]) && !isspace((unsigned char)cmd[i-1]))
			++argc;
	}
	argv = (char**)xcalloc(argc + 2, sizeof(void*)); /* +1 argument, +1 NULL terminator */
	argv[0] = str = (char*)xcalloc(end - beg + 1, 1);
	memcpy(argv[0], cmd + beg, end - beg);

	/* turn separators into NULs and record where each later argument starts */
	for (i = argc = 1; i < end - beg; ++i) {
		if (isspace((unsigned char)str[i])) str[i] = 0;
		else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i];
	}
	return argv;
}
|
||||
|
||||
/* Kinds of input source, stored in koaux_t.type */
#define KO_STDIN 1
#define KO_FILE  2
#define KO_PIPE  3
#define KO_HTTP  4
#define KO_FTP   5

/* Handle returned by kopen() and consumed by kclose() */
typedef struct {
	int type;  /* one of the KO_* constants */
	int fd;    /* descriptor the caller reads from */
	pid_t pid; /* child process id; set only for KO_PIPE */
} koaux_t;
|
||||
|
||||
void *kopen(const char *fn, int *_fd)
|
||||
{
|
||||
koaux_t *aux = 0;
|
||||
*_fd = -1;
|
||||
if (strstr(fn, "http://") == fn) {
|
||||
aux = xcalloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_HTTP;
|
||||
aux->fd = http_open(fn);
|
||||
} else if (strstr(fn, "ftp://") == fn) {
|
||||
aux = xcalloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_FTP;
|
||||
aux->fd = ftp_open(fn);
|
||||
} else if (strcmp(fn, "-") == 0) {
|
||||
aux = xcalloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_STDIN;
|
||||
aux->fd = STDIN_FILENO;
|
||||
} else {
|
||||
const char *p, *q;
|
||||
for (p = fn; *p; ++p)
|
||||
if (!isspace(*p)) break;
|
||||
if (*p == '<') { // pipe open
|
||||
int need_shell, pfd[2];
|
||||
pid_t pid;
|
||||
// a simple check to see if we need to invoke a shell; not always working
|
||||
for (q = p + 1; *q; ++q)
|
||||
if (ispunct(*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
|
||||
break;
|
||||
need_shell = (*q != 0);
|
||||
if (pipe(pfd) != 0) return 0;
|
||||
pid = vfork();
|
||||
if (pid == -1) { /* vfork() error */
|
||||
close(pfd[0]); close(pfd[1]);
|
||||
return 0;
|
||||
}
|
||||
if (pid == 0) { /* the child process */
|
||||
char **argv; /* FIXME: I do not know if this will lead to a memory leak */
|
||||
close(pfd[0]);
|
||||
dup2(pfd[1], STDOUT_FILENO);
|
||||
close(pfd[1]);
|
||||
if (!need_shell) {
|
||||
argv = cmd2argv(p + 1);
|
||||
execvp(argv[0], argv);
|
||||
free(argv[0]); free(argv);
|
||||
} else execl("/bin/sh", "sh", "-c", p + 1, NULL);
|
||||
exit(1);
|
||||
} else { /* parent process */
|
||||
close(pfd[1]);
|
||||
aux = xcalloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_PIPE;
|
||||
aux->fd = pfd[0];
|
||||
aux->pid = pid;
|
||||
}
|
||||
} else {
|
||||
#ifdef _WIN32
|
||||
*_fd = open(fn, O_RDONLY | O_BINARY);
|
||||
#else
|
||||
*_fd = open(fn, O_RDONLY);
|
||||
#endif
|
||||
if (*_fd) {
|
||||
aux = xcalloc(1, sizeof(koaux_t));
|
||||
aux->type = KO_FILE;
|
||||
aux->fd = *_fd;
|
||||
}
|
||||
}
|
||||
}
|
||||
*_fd = aux->fd;
|
||||
return aux;
|
||||
}
|
||||
|
||||
int kclose(void *a)
|
||||
{
|
||||
koaux_t *aux = (koaux_t*)a;
|
||||
if (aux->type == KO_PIPE) {
|
||||
int status;
|
||||
pid_t pid;
|
||||
pid = waitpid(aux->pid, &status, WNOHANG);
|
||||
if (pid != aux->pid) kill(aux->pid, 15);
|
||||
}
|
||||
free(aux);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef _KO_MAIN
#define BUF_SIZE 0x10000
/*
 * Demo driver (built only with _KO_MAIN): opens argv[1] through kopen() and
 * copies its contents to stdout in BUF_SIZE chunks.
 * Exits with 1 on a usage error or when the input cannot be opened.
 */
int main(int argc, char *argv[])
{
	unsigned char buf[BUF_SIZE];
	void *handle;
	FILE *in;
	int fd, got;

	if (argc == 1) {
		fprintf(stderr, "Usage: kopen <file>\n");
		return 1;
	}
	handle = kopen(argv[1], &fd);
	in = fdopen(fd, "r");
	if (in == 0) {
		fprintf(stderr, "ERROR: fail to open the input\n");
		return 1;
	}
	for (;;) {
		got = fread(buf, 1, BUF_SIZE, in);
		if (got == 0) break;          /* nothing read: done */
		fwrite(buf, 1, got, stdout);
		if (got != BUF_SIZE) break;   /* short read: last chunk */
	}
	fclose(in);
	kclose(handle);
	return 0;
}
#endif
|
||||
137
kseq.h
137
kseq.h
|
|
@ -1,6 +1,6 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, by Heng Li <lh3@sanger.ac.uk>
|
||||
Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
|
|
@ -23,6 +23,8 @@
|
|||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Last Modified: 05MAR2012 */
|
||||
|
||||
#ifndef AC_KSEQ_H
|
||||
#define AC_KSEQ_H
|
||||
|
||||
|
|
@ -31,9 +33,14 @@
|
|||
#include <stdlib.h>
|
||||
#include "utils.h"
|
||||
|
||||
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
|
||||
#define KS_SEP_TAB 1 // isspace() && !' '
|
||||
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
|
||||
#define KS_SEP_MAX 2
|
||||
|
||||
#define __KS_TYPE(type_t) \
|
||||
typedef struct __kstream_t { \
|
||||
char *buf; \
|
||||
unsigned char *buf; \
|
||||
int begin, end, is_eof; \
|
||||
type_t f; \
|
||||
} kstream_t;
|
||||
|
|
@ -46,7 +53,7 @@
|
|||
{ \
|
||||
kstream_t *ks = (kstream_t*)xcalloc(1, sizeof(kstream_t)); \
|
||||
ks->f = f; \
|
||||
ks->buf = (char*)xmalloc(__bufsize); \
|
||||
ks->buf = (unsigned char*)xmalloc(__bufsize); \
|
||||
return ks; \
|
||||
} \
|
||||
static inline void ks_destroy(kstream_t *ks) \
|
||||
|
|
@ -83,10 +90,10 @@ typedef struct __kstring_t {
|
|||
#endif
|
||||
|
||||
#define __KS_GETUNTIL(__read, __bufsize) \
|
||||
static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
|
||||
static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
|
||||
{ \
|
||||
if (dret) *dret = 0; \
|
||||
str->l = 0; \
|
||||
str->l = append? str->l : 0; \
|
||||
if (ks->begin >= ks->end && ks->is_eof) return -1; \
|
||||
for (;;) { \
|
||||
int i; \
|
||||
|
|
@ -98,14 +105,20 @@ typedef struct __kstring_t {
|
|||
if (ks->end == 0) break; \
|
||||
} else break; \
|
||||
} \
|
||||
if (delimiter) { \
|
||||
if (delimiter == KS_SEP_LINE) { \
|
||||
for (i = ks->begin; i < ks->end; ++i) \
|
||||
if (ks->buf[i] == '\n') break; \
|
||||
} else if (delimiter > KS_SEP_MAX) { \
|
||||
for (i = ks->begin; i < ks->end; ++i) \
|
||||
if (ks->buf[i] == delimiter) break; \
|
||||
} else { \
|
||||
} else if (delimiter == KS_SEP_SPACE) { \
|
||||
for (i = ks->begin; i < ks->end; ++i) \
|
||||
if (isspace(ks->buf[i])) break; \
|
||||
} \
|
||||
if (str->m - str->l < i - ks->begin + 1) { \
|
||||
} else if (delimiter == KS_SEP_TAB) { \
|
||||
for (i = ks->begin; i < ks->end; ++i) \
|
||||
if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
|
||||
} else i = 0; /* never come to here! */ \
|
||||
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
|
||||
str->m = str->l + (i - ks->begin) + 1; \
|
||||
kroundup32(str->m); \
|
||||
str->s = (char*)xrealloc(str->s, str->m); \
|
||||
|
|
@ -118,9 +131,15 @@ typedef struct __kstring_t {
|
|||
break; \
|
||||
} \
|
||||
} \
|
||||
if (str->s == 0) { \
|
||||
str->m = 1; \
|
||||
str->s = (char*)xcalloc(1, 1); \
|
||||
} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
|
||||
str->s[str->l] = '\0'; \
|
||||
return str->l; \
|
||||
}
|
||||
} \
|
||||
static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
|
||||
{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
|
||||
|
||||
#define KSTREAM_INIT(type_t, __read, __bufsize) \
|
||||
__KS_TYPE(type_t) \
|
||||
|
|
@ -128,19 +147,16 @@ typedef struct __kstring_t {
|
|||
__KS_GETC(__read, __bufsize) \
|
||||
__KS_GETUNTIL(__read, __bufsize)
|
||||
|
||||
#define __KSEQ_BASIC(type_t) \
|
||||
static inline kseq_t *kseq_init(type_t fd) \
|
||||
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
|
||||
|
||||
#define __KSEQ_BASIC(SCOPE, type_t) \
|
||||
SCOPE kseq_t *kseq_init(type_t fd) \
|
||||
{ \
|
||||
kseq_t *s = (kseq_t*)xcalloc(1, sizeof(kseq_t)); \
|
||||
s->f = ks_init(fd); \
|
||||
return s; \
|
||||
} \
|
||||
static inline void kseq_rewind(kseq_t *ks) \
|
||||
{ \
|
||||
ks->last_char = 0; \
|
||||
ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
|
||||
} \
|
||||
static inline void kseq_destroy(kseq_t *ks) \
|
||||
SCOPE void kseq_destroy(kseq_t *ks) \
|
||||
{ \
|
||||
if (!ks) return; \
|
||||
free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
|
||||
|
|
@ -153,44 +169,46 @@ typedef struct __kstring_t {
|
|||
-1 end-of-file
|
||||
-2 truncated quality string
|
||||
*/
|
||||
#define __KSEQ_READ \
|
||||
static int kseq_read(kseq_t *seq) \
|
||||
{ \
|
||||
int c; \
|
||||
kstream_t *ks = seq->f; \
|
||||
#define __KSEQ_READ(SCOPE) \
|
||||
SCOPE int kseq_read(kseq_t *seq) \
|
||||
{ \
|
||||
int c; \
|
||||
kstream_t *ks = seq->f; \
|
||||
if (seq->last_char == 0) { /* then jump to the next header line */ \
|
||||
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
|
||||
if (c == -1) return -1; /* end of file */ \
|
||||
seq->last_char = c; \
|
||||
} /* the first header char has been read */ \
|
||||
seq->comment.l = seq->seq.l = seq->qual.l = 0; \
|
||||
if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
|
||||
if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
|
||||
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
|
||||
if (c == -1) return -1; /* end of file */ \
|
||||
seq->last_char = c; \
|
||||
} /* else: the first header char has been read in the previous call */ \
|
||||
seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
|
||||
if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
|
||||
if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
|
||||
if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
|
||||
seq->seq.m = 256; \
|
||||
seq->seq.s = (char*)xmalloc(seq->seq.m); \
|
||||
} \
|
||||
while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
|
||||
if (isgraph(c)) { /* printable non-space character */ \
|
||||
if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
|
||||
seq->seq.m = seq->seq.l + 2; \
|
||||
kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
|
||||
seq->seq.s = (char*)xrealloc(seq->seq.s, seq->seq.m); \
|
||||
} \
|
||||
seq->seq.s[seq->seq.l++] = (char)c; \
|
||||
} \
|
||||
} \
|
||||
if (c == '\n') continue; /* skip empty lines */ \
|
||||
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
|
||||
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
|
||||
} \
|
||||
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
|
||||
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
|
||||
if (c != '+') return seq->seq.l; /* FASTA */ \
|
||||
if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
|
||||
seq->qual.m = seq->seq.m; \
|
||||
seq->qual.s = (char*)xrealloc(seq->qual.s, seq->qual.m); \
|
||||
} \
|
||||
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
|
||||
seq->seq.m = seq->seq.l + 2; \
|
||||
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
|
||||
seq->seq.s = (char*)xrealloc(seq->seq.s, seq->seq.m); \
|
||||
} \
|
||||
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
|
||||
if (c != '+') return seq->seq.l; /* FASTA */ \
|
||||
if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
|
||||
seq->qual.m = seq->seq.m; \
|
||||
seq->qual.s = (char*)xrealloc(seq->qual.s, seq->qual.m); \
|
||||
} \
|
||||
while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
|
||||
if (c == -1) return -2; /* we should not stop here */ \
|
||||
while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
|
||||
if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
|
||||
seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
|
||||
if (c == -1) return -2; /* error: no quality string */ \
|
||||
while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
|
||||
seq->last_char = 0; /* we have not come to the next header line */ \
|
||||
if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
|
||||
return seq->seq.l; \
|
||||
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
|
||||
return seq->seq.l; \
|
||||
}
|
||||
|
||||
#define __KSEQ_TYPE(type_t) \
|
||||
|
|
@ -200,10 +218,19 @@ typedef struct __kstring_t {
|
|||
kstream_t *f; \
|
||||
} kseq_t;
|
||||
|
||||
#define KSEQ_INIT(type_t, __read) \
|
||||
KSTREAM_INIT(type_t, __read, 4096) \
|
||||
#define KSEQ_INIT2(SCOPE, type_t, __read) \
|
||||
KSTREAM_INIT(type_t, __read, 16384) \
|
||||
__KSEQ_TYPE(type_t) \
|
||||
__KSEQ_BASIC(type_t) \
|
||||
__KSEQ_READ
|
||||
__KSEQ_BASIC(SCOPE, type_t) \
|
||||
__KSEQ_READ(SCOPE)
|
||||
|
||||
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
|
||||
|
||||
#define KSEQ_DECLARE(type_t) \
|
||||
__KS_TYPE(type_t) \
|
||||
__KSEQ_TYPE(type_t) \
|
||||
extern kseq_t *kseq_init(type_t fd); \
|
||||
void kseq_destroy(kseq_t *ks); \
|
||||
int kseq_read(kseq_t *seq);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
2
ksort.h
2
ksort.h
|
|
@ -140,7 +140,7 @@ typedef struct {
|
|||
tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
|
||||
} \
|
||||
} \
|
||||
inline void __ks_insertsort_##name(type_t *s, type_t *t) \
|
||||
static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
|
||||
{ \
|
||||
type_t *i, *j, swap_tmp; \
|
||||
for (i = s + 1; i < t; ++i) \
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ int ksprintf(kstring_t *s, const char *fmt, ...)
|
|||
int main()
|
||||
{
|
||||
kstring_t *s;
|
||||
s = (kstring_t*)calloc(1, sizeof(kstring_t));
|
||||
s = (kstring_t*)xcalloc(1, sizeof(kstring_t));
|
||||
ksprintf(s, "abcdefg: %d", 100);
|
||||
printf("%s\n", s->s);
|
||||
free(s);
|
||||
|
|
|
|||
54
kstring.h
54
kstring.h
|
|
@ -17,19 +17,33 @@ typedef struct __kstring_t {
|
|||
} kstring_t;
|
||||
#endif
|
||||
|
||||
static inline int kputs(const char *p, kstring_t *s)
|
||||
/* Ensure that s can hold at least `size` bytes. The capacity s->m is only
 * ever enlarged; when it grows it is passed through kroundup32() before the
 * buffer is reallocated with xrealloc(). The length s->l is not touched. */
static inline void ks_resize(kstring_t *s, size_t size)
{
	if (size <= s->m) return; /* already large enough */
	s->m = size;
	kroundup32(s->m);
	s->s = (char*)xrealloc(s->s, s->m);
}
|
||||
|
||||
static inline int kputsn(const char *p, int l, kstring_t *s)
|
||||
{
|
||||
int l = strlen(p);
|
||||
if (s->l + l + 1 >= s->m) {
|
||||
s->m = s->l + l + 2;
|
||||
kroundup32(s->m);
|
||||
s->s = (char*)xrealloc(s->s, s->m);
|
||||
}
|
||||
strcpy(s->s + s->l, p);
|
||||
memcpy(s->s + s->l, p, l);
|
||||
s->l += l;
|
||||
s->s[s->l] = 0;
|
||||
return l;
|
||||
}
|
||||
|
||||
/* Append the NUL-terminated string p to s by delegating to kputsn() with the
 * length measured by strlen(). Returns whatever kputsn() returns. */
static inline int kputs(const char *p, kstring_t *s)
{
	int len = strlen(p);
	return kputsn(p, len, s);
}
|
||||
|
||||
static inline int kputc(int c, kstring_t *s)
|
||||
{
|
||||
if (s->l + 1 >= s->m) {
|
||||
|
|
@ -42,6 +56,40 @@ static inline int kputc(int c, kstring_t *s)
|
|||
return c;
|
||||
}
|
||||
|
||||
/**
 * Append the decimal representation of a signed integer to a kstring.
 *
 * @param c  value to format
 * @param s  destination string; enlarged with xrealloc() when it is too small
 *
 * @return 0 once the digits have been appended; for c == 0 the return value
 *         of kputc('0', s) is passed through instead
 */
static inline int kputw(int c, kstring_t *s)
{
	char buf[16]; /* digits are collected least-significant first */
	int l, i;
	unsigned x;
	if (c == 0) return kputc('0', s);
	/* take the magnitude in unsigned arithmetic: evaluating -c as an int
	 * overflows (undefined behaviour) when c is INT_MIN */
	x = c < 0? 0u - (unsigned)c : (unsigned)c;
	for (l = 0; x > 0; x /= 10) buf[l++] = x%10 + '0';
	if (c < 0) buf[l++] = '-';
	if (s->l + l + 1 >= s->m) { /* room for the characters plus the terminating NUL */
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)xrealloc(s->s, s->m);
	}
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; /* copy out in reading order */
	s->s[s->l] = 0;
	return 0;
}
|
||||
|
||||
/* Append the decimal representation of an unsigned integer to s.
 * Returns 0, except for c == 0 where the result of kputc('0', s) is returned. */
static inline int kputuw(unsigned c, kstring_t *s)
{
	char digits[16]; /* filled least-significant digit first */
	int n = 0;
	unsigned rest = c;
	if (rest == 0) return kputc('0', s);
	while (rest > 0) {
		digits[n++] = rest%10 + '0';
		rest /= 10;
	}
	if (s->l + n + 1 >= s->m) { /* grow to fit the digits and the NUL */
		s->m = s->l + n + 2;
		kroundup32(s->m);
		s->s = (char*)xrealloc(s->s, s->m);
	}
	while (n > 0) s->s[s->l++] = digits[--n]; /* emit most-significant digit first */
	s->s[s->l] = 0;
	return 0;
}
|
||||
|
||||
int ksprintf(kstring_t *s, const char *fmt, ...);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
378
ksw.c
378
ksw.c
|
|
@ -23,7 +23,6 @@
|
|||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef _NO_SSE2
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <emmintrin.h>
|
||||
|
|
@ -38,22 +37,35 @@
|
|||
#define UNLIKELY(x) (x)
|
||||
#endif
|
||||
|
||||
struct _ksw_query_t {
|
||||
const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
|
||||
|
||||
struct _kswq_t {
|
||||
int qlen, slen;
|
||||
uint8_t shift, mdiff, max, size;
|
||||
__m128i *qp, *H0, *H1, *E, *Hmax;
|
||||
};
|
||||
|
||||
ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
|
||||
/**
|
||||
* Initialize the query data structure
|
||||
*
|
||||
* @param size Number of bytes used to store a score; valid values are 1 or 2
|
||||
* @param qlen Length of the query sequence
|
||||
* @param query Query sequence
|
||||
* @param m Size of the alphabet
|
||||
* @param mat Scoring matrix in a one-dimension array
|
||||
*
|
||||
* @return Query data structure
|
||||
*/
|
||||
kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
|
||||
{
|
||||
ksw_query_t *q;
|
||||
kswq_t *q;
|
||||
int slen, a, tmp, p;
|
||||
|
||||
size = size > 1? 2 : 1;
|
||||
p = 8 * (3 - size); // # values per __m128i
|
||||
slen = (qlen + p - 1) / p; // segmented length
|
||||
q = xmalloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
|
||||
q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory
|
||||
q = (kswq_t*)xmalloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
|
||||
q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
|
||||
q->H0 = q->qp + slen * m;
|
||||
q->H1 = q->H0 + slen;
|
||||
q->E = q->H1 + slen;
|
||||
|
|
@ -92,11 +104,12 @@ ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const in
|
|||
return q;
|
||||
}
|
||||
|
||||
int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
|
||||
kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
|
||||
{
|
||||
int slen, i, m_b, n_b, te = -1, gmax = 0;
|
||||
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
|
||||
uint64_t *b;
|
||||
__m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
|
||||
kswr_t r;
|
||||
|
||||
#define __max_16(ret, xx) do { \
|
||||
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
|
||||
|
|
@ -107,10 +120,13 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) /
|
|||
} while (0)
|
||||
|
||||
// initialization
|
||||
r = g_defr;
|
||||
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
|
||||
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
|
||||
m_b = n_b = 0; b = 0;
|
||||
zero = _mm_set1_epi32(0);
|
||||
gapoe = _mm_set1_epi8(a->gapo + a->gape);
|
||||
gape = _mm_set1_epi8(a->gape);
|
||||
gapoe = _mm_set1_epi8(_gapo + _gape);
|
||||
gape = _mm_set1_epi8(_gape);
|
||||
shift = _mm_set1_epi8(q->shift);
|
||||
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
|
||||
slen = q->slen;
|
||||
|
|
@ -166,11 +182,11 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) /
|
|||
end_loop16:
|
||||
//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
|
||||
__max_16(imax, max); // imax is the maximum number in max
|
||||
if (imax >= a->T) { // write the b array; this condition adds branching unfornately
|
||||
if (imax >= minsc) { // write the b array; this condition adds branching unfornately
|
||||
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
|
||||
if (n_b == m_b) {
|
||||
m_b = m_b? m_b<<1 : 8;
|
||||
b = xrealloc(b, 8 * m_b);
|
||||
b = (uint64_t*)xrealloc(b, 8 * m_b);
|
||||
}
|
||||
b[n_b++] = (uint64_t)imax<<32 | i;
|
||||
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
|
||||
|
|
@ -179,34 +195,38 @@ end_loop16:
|
|||
gmax = imax; te = i; // te is the end position on the target
|
||||
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
|
||||
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
|
||||
if (gmax + q->shift >= 255) break;
|
||||
if (gmax + q->shift >= 255 || gmax >= endsc) break;
|
||||
}
|
||||
S = H1; H1 = H0; H0 = S; // swap H0 and H1
|
||||
}
|
||||
a->score = gmax; a->te = te;
|
||||
{ // get a->qe, the end of query match; find the 2nd best score
|
||||
r.score = gmax + q->shift < 255? gmax : 255;
|
||||
r.te = te;
|
||||
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
|
||||
int max = -1, low, high, qlen = slen * 16;
|
||||
uint8_t *t = (uint8_t*)Hmax;
|
||||
for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
|
||||
if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen;
|
||||
for (i = 0; i < qlen; ++i, ++t)
|
||||
if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
|
||||
//printf("%d,%d\n", max, gmax);
|
||||
i = (a->score + q->max - 1) / q->max;
|
||||
low = te - i; high = te + i;
|
||||
for (i = 0, a->score2 = 0; i < n_b; ++i) {
|
||||
int e = (int32_t)b[i];
|
||||
if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
|
||||
a->score2 = b[i]>>32, a->te2 = e;
|
||||
if (b) {
|
||||
i = (r.score + q->max - 1) / q->max;
|
||||
low = te - i; high = te + i;
|
||||
for (i = 0; i < n_b; ++i) {
|
||||
int e = (int32_t)b[i];
|
||||
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
|
||||
r.score2 = b[i]>>32, r.te2 = e;
|
||||
}
|
||||
}
|
||||
}
|
||||
free(b);
|
||||
return a->score + q->shift >= 255? 255 : a->score;
|
||||
return r;
|
||||
}
|
||||
|
||||
int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
|
||||
kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
|
||||
{
|
||||
int slen, i, m_b, n_b, te = -1, gmax = 0;
|
||||
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
|
||||
uint64_t *b;
|
||||
__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
|
||||
kswr_t r;
|
||||
|
||||
#define __max_8(ret, xx) do { \
|
||||
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
|
||||
|
|
@ -216,10 +236,13 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) //
|
|||
} while (0)
|
||||
|
||||
// initialization
|
||||
r = g_defr;
|
||||
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
|
||||
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
|
||||
m_b = n_b = 0; b = 0;
|
||||
zero = _mm_set1_epi32(0);
|
||||
gapoe = _mm_set1_epi16(a->gapo + a->gape);
|
||||
gape = _mm_set1_epi16(a->gape);
|
||||
gapoe = _mm_set1_epi16(_gapo + _gape);
|
||||
gape = _mm_set1_epi16(_gape);
|
||||
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
|
||||
slen = q->slen;
|
||||
for (i = 0; i < slen; ++i) {
|
||||
|
|
@ -261,11 +284,11 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) //
|
|||
}
|
||||
end_loop8:
|
||||
__max_8(imax, max);
|
||||
if (imax >= a->T) {
|
||||
if (imax >= minsc) {
|
||||
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
|
||||
if (n_b == m_b) {
|
||||
m_b = m_b? m_b<<1 : 8;
|
||||
b = xrealloc(b, 8 * m_b);
|
||||
b = (uint64_t*)xrealloc(b, 8 * m_b);
|
||||
}
|
||||
b[n_b++] = (uint64_t)imax<<32 | i;
|
||||
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
|
||||
|
|
@ -274,31 +297,238 @@ end_loop8:
|
|||
gmax = imax; te = i;
|
||||
for (j = 0; LIKELY(j < slen); ++j)
|
||||
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
|
||||
if (gmax >= endsc) break;
|
||||
}
|
||||
S = H1; H1 = H0; H0 = S;
|
||||
}
|
||||
a->score = gmax; a->te = te;
|
||||
r.score = gmax; r.te = te;
|
||||
{
|
||||
int max = -1, low, high, qlen = slen * 8;
|
||||
uint16_t *t = (uint16_t*)Hmax;
|
||||
for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
|
||||
if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen;
|
||||
i = (a->score + q->max - 1) / q->max;
|
||||
low = te - i; high = te + i;
|
||||
for (i = 0, a->score2 = 0; i < n_b; ++i) {
|
||||
int e = (int32_t)b[i];
|
||||
if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
|
||||
a->score2 = b[i]>>32, a->te2 = e;
|
||||
for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
|
||||
if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
|
||||
if (b) {
|
||||
i = (r.score + q->max - 1) / q->max;
|
||||
low = te - i; high = te + i;
|
||||
for (i = 0; i < n_b; ++i) {
|
||||
int e = (int32_t)b[i];
|
||||
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
|
||||
r.score2 = b[i]>>32, r.te2 = e;
|
||||
}
|
||||
}
|
||||
}
|
||||
free(b);
|
||||
return a->score;
|
||||
return r;
|
||||
}
|
||||
|
||||
int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a)
|
||||
static void revseq(int l, uint8_t *s)
|
||||
{
|
||||
if (q->size == 1) return ksw_sse2_16(q, tlen, target, a);
|
||||
else return ksw_sse2_8(q, tlen, target, a);
|
||||
int i, t;
|
||||
for (i = 0; i < l>>1; ++i)
|
||||
t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
|
||||
}
|
||||
|
||||
/*
 * Align query against target and return the result record.
 *
 * The query profile is taken from *qry when qry is non-NULL and *qry is set;
 * otherwise it is built with ksw_qinit(), using 1-byte scores when KSW_XBYTE
 * is set in xtra and 2-byte scores otherwise. A freshly built profile is
 * stored in *qry when qry is non-NULL, and freed here when qry is NULL.
 *
 * When KSW_XSTART is set (and, if KSW_XSUBO is also set, the score reaches
 * xtra&0xffff) a second pass runs on the reversed prefixes query[0..qe] and
 * target[0..te] to find the start positions. Both buffers are reversed in
 * place for that pass and reversed back before returning, which is why they
 * are not const. r.tb and r.qb are only filled in when the second pass
 * reproduces the first pass's score.
 */
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
|
||||
{
|
||||
int size;
|
||||
kswq_t *q;
|
||||
kswr_t r, rr;
|
||||
kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int);
|
||||
|
||||
/* reuse the caller's profile if there is one, otherwise build it */
q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
|
||||
if (qry && *qry == 0) *qry = q;
|
||||
/* the kernel is chosen from the profile's score width, not from xtra */
func = q->size == 2? ksw_i16 : ksw_u8;
|
||||
size = q->size;
|
||||
r = func(q, tlen, target, gapo, gape, xtra);
|
||||
/* without a qry slot nobody else can own the profile, so release it now */
if (qry == 0) free(q);
|
||||
/* stop here unless start positions were requested and the score is high enough */
if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
|
||||
revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
|
||||
q = ksw_qinit(size, r.qe + 1, query, m, mat);
|
||||
/* KSW_XSTOP makes the kernel stop once the forward score has been reached */
rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score);
|
||||
/* undo the in-place reversal so the caller's buffers are unchanged */
revseq(r.qe + 1, query); revseq(r.te + 1, target);
|
||||
free(q);
|
||||
if (r.score == rr.score)
|
||||
r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
|
||||
return r;
|
||||
}
|
||||
|
||||
/********************
|
||||
*** SW extension ***
|
||||
********************/
|
||||
|
||||
/* One cell of the per-query-position score array used by ksw_extend() and
 * ksw_global(): h holds an H (best score) value and e an E (gap) value. */
typedef struct {
|
||||
int32_t h, e;
|
||||
} eh_t;
|
||||
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle)
|
||||
{
|
||||
eh_t *eh; // score array
|
||||
int8_t *qp; // query profile
|
||||
int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
|
||||
if (h0 < 0) h0 = 0;
|
||||
// allocate memory
|
||||
qp = xmalloc(qlen * m);
|
||||
eh = xcalloc(qlen + 1, 8);
|
||||
// generate the query profile
|
||||
for (k = i = 0; k < m; ++k) {
|
||||
const int8_t *p = &mat[k * m];
|
||||
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
|
||||
}
|
||||
// fill the first row
|
||||
eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
|
||||
for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
|
||||
eh[j].h = eh[j-1].h - gape;
|
||||
// adjust $w if it is too large
|
||||
k = m * m;
|
||||
for (i = 0, max = 0; i < k; ++i) // get the max score
|
||||
max = max > mat[i]? max : mat[i];
|
||||
max_gap = (int)((double)(qlen * max - gapo) / gape + 1.);
|
||||
max_gap = max_gap > 1? max_gap : 1;
|
||||
w = w < max_gap? w : max_gap;
|
||||
// DP loop
|
||||
max = h0, max_i = max_j = -1;
|
||||
beg = 0, end = qlen;
|
||||
for (i = 0; LIKELY(i < tlen); ++i) {
|
||||
int f = 0, h1, m = 0, mj = -1;
|
||||
int8_t *q = &qp[target[i] * qlen];
|
||||
// compute the first column
|
||||
h1 = h0 - (gapo + gape * (i + 1));
|
||||
if (h1 < 0) h1 = 0;
|
||||
// apply the band and the constraint (if provided)
|
||||
if (beg < i - w) beg = i - w;
|
||||
if (end > i + w + 1) end = i + w + 1;
|
||||
if (end > qlen) end = qlen;
|
||||
for (j = beg; LIKELY(j < end); ++j) {
|
||||
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
|
||||
// Similar to SSE2-SW, cells are computed in the following order:
|
||||
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
|
||||
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
|
||||
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
|
||||
eh_t *p = &eh[j];
|
||||
int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
|
||||
p->h = h1; // set H(i,j-1) for the next row
|
||||
h += q[j];
|
||||
h = h > e? h : e;
|
||||
h = h > f? h : f;
|
||||
h1 = h; // save H(i,j) to h1 for the next column
|
||||
mj = m > h? mj : j;
|
||||
m = m > h? m : h; // m is stored at eh[mj+1]
|
||||
h -= gapoe;
|
||||
h = h > 0? h : 0;
|
||||
e -= gape;
|
||||
e = e > h? e : h; // computed E(i+1,j)
|
||||
p->e = e; // save E(i+1,j) for the next row
|
||||
f -= gape;
|
||||
f = f > h? f : h; // computed F(i,j+1)
|
||||
}
|
||||
eh[end].h = h1; eh[end].e = 0;
|
||||
if (m == 0) break;
|
||||
if (m > max) max = m, max_i = i, max_j = mj;
|
||||
// update beg and end for the next round
|
||||
for (j = mj; j >= beg && eh[j].h; --j);
|
||||
beg = j + 1;
|
||||
for (j = mj + 2; j <= end && eh[j].h; ++j);
|
||||
end = j;
|
||||
//beg = 0; end = qlen; // uncomment this line for debugging
|
||||
}
|
||||
free(eh); free(qp);
|
||||
if (_qle) *_qle = max_j + 1;
|
||||
if (_tle) *_tle = max_i + 1;
|
||||
return max;
|
||||
}
|
||||
|
||||
/********************
|
||||
* Global alignment *
|
||||
********************/
|
||||
|
||||
#define MINUS_INF -0x40000000
|
||||
|
||||
/* Add `len` operations of type `op` to a packed CIGAR array (len<<4 | op).
 * If the last stored element already has the same op, its length is extended;
 * otherwise a new element is appended, doubling the capacity (starting at 4)
 * when the array is full. Returns the possibly reallocated array and updates
 * *n_cigar / *m_cigar accordingly. */
static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len)
{
	int n = *n_cigar;
	if (n != 0 && op == (cigar[n - 1]&0xf)) { /* same op as the tail: merge */
		cigar[n - 1] += len<<4;
		return cigar;
	}
	if (n == *m_cigar) { /* full: grow before appending */
		*m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
		cigar = xrealloc(cigar, (*m_cigar) << 2);
	}
	cigar[n] = len<<4 | op;
	*n_cigar = n + 1;
	return cigar;
}
|
||||
|
||||
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
|
||||
{
|
||||
eh_t *eh;
|
||||
int8_t *qp; // query profile
|
||||
int i, j, k, gapoe = gapo + gape, score, n_col;
|
||||
uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
|
||||
if (n_cigar_) *n_cigar_ = 0;
|
||||
// allocate memory
|
||||
n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
|
||||
z = xmalloc(n_col * tlen);
|
||||
qp = xmalloc(qlen * m);
|
||||
eh = xcalloc(qlen + 1, 8);
|
||||
// generate the query profile
|
||||
for (k = i = 0; k < m; ++k) {
|
||||
const int8_t *p = &mat[k * m];
|
||||
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
|
||||
}
|
||||
// fill the first row
|
||||
eh[0].h = 0; eh[0].e = MINUS_INF;
|
||||
for (j = 1; j <= qlen && j <= w; ++j)
|
||||
eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
|
||||
for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
|
||||
// DP loop
|
||||
for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
|
||||
int32_t f = MINUS_INF, h1, beg, end;
|
||||
int8_t *q = &qp[target[i] * qlen];
|
||||
uint8_t *zi = &z[i * n_col];
|
||||
beg = i > w? i - w : 0;
|
||||
end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
|
||||
h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF;
|
||||
for (j = beg; LIKELY(j < end); ++j) {
|
||||
// This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
|
||||
// 1) not checking h>0; 2) recording direction for backtracking
|
||||
eh_t *p = &eh[j];
|
||||
int32_t h = p->h, e = p->e;
|
||||
uint8_t d; // direction
|
||||
p->h = h1;
|
||||
h += q[j];
|
||||
d = h >= e? 0 : 1;
|
||||
h = h >= e? h : e;
|
||||
d = h >= f? d : 2;
|
||||
h = h >= f? h : f;
|
||||
h1 = h;
|
||||
h -= gapoe;
|
||||
e -= gape;
|
||||
d |= e > h? 1<<2 : 0;
|
||||
e = e > h? e : h;
|
||||
p->e = e;
|
||||
f -= gape;
|
||||
d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
|
||||
f = f > h? f : h;
|
||||
zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
|
||||
}
|
||||
eh[end].h = h1; eh[end].e = MINUS_INF;
|
||||
}
|
||||
score = eh[qlen].h;
|
||||
if (n_cigar_ && cigar_) { // backtrack
|
||||
int n_cigar = 0, m_cigar = 0, which = 0;
|
||||
uint32_t *cigar = 0, tmp;
|
||||
i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
|
||||
while (i >= 0 && k >= 0) {
|
||||
which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
|
||||
if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
|
||||
else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
|
||||
else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
|
||||
}
|
||||
if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
|
||||
if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
|
||||
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
|
||||
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
|
||||
*n_cigar_ = n_cigar, *cigar_ = cigar;
|
||||
}
|
||||
free(eh); free(qp); free(z);
|
||||
return score;
|
||||
}
|
||||
|
||||
/*******************************************
|
||||
|
|
@ -334,30 +564,33 @@ unsigned char seq_nt4_table[256] = {
|
|||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2;
|
||||
int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
|
||||
int8_t mat[25];
|
||||
ksw_aux_t a;
|
||||
int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
|
||||
uint8_t *rseq = 0;
|
||||
gzFile fpt, fpq;
|
||||
kseq_t *kst, *ksq;
|
||||
|
||||
// parse command line
|
||||
a.gapo = 5; a.gape = 2; a.T = 10;
|
||||
while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
|
||||
switch (c) {
|
||||
case 'a': sa = atoi(optarg); break;
|
||||
case 'b': sb = atoi(optarg); break;
|
||||
case 'q': a.gapo = atoi(optarg); break;
|
||||
case 'r': a.gape = atoi(optarg); break;
|
||||
case 't': a.T = atoi(optarg); break;
|
||||
case 'q': gapo = atoi(optarg); break;
|
||||
case 'r': gape = atoi(optarg); break;
|
||||
case 't': minsc = atoi(optarg); break;
|
||||
case 'f': forward_only = 1; break;
|
||||
case 's': size = atoi(optarg); break;
|
||||
case '1': xtra |= KSW_XBYTE; break;
|
||||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] <target.fa> <query.fa>\n", size, sa, sb, a.gapo, a.gape);
|
||||
fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
|
||||
return 1;
|
||||
}
|
||||
if (minsc > 0xffff) minsc = 0xffff;
|
||||
xtra |= KSW_XSUBO | minsc;
|
||||
// initialize scoring matrix
|
||||
for (i = k = 0; i < 5; ++i) {
|
||||
for (i = k = 0; i < 4; ++i) {
|
||||
for (j = 0; j < 4; ++j)
|
||||
mat[k++] = i == j? sa : -sb;
|
||||
mat[k++] = 0; // ambiguous base
|
||||
|
|
@ -368,35 +601,34 @@ int main(int argc, char *argv[])
|
|||
fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
|
||||
// all-pair alignment
|
||||
while (kseq_read(ksq) > 0) {
|
||||
ksw_query_t *q[2];
|
||||
for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
|
||||
q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
|
||||
kswq_t *q[2] = {0, 0};
|
||||
kswr_t r;
|
||||
for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
|
||||
if (!forward_only) { // reverse
|
||||
for (i = 0; i < ksq->seq.l/2; ++i) {
|
||||
int t = ksq->seq.s[i];
|
||||
ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i];
|
||||
ksq->seq.s[ksq->seq.l-1-i] = t;
|
||||
if ((int)ksq->seq.m > max_rseq) {
|
||||
max_rseq = ksq->seq.m;
|
||||
rseq = (uint8_t*)xrealloc(rseq, max_rseq);
|
||||
}
|
||||
for (i = 0; i < ksq->seq.l; ++i)
|
||||
ksq->seq.s[i] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
|
||||
q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
|
||||
} else q[1] = 0;
|
||||
for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
|
||||
rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
|
||||
}
|
||||
gzrewind(fpt); kseq_rewind(kst);
|
||||
while (kseq_read(kst) > 0) {
|
||||
int s;
|
||||
for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
|
||||
s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a);
|
||||
printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
|
||||
if (q[1]) {
|
||||
s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a);
|
||||
printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
|
||||
for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
|
||||
r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
|
||||
if (r.score >= minsc)
|
||||
err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
|
||||
if (rseq) {
|
||||
r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
|
||||
if (r.score >= minsc)
|
||||
err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
|
||||
}
|
||||
}
|
||||
free(q[0]); free(q[1]);
|
||||
}
|
||||
free(rseq);
|
||||
kseq_destroy(kst); err_gzclose(fpt);
|
||||
kseq_destroy(ksq); err_gzclose(fpq);
|
||||
return 0;
|
||||
}
|
||||
#endif // _KSW_MAIN
|
||||
#endif // _NO_SSE2
|
||||
#endif
|
||||
|
|
|
|||
84
ksw.h
84
ksw.h
|
|
@ -1,51 +1,69 @@
|
|||
#ifndef __AC_KSW_H
|
||||
#define __AC_KSW_H
|
||||
|
||||
struct _ksw_query_t;
|
||||
typedef struct _ksw_query_t ksw_query_t;
|
||||
#include <stdint.h>
|
||||
|
||||
#define KSW_XBYTE 0x10000
|
||||
#define KSW_XSTOP 0x20000
|
||||
#define KSW_XSUBO 0x40000
|
||||
#define KSW_XSTART 0x80000
|
||||
|
||||
struct _kswq_t;
|
||||
typedef struct _kswq_t kswq_t;
|
||||
|
||||
typedef struct {
|
||||
// input
|
||||
unsigned gapo, gape; // the first gap costs gapo+gape
|
||||
unsigned T; // threshold
|
||||
// output
|
||||
int score, te, qe, score2, te2;
|
||||
} ksw_aux_t;
|
||||
int score; // best score
|
||||
int te, qe; // target end and query end
|
||||
int score2, te2; // second best score and ending position on the target
|
||||
int tb, qb; // target start and query start
|
||||
} kswr_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Initialize the query data structure
|
||||
* Aligning two sequences
|
||||
*
|
||||
* @param size Number of bytes used to store a score; valid values are 1 or 2
|
||||
* @param qlen Length of the query sequence
|
||||
* @param query Query sequence
|
||||
* @param m Size of the alphabet
|
||||
* @param mat Scoring matrix in a one-dimension array
|
||||
* @param qlen length of the query sequence (typically <tlen)
|
||||
* @param query query sequence with 0 <= query[i] < m
|
||||
* @param tlen length of the target sequence
|
||||
* @param target target sequence
|
||||
* @param m number of residue types
|
||||
* @param mat m*m scoring matrix in a one-dimensional array
|
||||
* @param gapo gap open penalty; a gap of length l costs "-(gapo+l*gape)"
|
||||
* @param gape gap extension penalty
|
||||
* @param xtra extra information (see below)
|
||||
* @param qry query profile (see below)
|
||||
*
|
||||
* @return Query data structure
|
||||
* @return alignment information in a struct; unset values are set to -1
|
||||
*
|
||||
* When xtra==0, ksw_align() uses a signed two-byte integer to store a
|
||||
* score and only finds the best score and the end positions. The 2nd best
|
||||
* score or the start positions are not attempted. The default behavior can
|
||||
* be tuned by setting KSW_X* flags:
|
||||
*
|
||||
* KSW_XBYTE: use an unsigned byte to store a score. If overflow occurs,
|
||||
* kswr_t::score will be set to 255
|
||||
*
|
||||
* KSW_XSUBO: track the 2nd best score and the ending position on the
|
||||
* target if the 2nd best is higher than (xtra&0xffff)
|
||||
*
|
||||
* KSW_XSTOP: stop if the maximum score is above (xtra&0xffff)
|
||||
*
|
||||
* KSW_XSTART: find the start positions
|
||||
*
|
||||
* When *qry==NULL, ksw_align() will compute and allocate the query profile
|
||||
* and when the function returns, *qry will point to the profile, which can
|
||||
* be deallocated simply by free(). If one query is aligned against multiple
|
||||
* target sequences, *qry should be set to NULL during the first call and
|
||||
* freed after the last call. Note that qry can equal 0. In this case, the
|
||||
* query profile will be deallocated in ksw_align().
|
||||
*/
|
||||
ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat); // to free, simply call free()
|
||||
kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
|
||||
|
||||
/**
|
||||
* Compute the maximum local score for queries initialized with ksw_qinit(1, ...)
|
||||
*
|
||||
* @param q Query data structure returned by ksw_qinit(1, ...)
|
||||
* @param tlen Length of the target sequence
|
||||
* @param target Target sequence
|
||||
* @param a Auxiliary data structure (see ksw.h)
|
||||
*
|
||||
* @return The maximum local score; if the returned value equals 255, the SW may not be finished
|
||||
*/
|
||||
int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
|
||||
|
||||
/** Compute the maximum local score for queries initialized with ksw_qinit(2, ...) */
|
||||
int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
|
||||
|
||||
/** Unified interface for ksw_sse2_8() and ksw_sse2_16() */
|
||||
int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
|
||||
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle);
|
||||
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
12
kvec.h
12
kvec.h
|
|
@ -1,6 +1,6 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, by Attractive Chaos <attractivechaos@aol.co.uk>
|
||||
Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
|
|
@ -76,15 +76,15 @@ int main() {
|
|||
(v).a[(v).n++] = (x); \
|
||||
} while (0)
|
||||
|
||||
#define kv_pushp(type, v) (((v).n == (v).m)? \
|
||||
#define kv_pushp(type, v) ((((v).n == (v).m)? \
|
||||
((v).m = ((v).m? (v).m<<1 : 2), \
|
||||
(v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \
|
||||
: 0), ((v).a + ((v).n++))
|
||||
: 0), &(v).a[(v).n++])
|
||||
|
||||
#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \
|
||||
#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
|
||||
((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
|
||||
(v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \
|
||||
: (v).n <= (size_t)(i)? (v).n = (i) \
|
||||
: 0), (v).a[(i)]
|
||||
: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
|
||||
: 0), (v).a[(i)])
|
||||
|
||||
#endif
|
||||
|
|
|
|||
11
main.c
11
main.c
|
|
@ -4,7 +4,7 @@
|
|||
#include "utils.h"
|
||||
|
||||
#ifndef PACKAGE_VERSION
|
||||
#define PACKAGE_VERSION "0.6.2-r132"
|
||||
#define PACKAGE_VERSION "0.6.2-r301-beta"
|
||||
#endif
|
||||
|
||||
static int usage()
|
||||
|
|
@ -20,21 +20,20 @@ static int usage()
|
|||
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
||||
fprintf(stderr, " bwasw BWA-SW for long queries\n");
|
||||
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
||||
fprintf(stderr, " mem BWA-MEM algorithm\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
|
||||
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
|
||||
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
|
||||
fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n");
|
||||
fprintf(stderr, " stdsw standard SW/NW alignment\n");
|
||||
fprintf(stderr, "\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
void bwa_print_sam_PG()
|
||||
{
|
||||
printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION);
|
||||
err_printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
|
|
@ -50,15 +49,13 @@ int main(int argc, char *argv[])
|
|||
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
|
||||
else {
|
||||
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
|
||||
return 1;
|
||||
|
|
|
|||
4
main.h
4
main.h
|
|
@ -6,7 +6,6 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
int bwa_fa2pac(int argc, char *argv[]);
|
||||
int bwa_pac2cspac(int argc, char *argv[]);
|
||||
int bwa_pac2bwt(int argc, char *argv[]);
|
||||
int bwa_bwtupdate(int argc, char *argv[]);
|
||||
int bwa_bwt2sa(int argc, char *argv[]);
|
||||
|
|
@ -17,11 +16,10 @@ extern "C" {
|
|||
int bwa_sai2sam_se(int argc, char *argv[]);
|
||||
int bwa_sai2sam_pe(int argc, char *argv[]);
|
||||
|
||||
int bwa_stdsw(int argc, char *argv[]);
|
||||
|
||||
int bwa_bwtsw2(int argc, char *argv[]);
|
||||
|
||||
int main_fastmap(int argc, char *argv[]);
|
||||
int main_mem(int argc, char *argv[]);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
162
simple_dp.c
162
simple_dp.c
|
|
@ -1,162 +0,0 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include <stdint.h>
|
||||
#include "stdaln.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT(gzFile, err_gzread)
|
||||
|
||||
typedef struct {
|
||||
int l;
|
||||
unsigned char *s;
|
||||
char *n;
|
||||
} seq1_t;
|
||||
|
||||
typedef struct {
|
||||
int n_seqs, m_seqs;
|
||||
seq1_t *seqs;
|
||||
} seqs_t;
|
||||
|
||||
unsigned char aln_rev_table[256] = {
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
|
||||
'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
|
||||
'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N',
|
||||
'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
|
||||
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
|
||||
};
|
||||
|
||||
static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0;
|
||||
static AlnParam g_aln_param;
|
||||
|
||||
static void revseq(int len, uint8_t *seq)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < len>>1; ++i) {
|
||||
uint8_t tmp = aln_rev_table[seq[len-1-i]];
|
||||
seq[len-1-i] = aln_rev_table[seq[i]];
|
||||
seq[i] = tmp;
|
||||
}
|
||||
if (len&1) seq[i] = aln_rev_table[seq[i]];
|
||||
}
|
||||
|
||||
static seqs_t *load_seqs(const char *fn)
|
||||
{
|
||||
seqs_t *s;
|
||||
seq1_t *p;
|
||||
gzFile fp;
|
||||
int l;
|
||||
kseq_t *seq;
|
||||
|
||||
fp = xzopen(fn, "r");
|
||||
seq = kseq_init(fp);
|
||||
s = (seqs_t*)xcalloc(1, sizeof(seqs_t));
|
||||
s->m_seqs = 256;
|
||||
s->seqs = (seq1_t*)xcalloc(s->m_seqs, sizeof(seq1_t));
|
||||
while ((l = kseq_read(seq)) >= 0) {
|
||||
if (s->n_seqs == s->m_seqs) {
|
||||
s->m_seqs <<= 1;
|
||||
s->seqs = (seq1_t*)xrealloc(s->seqs, s->m_seqs * sizeof(seq1_t));
|
||||
}
|
||||
p = s->seqs + (s->n_seqs++);
|
||||
p->l = seq->seq.l;
|
||||
p->s = (unsigned char*)xmalloc(p->l + 1);
|
||||
memcpy(p->s, seq->seq.s, p->l);
|
||||
p->s[p->l] = 0;
|
||||
p->n = xstrdup((const char*)seq->name.s);
|
||||
}
|
||||
kseq_destroy(seq);
|
||||
err_gzclose(fp);
|
||||
fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
|
||||
return s;
|
||||
}
|
||||
|
||||
static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < ss->n_seqs; ++i) {
|
||||
AlnAln *aa;
|
||||
seq1_t *p = ss->seqs + i;
|
||||
g_aln_param.band_width = l + p->l;
|
||||
aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l);
|
||||
if (aa->score >= g_thres || g_is_global) {
|
||||
printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand,
|
||||
aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo);
|
||||
// NB: I put the short sequence as the first sequence in SW, an insertion to
|
||||
// the reference becomes a deletion from the short sequence. Therefore, I use
|
||||
// "MDI" here rather than "MID", and print ->out2 first rather than ->out1.
|
||||
for (i = 0; i != aa->n_cigar; ++i)
|
||||
printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]);
|
||||
printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1);
|
||||
}
|
||||
aln_free_AlnAln(aa);
|
||||
}
|
||||
}
|
||||
|
||||
static void aln_seqs(const seqs_t *ss, const char *fn)
|
||||
{
|
||||
gzFile fp;
|
||||
kseq_t *seq;
|
||||
int l;
|
||||
|
||||
fp = xzopen(fn, "r");
|
||||
seq = kseq_init(fp);
|
||||
while ((l = kseq_read(seq)) >= 0) {
|
||||
if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+');
|
||||
if (g_strand&2) {
|
||||
revseq(l, (uint8_t*)seq->seq.s);
|
||||
aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-');
|
||||
}
|
||||
}
|
||||
kseq_destroy(seq);
|
||||
err_gzclose(fp);
|
||||
}
|
||||
|
||||
int bwa_stdsw(int argc, char *argv[])
|
||||
{
|
||||
int c;
|
||||
seqs_t *ss;
|
||||
|
||||
while ((c = getopt(argc, argv, "gT:frp")) >= 0) {
|
||||
switch (c) {
|
||||
case 'g': g_is_global = 1; break;
|
||||
case 'T': g_thres = atoi(optarg); break;
|
||||
case 'f': g_strand |= 1; break;
|
||||
case 'r': g_strand |= 2; break;
|
||||
case 'p': g_aa = 1; break;
|
||||
}
|
||||
}
|
||||
if (g_strand == 0) g_strand = 3;
|
||||
if (g_aa) g_strand = 1;
|
||||
if (optind + 1 >= argc) {
|
||||
fprintf(stderr, "\nUsage: bwa stdsw [options] <seq1.long.fa> <seq2.short.fa>\n\n");
|
||||
fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres);
|
||||
fprintf(stderr, " -p protein alignment (suppressing -r)\n");
|
||||
fprintf(stderr, " -f forward strand only\n");
|
||||
fprintf(stderr, " -r reverse strand only\n");
|
||||
fprintf(stderr, " -g global alignment\n\n");
|
||||
fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n");
|
||||
fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n");
|
||||
fprintf(stderr, " sequence.\n\n");
|
||||
return 1;
|
||||
}
|
||||
g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast;
|
||||
g_aln_param.gap_end = 0;
|
||||
ss = load_seqs(argv[optind]);
|
||||
aln_seqs(ss, argv[optind+1]);
|
||||
return 0;
|
||||
}
|
||||
111
solid2fastq.pl
111
solid2fastq.pl
|
|
@ -1,111 +0,0 @@
|
|||
#!/usr/bin/perl -w
|
||||
|
||||
# Author: lh3
|
||||
# Note: Ideally, this script should be written in C. It is a bit slow at present.
|
||||
# Also note that this script is different from the one contained in MAQ.
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Std;
|
||||
|
||||
my %opts;
|
||||
my $version = '0.1.4';
|
||||
my $usage = qq{
|
||||
Usage: solid2fastq.pl <in.title> <out.prefix>
|
||||
|
||||
Note: <in.title> is the string showed in the `# Title:' line of a
|
||||
".csfasta" read file. Then <in.title>F3.csfasta is read sequence
|
||||
file and <in.title>F3_QV.qual is the quality file. If
|
||||
<in.title>R3.csfasta is present, this script assumes reads are
|
||||
paired; otherwise reads will be regarded as single-end.
|
||||
|
||||
The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3
|
||||
tag and `2' for F3. Usually you may want to use short <out.prefix>
|
||||
to save diskspace. Long <out.prefix> also causes troubles to maq.
|
||||
|
||||
};
|
||||
|
||||
getopts('', \%opts);
|
||||
die($usage) if (@ARGV != 2);
|
||||
my ($title, $pre) = @ARGV;
|
||||
my (@fhr, @fhw);
|
||||
my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual');
|
||||
my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0;
|
||||
if ($is_paired) { # paired end
|
||||
for (0 .. 3) {
|
||||
my $fn = "$title$fn_suff[$_]";
|
||||
$fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
|
||||
open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
|
||||
}
|
||||
open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo
|
||||
open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die;
|
||||
open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
|
||||
my (@df, @dr);
|
||||
@df = &read1(1); @dr = &read1(2);
|
||||
while (@df && @dr) {
|
||||
if ($df[0] eq $dr[0]) { # mate pair
|
||||
print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1];
|
||||
@df = &read1(1); @dr = &read1(2);
|
||||
} else {
|
||||
if ($df[0] le $dr[0]) {
|
||||
print {$fhw[2]} $df[1];
|
||||
@df = &read1(1);
|
||||
} else {
|
||||
print {$fhw[2]} $dr[1];
|
||||
@dr = &read1(2);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (@df) {
|
||||
print {$fhw[2]} $df[1];
|
||||
while (@df = &read1(1, $fhr[0], $fhr[1])) {
|
||||
print {$fhw[2]} $df[1];
|
||||
}
|
||||
}
|
||||
if (@dr) {
|
||||
print {$fhw[2]} $dr[1];
|
||||
while (@dr = &read1(2, $fhr[2], $fhr[3])) {
|
||||
print {$fhw[2]} $dr[1];
|
||||
}
|
||||
}
|
||||
close($fhr[$_]) for (0 .. $#fhr);
|
||||
close($fhw[$_]) for (0 .. $#fhw);
|
||||
} else { # single end
|
||||
for (0 .. 1) {
|
||||
my $fn = "$title$fn_suff[$_]";
|
||||
$fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
|
||||
open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
|
||||
}
|
||||
open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
|
||||
my @df;
|
||||
while (@df = &read1(1, $fhr[0], $fhr[1])) {
|
||||
print {$fhw[2]} $df[1];
|
||||
}
|
||||
close($fhr[$_]) for (0 .. $#fhr);
|
||||
close($fhw[2]);
|
||||
}
|
||||
|
||||
# Read the next record from one sequence/quality file pair and convert it
# to a FASTQ entry.
#
# Argument: $i selects the pair of global handles in @fhr -- 1 uses
# $fhr[0]/$fhr[1], 2 uses $fhr[2]/$fhr[3] -- and is also appended to the
# read name as "/$i".
#
# Returns ($key, $fastq_text), where $key is the zero-padded
# "panel_x_y" string used by the caller to pair reads, or the empty list
# when the sequence file is exhausted.
sub read1 {
  my $i = shift(@_);
  my $j = ($i-1)<<1;
  my ($key, $seq);
  my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]);
  while (<$fhs>) {
	my $t = <$fhq>;
	if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) {
	  $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines
	  # report BOTH headers: the sequence-file line ($_) and the quality-file line ($t)
	  die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t);
	  my $name = "$pre:$1_$2_$3/$i";
	  # sequence line: drop the first two characters, then map colors 0123. to ACGTN
	  $_ = substr(<$fhs>, 2);
	  tr/0123./ACGTN/;
	  my $s = $_;
	  # quality line: -1 becomes 0, the first value is dropped, and every
	  # remaining number n becomes the character chr(n+33)
	  $_ = <$fhq>;
	  s/-1\b/0/eg;
	  s/^(\d+)\s*//;
	  s/(\d+)\s*/chr($1+33)/eg;
	  $seq = qq/\@$name\n$s+\n$_\n/;
	  last;
	}
  }
  return defined($seq)? ($key, $seq) : ();
}
|
||||
6
stdaln.c
6
stdaln.c
|
|
@ -543,13 +543,12 @@ int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2,
|
|||
int start, end, max_score;
|
||||
int thres, *suba, *ss;
|
||||
|
||||
int gap_open, gap_ext, b;
|
||||
int gap_open, gap_ext;
|
||||
int *score_matrix, N_MATRIX_ROW;
|
||||
|
||||
/* initialize some align-related parameters. just for compatibility */
|
||||
gap_open = ap->gap_open;
|
||||
gap_ext = ap->gap_ext;
|
||||
b = ap->band_width;
|
||||
score_matrix = ap->matrix;
|
||||
N_MATRIX_ROW = ap->row;
|
||||
thres = _thres > 0? _thres : -_thres;
|
||||
|
|
@ -863,7 +862,7 @@ uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
|
|||
int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
|
||||
path_t *path, int *path_len, int G0, uint8_t *_mem)
|
||||
{
|
||||
int q, r, qr, tmp_len;
|
||||
int q, r, qr;
|
||||
int32_t **s_array, *score_array;
|
||||
int is_overflow, of_base;
|
||||
uint32_t *eh;
|
||||
|
|
@ -890,7 +889,6 @@ int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2
|
|||
s_array[i] = (int32_t*)_p, _p += 4 * len1;
|
||||
/* initialization */
|
||||
aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array);
|
||||
tmp_len = len1 + 1;
|
||||
start = 1; end = 2;
|
||||
end_i = end_j = 0;
|
||||
score = 0;
|
||||
|
|
|
|||
90
utils.c
90
utils.c
|
|
@ -41,6 +41,18 @@
|
|||
#include <sys/time.h>
|
||||
#include "utils.h"
|
||||
|
||||
#include "ksort.h"
|
||||
#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
|
||||
KSORT_INIT(128, pair64_t, pair64_lt)
|
||||
KSORT_INIT(64, uint64_t, ks_lt_generic)
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT2(, gzFile, err_gzread)
|
||||
|
||||
/********************
|
||||
* System utilities *
|
||||
********************/
|
||||
|
||||
FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
|
||||
{
|
||||
FILE *fp = 0;
|
||||
|
|
@ -51,6 +63,7 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
|
|||
}
|
||||
return fp;
|
||||
}
|
||||
|
||||
FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
|
||||
{
|
||||
if (freopen(fn, mode, fp) == 0) {
|
||||
|
|
@ -58,6 +71,7 @@ FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE
|
|||
}
|
||||
return fp;
|
||||
}
|
||||
|
||||
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
|
||||
{
|
||||
gzFile fp;
|
||||
|
|
@ -109,12 +123,10 @@ void _err_fatal_simple_core(const char *func, const char *msg)
|
|||
|
||||
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
|
||||
{
|
||||
size_t ret = fwrite(ptr, size, nmemb, stream);
|
||||
if (ret != nmemb)
|
||||
{
|
||||
_err_fatal_simple("fwrite", strerror(errno));
|
||||
}
|
||||
return ret;
|
||||
size_t ret = fwrite(ptr, size, nmemb, stream);
|
||||
if (ret != nmemb)
|
||||
_err_fatal_simple("fwrite", strerror(errno));
|
||||
return ret;
|
||||
}
|
||||
|
||||
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
|
||||
|
|
@ -163,36 +175,26 @@ long err_ftell(FILE *stream)
|
|||
|
||||
int err_printf(const char *format, ...)
|
||||
{
|
||||
va_list arg;
|
||||
int done;
|
||||
|
||||
va_start(arg, format);
|
||||
done = vfprintf(stdout, format, arg);
|
||||
int saveErrno = errno;
|
||||
va_end(arg);
|
||||
|
||||
if (done < 0)
|
||||
{
|
||||
_err_fatal_simple("vfprintf(stdout)", strerror(saveErrno));
|
||||
}
|
||||
return done;
|
||||
va_list arg;
|
||||
int done;
|
||||
va_start(arg, format);
|
||||
done = vfprintf(stdout, format, arg);
|
||||
int saveErrno = errno;
|
||||
va_end(arg);
|
||||
if (done < 0) _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno));
|
||||
return done;
|
||||
}
|
||||
|
||||
int err_fprintf(FILE *stream, const char *format, ...)
|
||||
{
|
||||
va_list arg;
|
||||
int done;
|
||||
|
||||
va_start(arg, format);
|
||||
done = vfprintf(stream, format, arg);
|
||||
int saveErrno = errno;
|
||||
va_end(arg);
|
||||
|
||||
if (done < 0)
|
||||
{
|
||||
_err_fatal_simple("vfprintf", strerror(saveErrno));
|
||||
}
|
||||
return done;
|
||||
va_list arg;
|
||||
int done;
|
||||
va_start(arg, format);
|
||||
done = vfprintf(stream, format, arg);
|
||||
int saveErrno = errno;
|
||||
va_end(arg);
|
||||
if (done < 0) _err_fatal_simple("vfprintf", strerror(saveErrno));
|
||||
return done;
|
||||
}
|
||||
|
||||
int err_fputc(int c, FILE *stream)
|
||||
|
|
@ -220,10 +222,8 @@ int err_fputs(const char *s, FILE *stream)
|
|||
int err_fflush(FILE *stream)
|
||||
{
|
||||
int ret = fflush(stream);
|
||||
if (ret != 0)
|
||||
{
|
||||
_err_fatal_simple("fflush", strerror(errno));
|
||||
}
|
||||
if (ret != 0) _err_fatal_simple("fflush", strerror(errno));
|
||||
|
||||
#ifdef FSYNC_ON_FLUSH
|
||||
/* Calling fflush() ensures that all the data has made it to the
|
||||
kernel buffers, but this may not be sufficient for remote filesystems
|
||||
|
|
@ -234,15 +234,12 @@ int err_fflush(FILE *stream)
|
|||
{
|
||||
struct stat sbuf;
|
||||
if (0 != fstat(fileno(stream), &sbuf))
|
||||
{
|
||||
_err_fatal_simple("fstat", strerror(errno));
|
||||
}
|
||||
|
||||
if (S_ISREG(sbuf.st_mode))
|
||||
{
|
||||
if (0 != fsync(fileno(stream)))
|
||||
{
|
||||
_err_fatal_simple("fsync", strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
@ -251,12 +248,9 @@ int err_fflush(FILE *stream)
|
|||
|
||||
int err_fclose(FILE *stream)
|
||||
{
|
||||
int ret = fclose(stream);
|
||||
if (ret != 0)
|
||||
{
|
||||
_err_fatal_simple("fclose", strerror(errno));
|
||||
}
|
||||
return ret;
|
||||
int ret = fclose(stream);
|
||||
if (ret != 0) _err_fatal_simple("fclose", strerror(errno));
|
||||
return ret;
|
||||
}
|
||||
|
||||
int err_gzclose(gzFile file)
|
||||
|
|
@ -311,6 +305,10 @@ char *err_strdup(const char *s, const char *file, unsigned int line, const char
|
|||
return p;
|
||||
}
|
||||
|
||||
/*********
|
||||
* Timer *
|
||||
*********/
|
||||
|
||||
double cputime()
|
||||
{
|
||||
struct rusage r;
|
||||
|
|
|
|||
27
utils.h
27
utils.h
|
|
@ -28,6 +28,7 @@
|
|||
#ifndef LH3_UTILS_H
|
||||
#define LH3_UTILS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
|
||||
|
|
@ -38,10 +39,9 @@
|
|||
#define ATTRIBUTE(list)
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg)
|
||||
#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg)
|
||||
|
||||
#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
|
||||
#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp)
|
||||
#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode)
|
||||
|
|
@ -54,6 +54,13 @@
|
|||
#define xstrdup(s) err_strdup( (s), __FILE__, __LINE__, __func__)
|
||||
|
||||
|
||||
typedef struct {
|
||||
uint64_t x, y;
|
||||
} pair64_t;
|
||||
|
||||
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
|
||||
typedef struct { size_t n, m; pair64_t *a; } pair64_v;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
@ -92,8 +99,24 @@ extern "C" {
|
|||
double cputime();
|
||||
double realtime();
|
||||
|
||||
void ks_introsort_64 (size_t n, uint64_t *a);
|
||||
void ks_introsort_128(size_t n, pair64_t *a);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * Scramble a 64-bit key with a fixed sequence of shift/add/xor steps and
 * return the mixed value.  All arithmetic is unsigned 64-bit, so additions
 * wrap modulo 2^64; the same key always produces the same result.
 */
static inline uint64_t hash_64(uint64_t key)
{
	uint64_t h = key;

	h = h + ~(h << 32);
	h = h ^ (h >> 22);
	h = h + ~(h << 13);
	h = h ^ (h >> 8);
	h = h + (h << 3);
	h = h ^ (h >> 15);
	h = h + ~(h << 27);
	h = h ^ (h >> 31);
	return h;
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in New Issue