做了一些代码清理,目前结果应该是完全一致的

This commit is contained in:
zzh 2024-02-22 01:26:57 +08:00
parent 17618ee5f2
commit 4bd0fd4f91
15 changed files with 524 additions and 500 deletions

4
.vscode/launch.json vendored
View File

@ -18,8 +18,8 @@
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/reference/human_g1k_v37_decoy.fasta",
"~/fastq/tiny_n_r1.fq",
"~/fastq/tiny_n_r2.fq",
"~/fastq/diff_r1.fq",
"~/fastq/diff_r2.fq",
"-o",
"/dev/null"
],

20
bwa.c
View File

@ -317,14 +317,6 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
fprintf(stderr, "%s\n", kmer_idx_fn);
fmt->kmer_hash = fmt_restore_kmer_idx(kmer_idx_fn);
// restore kmer bit
// fmt->bit_kmer_len = 18;
// sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len);
// strcpy(kmer_bit_fn, hint);
// strcpy(kmer_bit_fn + l_hint, suffix);
// fprintf(stderr, "%s\n", kmer_bit_fn);
// fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len);
strcpy(sa_fn, hint);
sprintf(suffix, ".33.%d.sa", SA_INTV);
strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA)
@ -348,13 +340,9 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
}
idx = calloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
if (which & BWA_IDX_BWT) {
idx->fmt = bwa_idx_load_fmt(hint);
// 先和bwt共用sa
//idx->fmt->sa = idx->bwt->sa;
//idx->fmt->n_sa = idx->bwt->n_sa;
//idx->fmt->sa_intv = idx->bwt->sa_intv;
}
if (which & BWA_IDX_BWT) idx->fmt = bwa_idx_load_fmt(hint);
idx->bwt->kmer_hash = idx->fmt->kmer_hash;
if (which & BWA_IDX_BNS)
{
int i, c;
@ -404,7 +392,7 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
// generate idx->bwt
x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
x = SA_BYTES(idx->bwt->n_sa); idx->bwt->sa = (uint8_t*)(mem + k); k += x;
x = SA_BYTES(idx->bwt->n_sa); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
// generate idx->bns and idx->pac
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;

View File

@ -148,18 +148,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
int max_seed_len = 0;
int start_N_num = 0, start_flag = 1;
a->mem.n = 0;
// char *BASE = "ACGT";
// for (i = 0; i < len; ++i)
// if (seq[i] < 4)
// fprintf(stderr, "%c", BASE[seq[i]]);
// else
// fprintf(stderr, "N");
// fprintf(stderr, "\n");
// first pass: find all SMEMs
fprintf(fp1, "seq: %ld\n", dn++);
//fprintf(stderr, "seq: %ld\n", dn++);
//dn ++;
// fprintf(stderr, "seq: %ld\n", dn++);
// dn ++;
// goto third_seed;
while (x < len) {
@ -169,8 +162,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
int64_t tmp_time = realtime_msec();
#endif
#if USE_FMT
x = fmt_smem(fmt, len, seq, x, start_width, &a->mem1, a->tmpv);
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, &a->mem1, a->tmpv[0]);
#else
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
#endif
@ -178,15 +170,17 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_seed_1, tmp_time);
#endif
s1n += a->mem1.n;
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
s1l += slen;
max_seed_len = fmax(max_seed_len, slen);
if (slen >= opt->min_seed_len) {
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
kv_push(bwtintv_t, a->mem, *p);
}
}
@ -210,7 +204,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
int64_t tmp_time = realtime_msec();
#endif
#if USE_FMT
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, &a->mem1, a->tmpv[0]);
#else
bwt_smem1(bwt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
#endif
@ -218,13 +212,17 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_seed_2, tmp_time);
#endif
s2n += a->mem1.n;
for (i = 0; i < a->mem1.n; ++i) {
bwtintv_t *p = &a->mem1.a[i];
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
// fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
// fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
int slen = (uint32_t)p->info - (p->info >> 32);
s2l += slen;
if (slen >= opt->min_seed_len) {
g_num_smem2 += 1;
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
@ -233,7 +231,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
//if (max_seed_len == len - start_N_num)
// goto collect_intv_end;
//goto collect_intv_end;
//third_seed:
third_seed:
// third pass: LAST-like
if (opt->max_mem_intv > 0) {
x = 0;
@ -254,13 +252,15 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_seed_3, tmp_time);
#endif
//bwtintv_t *p = &m;
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
s3n += 1;
s3l += (uint32_t)m.info - (m.info >> 32);
// bwtintv_t *p = &m;
// fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
if (m.x[2] > 0) {
kv_push(bwtintv_t, a->mem, m);
bwtintv_t *p = &m;
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
//bwtintv_t *p = &m;
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
}
} else { // for now, we never come to this block which is slower
@ -272,7 +272,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
}
}
collect_intv_end:
//collect_intv_end:
// sort
ks_introsort(mem_intv, a->mem.n, a->mem.a);
}
@ -438,10 +438,10 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm
#endif
#if USE_FMT
s.rbeg = tmp.pos = fmt_sa(fmt, p->x[0] + k);
uint64_t tpos = bwt_sa(bwt, p->x[0] + k);
if (s.rbeg != tpos) {
fprintf(stderr, "diff: %ld, %ld %ld\n", p->x[0] + k, tmp.pos, tpos);
}
// uint64_t tpos = bwt_sa(bwt, p->x[0] + k);
// if (s.rbeg != tpos) {
// fprintf(stderr, "diff: %ld, %ld %ld\n", p->x[0] + k, tmp.pos, tpos);
// }
#else
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
#endif

224
bwt.c
View File

@ -53,20 +53,10 @@ void bwt_gen_cnt_table(bwt_t *bwt)
static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
{
bwtint_t x = k - (k > bwt->primary); // bwt中不包含$所以位置超过序列长度后要减掉1因为后边加上了互补链
// x = bwt_B0(bwt, x); // 获取x位置对应的字符
uint32_t t1 = (bwt->bwt[((x) >> 7 << 4) + sizeof(bwtint_t) + (((x) & 0x7f) >> 4)]);
uint32_t t2 = t1 >> ((~(x) & 0xf) << 1) & 3;
x = t2;
// bwtint_t cnt[4], cnt1;
// bwt_occ4(bwt, k, cnt);
// cnt1 = bwt_occ(bwt, k, x);
x = bwt->L2[x] + bwt_occ(bwt, k, x); // 获取x字符在k位置后缀索引
return k == bwt->primary? 0 : x;
bwtint_t x = k - (k > bwt->primary);
x = bwt_B0(bwt, x);
x = bwt->L2[x] + bwt_occ(bwt, k, x);
return k == bwt->primary ? 0 : x;
}
// 设置某一行的排序值sa的索引有效值从1开始0设置为-1, 小端模式)
@ -91,8 +81,55 @@ bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k)
return val;
}
// 获取kmer的fmt匹配信息
inline void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos)
{
bwtint_t x0, x1, x2;
int byte_idx = pos * 14;
uint8_t *arr = mem_addr + byte_idx;
x0 = *arr;
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x1 = *arr;
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x2 = *((uint32_t *)arr);
ok->x[0] = x0;
ok->x[1] = x1;
ok->x[2] = x2;
}
// 设置kmer第pos个碱基对应的fmt匹配信息
inline void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos)
{
int byte_idx = pos * 14;
uint8_t *arr = mem_addr + byte_idx;
arr[0] = (uint8_t)(ik.x[0] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
arr += 5;
arr[0] = (uint8_t)(ik.x[1] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
arr += 5;
*((uint32_t *)arr) = (uint32_t)ik.x[2];
}
// 获取kmer对应的fmt匹配信息, pos should be [0, 13]
inline void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos)
{
if (pos == 13)
kmer_getval_at(kmer_hash->ke14[qbit].intv_arr, ok, 0);
else if (pos == 12)
kmer_getval_at(kmer_hash->ke13[qbit >> 2].intv_arr, ok, 0);
else if (pos == 11)
kmer_getval_at(kmer_hash->ke12[qbit >> 4].intv_arr, ok, 0);
else if (pos == 10)
kmer_getval_at(kmer_hash->ke11[qbit >> 6].intv_arr, ok, 0);
else
kmer_getval_at(kmer_hash->ke10[qbit >> 8].intv_arr, ok, pos);
}
// bwt->bwt and bwt->occ must be precalculated
void bwt_cal_sa(bwt_t *bwt, int intv)
void bwt_cal_byte_sa(bwt_t *bwt, int intv)
{
bwtint_t isa, sa, i, block_size; // S(isa) = sa isa是后缀数组的索引sa表示位置
double tmp_time, elapsed_time;
@ -102,10 +139,10 @@ void bwt_cal_sa(bwt_t *bwt, int intv)
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
if (bwt->sa) free(bwt->sa);
if (bwt->byte_sa) free(bwt->byte_sa);
bwt->sa_intv = intv;
bwt->n_sa = (bwt->seq_len + intv) / intv;
bwt->sa = (uint8_t *)calloc(SA_BYTES(bwt->n_sa), 1); // 用33位表示位置
bwt->byte_sa = (uint8_t *)calloc(SA_BYTES(bwt->n_sa), 1); // 用33位表示位置
fprintf(stderr, "bytes: %ld, sa size: %ld\n", SA_BYTES(bwt->n_sa), bwt->n_sa);
// calculate SA value
isa = 0; sa = bwt->seq_len;
@ -118,19 +155,43 @@ void bwt_cal_sa(bwt_t *bwt, int intv)
fprintf(stderr, "%ld%% percent complished. %f s elapsed.\n", i / block_size, elapsed_time);
}
if (isa % intv == 0) {
bwt_set_sa(bwt->sa, isa / intv, sa); // 第一个位置是$,所以位置就是序列长度
bwt_set_sa(bwt->byte_sa, isa / intv, sa); // 第一个位置是$,所以位置就是序列长度
if (i % (block_size / 2) == 0)
{
fprintf(stderr, "%ld %ld\n", sa, bwt_get_sa(bwt->sa, isa / intv));
fprintf(stderr, "%ld %ld\n", sa, bwt_get_sa(bwt->byte_sa, isa / intv));
}
}
--sa; // 从后往前一个位置一个位置的找对应的后缀数组isa就是与sa对应的后缀数组排序后在sa数组中的相对位置
isa = bwt_invPsi(bwt, isa); // 下一个后缀数组的相对位置
}
if (isa % intv == 0)
bwt_set_sa(bwt->sa, isa / intv, sa);
// bwt_set_sa(bwt->sa, 0, (bwtint_t)-1); // 赋值成-1也没问题set_sa那里已经修正了
bwt_set_sa(bwt->sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
if (isa % intv == 0) bwt_set_sa(bwt->byte_sa, isa / intv, sa);
// bwt_set_sa(bwt->byte_sa, 0, (bwtint_t)-1); // 赋值成-1也没问题set_sa那里已经修正了
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
}
// bwt->bwt and bwt->occ must be precalculated
void bwt_cal_sa(bwt_t *bwt, int intv)
{
bwtint_t isa, sa, i; // S(isa) = sa
int intv_round = intv;
kv_roundup32(intv_round);
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
if (bwt->sa) free(bwt->sa);
bwt->sa_intv = intv;
bwt->n_sa = (bwt->seq_len + intv) / intv;
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
// calculate SA value
isa = 0; sa = bwt->seq_len;
for (i = 0; i < bwt->seq_len; ++i) {
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
--sa;
isa = bwt_invPsi(bwt, isa);
}
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
}
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
@ -141,17 +202,6 @@ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
k = bwt_invPsi(bwt, k);
}
return sa + bwt->sa[k / bwt->sa_intv];
/* without setting bwt->sa[0] = -1, the following line should be
changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
//#ifdef SHOW_PERF
// int64_t tmp_time = realtime_msec();
//#endif
sa += bwt_get_sa(bwt->sa, k / bwt->sa_intv);
//#ifdef SHOW_PERF
// tmp_time = realtime_msec() - tmp_time;
// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
//#endif
return sa;
}
static inline int __occ_aux(uint64_t y, int c)
@ -320,11 +370,6 @@ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
{
#ifdef SHOW_PERF
#if 0
int64_t tmp_time = realtime_msec();
#endif
#endif
bwtint_t tk[4], tl[4];
int i;
bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
@ -336,12 +381,34 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
#ifdef SHOW_PERF
#if 0
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bwt_extend, tmp_time);
#endif
#endif
}
// 创建正向的kmer
inline uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
{
uint32_t qbit = 0, i;
qlen = qlen < kmer_len ? qlen : kmer_len;
for (i = 0; i < qlen; ++i) {
if (q[i] > 3) break; // 要考虑碱基是N
qbit |= q[i] << ((kmer_len - 1 - i) << 1);
}
*base_consumed = i;
return qbit;
}
// 创建f反向的kmer
inline uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
{
uint32_t qbit = 0;
int i, j, end_pos;
end_pos = start_pos - kmer_len;
end_pos = end_pos < 0 ? -1 : end_pos;
for (i = start_pos, j = 0; i > end_pos; --i, ++j) {
if (q[i] > 3) break; // 要考虑碱基是N
qbit |= q[i] << ((kmer_len - 1 - j) << 1);
}
*base_consumed = start_pos - i;
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
}
static void bwt_reverse_intvs(bwtintv_v *p)
@ -372,24 +439,8 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
ik.info = x + 1;
// int flag = 0;
// int small_intv_qlen = 0;
// int small_intv_val = 0;
// int intv_19 = 0;
for (i = x + 1, curr->n = 0; i < len; ++i)
{ // forward search
//if (ik.x[2] <= 2)
//{
// if (!flag) {
// small_intv_qlen = i - x;
// small_intv_val = ik.x[2];
// flag = 1;
// }
//}
//if (i-x == 19)
//{
// intv_19 = ik.x[2];
//}
if (ik.x[2] < max_intv) { // an interval small enough
kv_push(bwtintv_t, *curr, ik);
break;
@ -407,11 +458,6 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
break; // always terminate extension at an ambiguous base; in this case, i<len always stands
}
}
//if (small_intv_qlen == 0)
// fprintf(fp1, "stop_len: %d; final_len: %d; intval: %ld\n", i-x, i-x, ik.x[2]);
//else
// fprintf(fp1, "stop_len: %d; final_len: %d; intval: %d; final_intval: %ld\n", small_intv_qlen, i - x, small_intv_val, ik.x[2]);
// fprintf(fp1, "19-intv: %d\n", intv_19);
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
@ -452,13 +498,21 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
{
int i, c;
int i = x + 1, c, kmer_len;
bwtintv_t ik = {0}, ok[4] = {0};
memset(mem, 0, sizeof(bwtintv_t));
if (q[x] > 3) return x + 1;
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
for (i = x + 1; i < len; ++i) { // forward search
uint32_t qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
bwt_kmer_get(&bwt->kmer_hash, &ik, qbit, kmer_len - 1);
ik.info = x + kmer_len;
i = (int)ik.info;
//bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
//i = x + 1;
for (; i < len; ++i) { // forward search
if (q[i] < 4) { // an A/C/G/T base
c = 3 - q[i]; // complement of q[i]
bwt_extend(bwt, &ik, ok, 0);
@ -496,22 +550,22 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt)
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->sa, sizeof(bwtint_t), SA_BYTES(bwt->n_sa) >> 3, fp);
err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
err_fflush(fp);
err_fclose(fp);
}
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
//const int bufsize = 0x1000000; // 16M block
const int bufsize = 0x4000000; //64M block
bwtint_t offset = 0;
while (size) {
int x = bufsize < size? bufsize : size;
if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break;
size -= x; offset += x;
}
return offset;
void bwt_dump_byte_sa(const char *fn, const bwt_t *bwt)
{
FILE *fp;
fp = xopen(fn, "wb");
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
err_fwrite(bwt->byte_sa, sizeof(bwtint_t), SA_BYTES(bwt->n_sa) >> 3, fp);
err_fflush(fp);
err_fclose(fp);
}
void bwt_restore_sa(const char *fn, bwt_t *bwt)
@ -530,10 +584,6 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt)
bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
//bwt->sa = (uint8_t*)malloc(SA_BYTES(bwt->n_sa));
//fprintf(stderr, "zzh-read-sa %ld\n", SA_BYTES(bwt->n_sa));
//fread_fix(fp, SA_BYTES(bwt->n_sa), bwt->sa);
bwt->sa = (bwtint_t *)calloc(bwt->n_sa, sizeof(bwtint_t));
bwt->sa[0] = -1;
fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
@ -565,6 +615,6 @@ bwt_t *bwt_restore_bwt(const char *fn)
void bwt_destroy(bwt_t *bwt)
{
if (bwt == 0) return;
//free(bwt->sa); free(bwt->bwt);
//free(bwt);
free(bwt->sa); free(bwt->bwt);
free(bwt);
}

37
bwt.h
View File

@ -45,6 +45,30 @@ typedef unsigned char ubyte_t;
typedef uint64_t bwtint_t;
#define HASH_KMER_LEN 14
// 用来保存kmer对应的fmt的位置信息
typedef struct
{
// 40+40+32 14个byte这样好处理
uint8_t intv_arr[14]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
} KmerEntry;
typedef struct
{
uint8_t intv_arr[140]; // 保存长度为10的kmer每个碱基对应的bwt匹配信息
} KmerEntryArr;
// 保存各个位置对应的bwt匹配信息
typedef struct
{
KmerEntryArr *ke10;
KmerEntry *ke11;
KmerEntry *ke12;
KmerEntry *ke13;
KmerEntry *ke14;
} KmerHash;
typedef struct {
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t L2[5]; // C(), cumulative count
@ -53,11 +77,13 @@ typedef struct {
uint32_t *bwt; // BWT
// occurance array, separated to two parts
uint32_t cnt_table[256];
// 保存kmer对应的fmt位置信息
KmerHash kmer_hash;
// suffix array
int sa_intv;
bwtint_t n_sa;
//uint8_t *sa;
bwtint_t *sa;
uint8_t *byte_sa;
} bwt_t;
typedef struct {
@ -99,6 +125,7 @@ extern "C" {
void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
void bwt_dump_sa(const char *fn, const bwt_t *bwt);
void bwt_dump_byte_sa(const char *fn, const bwt_t *bwt);
bwt_t *bwt_restore_bwt(const char *fn);
void bwt_restore_sa(const char *fn, bwt_t *bwt);
@ -108,6 +135,7 @@ extern "C" {
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
void bwt_cal_sa(bwt_t *bwt, int intv);
void bwt_cal_byte_sa(bwt_t *bwt, int intv);
void bwt_bwtupdate_core(bwt_t *bwt);
@ -118,6 +146,13 @@ extern "C" {
void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val);
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k);
void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos);
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos);
uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed);
uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed);
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
void bwt_gen_cnt_table(bwt_t *bwt);
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);

View File

@ -206,6 +206,27 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
return 0;
}
int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command
{
bwt_t *bwt;
int c, sa_intv = 32;
while ((c = getopt(argc, argv, "i:")) >= 0) {
switch (c) {
case 'i': sa_intv = atoi(optarg); break;
default: return 1;
}
}
if (optind + 2 > argc) {
fprintf(stderr, "Usage: bwa bwt2bytesa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
return 1;
}
bwt = bwt_restore_bwt(argv[optind]);
bwt_cal_byte_sa(bwt, sa_intv);
bwt_dump_byte_sa(argv[optind+1], bwt);
bwt_destroy(bwt);
return 0;
}
int bwa_index(int argc, char *argv[]) // the "index" command
{
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;

38
debug.sh 100755
View File

@ -0,0 +1,38 @@
thread=1
#n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq
#n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
#reference=~/data/reference/human_g1k_v37_decoy.fasta
#n_r1=~/fastq/sn_r1.fq
#n_r2=~/fastq/sn_r2.fq
#n_r1=~/fastq/ssn_r1.fq
#n_r2=~/fastq/ssn_r2.fq
#n_r1=~/fastq/tiny_n_r1.fq
#n_r2=~/fastq/tiny_n_r2.fq
n_r1=~/fastq/diff_r1.fq
n_r2=~/fastq/diff_r2.fq
#n_r1=~/fastq/d_r1.fq
#n_r2=~/fastq/d_r2.fq
reference=~/reference/human_g1k_v37_decoy.fasta
#out=./ssn.sam
out=./out.sam
#out=/dev/null
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
# /home/zzh/data/fastq/nm1.fq \
# /home/zzh/data/fastq/nm2.fq -o /dev/null
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_1.fq.gz \
# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \
# -o /dev/null
time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
$reference \
$n_r1 \
$n_r2 \
-o $out

View File

@ -59,7 +59,8 @@ int64_t time_ksw_extend2 = 0,
time_bns = 0;
int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0, num_sa = 0;
int64_t s1n = 0, s2n = 0, s3n = 0, s1l = 0, s2l = 0, s3l = 0;
int64_t g_num_smem2 = 0;
FILE *fp1;
#endif
@ -433,22 +434,20 @@ int main_mem(int argc, char *argv[])
#ifdef SHOW_PERF
fprintf(stderr, "\n");
fprintf(stderr, "time_bwt_smem1a: %f s\n", time_bwt_smem1a / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_1: %f s\n", time_seed_1 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_2: %f s\n", time_seed_2 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_seed_3: %f s\n", time_seed_3 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_fmt_smem_0: %f s\n", time_fmt_smem_0 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_extend: %f s\n", time_bwt_extend / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads);
fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bwt_sa_read: %f s\n", time_bwt_sa_read / 1000.0 / opt->n_threads);
fprintf(stderr, "time_bns: %f s\n", time_bns / 1000.0 / opt->n_threads);
fprintf(stderr, "all seed num: %ld\n", nall);
fprintf(stderr, "all seed num litter than 19: %ld\n", n19);
fprintf(stderr, "all seed num litter than 18: %ld\n", n18);
fprintf(stderr, "all seed num litter than 17: %ld\n", n17);
fprintf(stderr, "all seed num litter than 16: %ld\n", n16);
fprintf(stderr, "s1 num: %ld\n", s1n);
fprintf(stderr, "s2 num: %ld\n", s2n);
fprintf(stderr, "s3 num: %ld\n", s3n);
fprintf(stderr, "s1 len: %ld\n", s1l / s1n);
fprintf(stderr, "s2 len: %ld\n", s2l / s2n);
fprintf(stderr, "s3 len: %ld\n", s3l / s3n);
fprintf(stderr, "get sa num: %ld\n", num_sa);
fprintf(stderr, "seed 2 num: %ld\n", g_num_smem2);
fprintf(stderr, "\n");
fclose(fp1);

433
fmt_idx.c
View File

@ -18,19 +18,18 @@ Date : 2023/12/24
const static char BASE[4] = {'A', 'C', 'G', 'T'};
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
const int bufsize = 0x4000000; // 16 * 4M block
bwtint_t offset = 0;
while (size)
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(char **seq_arr, int kmer_len)
{
uint32_t seq_up_val = (1 << (kmer_len << 1));
for (uint32_t i = 0; i < seq_up_val; ++i)
{
int x = bufsize < size ? bufsize : size;
if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0)
break;
size -= x;
offset += x;
seq_arr[i] = (char *)malloc(kmer_len);
for (int j = kmer_len - 1; j >= 0; --j)
{
seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
}
}
return offset;
}
// 生成occ每个字节对应一个pattern
@ -155,23 +154,12 @@ KmerHash fmt_restore_kmer_idx(const char *fn)
return khash;
}
uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len)
{
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
FILE *fp;
fp = xopen(fn, "rb");
fread_fix(fp, (1L << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit);
err_fclose(fp);
return kbit;
}
// 读取sa数据
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
{
char skipped[256];
FILE *fp;
bwtint_t primary;
fp = xopen(fn, "rb");
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
xassert(primary == fmt->primary, "SA-BWT inconsistency: primary is not the same.");
@ -179,10 +167,8 @@ void fmt_restore_sa(const char *fn, FMTIndex *fmt)
err_fread_noeof(&fmt->sa_intv, sizeof(bwtint_t), 1, fp);
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
xassert(primary == fmt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
fmt->n_sa = (fmt->seq_len + fmt->sa_intv) / fmt->sa_intv;
fmt->sa = (uint8_t *)malloc(SA_BYTES(fmt->n_sa));
fread_fix(fp, SA_BYTES(fmt->n_sa), fmt->sa);
err_fclose(fp);
}
@ -224,7 +210,6 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
#endif
buf = (uint32_t *)calloc(fmt->bwt_size, 4); // 开辟计算fmt用到的缓存
c[0] = c[1] = c[2] = c[3] = 0;
// 首行的c2应该是对应的ACGT对应的行减去1的occ
for (i = 0; i < 4; ++i)
@ -382,7 +367,6 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
cnt[3] = q[b2]; // b2的occ
p += 20;
#ifdef FMT_MID_INTERVAL // 加入了middle checkpoint
// 使用mid interval信息
int mk = k % FMT_OCC_INTERVAL;
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
@ -401,35 +385,12 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
x = 0;
p += 4;
}
#if FMT_MID_INTERVAL == 16 // middle check point interval等于16时候只需要判断一下是不是要计算两个uint32表示的碱基序列
if ((mk & FMT_MID_INTV_MASK) >> 3)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
++p;
}
#elif FMT_MID_INTERVAL > 16 // 该地址是bwt和pre_bwt字符串数据的首地址
uint32_t *end = p + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3));
for (; p < end; ++p)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
}
#endif
#else // 没有加入middle check point interval
#if FMT_OCC_INTERVAL > 16
uint32_t *end = p + ((k >> 3) - ((k & ~FMT_OCC_INTV_MASK) >> 3));
// p = end - (end - p) / 4;
for (; p < end; ++p)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
}
#else // FMT_OCC_INTERVAL等于16的时候只需要判断一次就可以
if ((k & FMT_OCC_INTV_MASK) >> 3)
{
x += __fmt_occ_e2_aux2(fmt, ti, *p);
++p;
}
#endif
#endif
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
x += __fmt_occ_e2_aux2(fmt, ti, tmp);
@ -456,11 +417,6 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
// 扩展两个碱基
inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2)
{
#ifdef SHOW_PERF
#if 0
int64_t tmp_time = realtime_msec();
#endif
#endif
bwtint_t tk[4], tl[4];
bwtintv_t intv = {0};
// tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
@ -476,22 +432,11 @@ inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwti
intv.x[!is_back] = fmt->L2[b2] + 1 + tk[3];
intv.x[2] = tl[3] - tk[3];
*ok2 = intv;
#ifdef SHOW_PERF
#if 0
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bwt_extend, tmp_time);
#endif
#endif
}
// 扩展一个碱基
inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1)
{
#ifdef SHOW_PERF
#if 0
int64_t tmp_time = realtime_msec();
#endif
#endif
bwtint_t tk[4], tl[4];
int b2 = 3; // 如果只扩展一次那么第二个碱基设置成T可以减小一些计算量如计算大于b2的累积数量
// tk表示在k行之前所有各个碱基累积出现次数tl表示在l行之前的累积
@ -502,90 +447,70 @@ inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int i
ok->x[2] = tl[1] - tk[1];
// 第一次正向扩展
ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
#ifdef SHOW_PERF
#if 0
tmp_time = realtime_msec() - tmp_time;
__sync_fetch_and_add(&time_bwt_extend, tmp_time);
#endif
#endif
}
// 获取kmer的fmt匹配信息
inline void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos)
// 序列和参考基因直接对比
static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt)
{
bwtint_t x0, x1, x2;
int byte_idx = pos * 14;
uint8_t *arr = mem_addr + byte_idx;
x0 = *arr;
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x1 = *arr;
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
arr += 5;
x2 = *((uint32_t *)arr);
ok->x[0] = x0;
ok->x[1] = x1;
ok->x[2] = x2;
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
#define EXTEND_BASE_LOOP(qcond, rcond, qstep, rstep) \
while (k != qcond && r != rcond) \
{ \
const int base = PAC_BASE(fmt->pac, r); \
if (q[k] != base) break; \
k += qstep; \
r += rstep; \
}
#define EXTEND_BASE_LOOP_COMP(qcond, rcond, qstep, rstep) \
while (k != qcond && r != rcond) \
{ \
const int base = 3 - PAC_BASE(fmt->pac, r); \
if (q[k] != base) break; \
k += qstep; \
r += rstep; \
}
int k;
int64_t r, rp;
mt->num_match = 1;
rp = fmt_sa(fmt, mtx_line);
r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
k = right_pos;
if (rp < fmt->l_pac) // 匹配到了正向链
{
// 向前继续扩展
r += right_pos - left_pos;
EXTEND_BASE_LOOP(len, fmt->l_pac, 1, 1);
mt->rm[0].qe = k;
mt->rm[0].reverse = 0;
// 向后扩展x位置之前的碱基
r -= k - left_pos + 1;
k = left_pos - 1;
EXTEND_BASE_LOOP(-1, -1, -1, -1);
mt->rm[0].qs = k + 1;
mt->rm[0].rs = r + 1;
}
else // 匹配到了互补链
{
r -= right_pos - left_pos;
EXTEND_BASE_LOOP_COMP(len, -1, 1, -1);
mt->rm[0].qe = k;
mt->rm[0].reverse = 1;
// 扩展x之前的碱基
r += k - left_pos + 1;
k = left_pos - 1;
EXTEND_BASE_LOOP_COMP(-1, fmt->l_pac, -1, 1);
mt->rm[0].qs = k + 1;
mt->rm[0].rs = r - 1;
}
mt->info = mt->rm[0].qs;
mt->info = mt->info << 32 | mt->rm[0].qe;
mt->x[2] = 1;
}
// 设置kmer第pos个碱基对应的fmt匹配信息
inline void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos)
static inline void fmt_reverse_intvs(bwtintv_v *p)
{
int byte_idx = pos * 14;
uint8_t *arr = mem_addr + byte_idx;
arr[0] = (uint8_t)(ik.x[0] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
arr += 5;
arr[0] = (uint8_t)(ik.x[1] >> 32);
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
arr += 5;
*((uint32_t *)arr) = (uint32_t)ik.x[2];
}
// 获取kmer对应的fmt匹配信息, pos should be [0, 13]
inline void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos)
{
if (pos == 13)
{
kmer_getval_at(fmt->kmer_hash.ke14[qbit].intv_arr, ok, 0);
}
else if (pos == 12)
{
kmer_getval_at(fmt->kmer_hash.ke13[qbit >> 2].intv_arr, ok, 0);
}
else if (pos == 11)
{
kmer_getval_at(fmt->kmer_hash.ke12[qbit >> 4].intv_arr, ok, 0);
}
else if (pos == 10)
{
kmer_getval_at(fmt->kmer_hash.ke11[qbit >> 6].intv_arr, ok, 0);
}
else
{
kmer_getval_at(fmt->kmer_hash.ke10[qbit >> 8].intv_arr, ok, pos);
}
}
// 生成所有KMER_LEN长度的序列字符串表示
void gen_all_seq(char **seq_arr, int kmer_len)
{
uint32_t seq_up_val = (1 << (kmer_len << 1));
for (uint32_t i = 0; i < seq_up_val; ++i)
{
seq_arr[i] = (char *)malloc(kmer_len);
for (int j = kmer_len - 1; j >= 0; --j)
{
seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
}
}
}
static void fmt_reverse_intvs(bwtintv_v *p)
{
if (p->n > 1)
{
if (p->n > 1) {
int j;
for (j = 0; j < p->n >> 1; ++j)
{
@ -596,58 +521,23 @@ static void fmt_reverse_intvs(bwtintv_v *p)
}
}
// 创建正向的kmer
inline static uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
{
uint64_t qbit = 0;
int i;
qlen = qlen < kmer_len ? qlen : kmer_len;
for (i = 0; i < qlen; ++i)
{
if (q[i] > 3) // 要考虑碱基是N
break;
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
}
*base_consumed = i;
return qbit;
}
// 创建f反向的kmer
inline static uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
{
uint64_t qbit = 0;
int i, j, end_pos;
end_pos = start_pos - kmer_len;
end_pos = end_pos < 0 ? -1 : end_pos;
for (i = start_pos, j = 0; i > end_pos; --i, ++j)
{
if (q[i] > 3) // 要考虑碱基是N
break;
qbit |= ((uint64_t)q[i]) << ((kmer_len - 1 - j) << 1);
}
*base_consumed = start_pos - i;
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
}
// 找smemseed
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec)
{
int i, j, ret, kmer_len;
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
bwtintv_t mt = {0};
bwtintv_v a[1], *curr;
bwtintv_v *curr;
uint32_t qbit = 0;
mem->n = 0;
int only_forward = x == 0 || q[x - 1] > 3;
if (q[x] > 3) return x + 1;
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
kv_init(a[0]);
curr = tmpvec && tmpvec[0] ? tmpvec[0] : &a[0]; // use the temporary vector if provided
curr = tmpvec; // use the temporary vector if provided
qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
fmt_kmer_get(fmt, &ik, qbit, 0); // 初始碱基位置
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置
ik.info = x + 1;
// check change of the interval size and whether the interval size is too small to be extended further
@ -657,125 +547,37 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
do { kv_push(bwtintv_t, *curr, iv); goto backward_search; } while(0)
// 处理kmer对应的匹配信息
if (only_forward)
j = kmer_len - 1;
else
j = 1;
if (only_forward) j = kmer_len - 1;
else j = 1;
for (curr->n = 0; j < kmer_len; ++j)
{
fmt_kmer_get(fmt, &ok1, qbit, j);
bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j);
CHECK_INTV_CHANGE(ik, ok1, x + j + 1);
}
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
PUSH_VAL_AND_SKIP(ik);
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
// 扩展kmer之后的碱基
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
for (i = (int)ik.info; i + 1 < len; i += 2)
{ // forward search
if (q[i] < 4 && q[i + 1] < 4)
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_CHANGE(ik, ok1, i + 1);
CHECK_INTV_CHANGE(ik, ok2, i + 2);
// 在这里进行判断是否只有一个候选了
//if (0)
if (min_intv < 2 && ok2.x[2] == min_intv)
{ // 最多处理两个
mt.num_match = min_intv;
int64_t r, rp;
int k;
int max_qs = 0, min_qe = len;
for (j = 0; j < ok2.x[2]; ++j)
{
rp = fmt_sa(fmt, ok2.x[0] + j);
r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
k = i + 2;
if (rp < fmt->l_pac) // 匹配到了正向链
{
// 向前继续扩展
r += i + 2 - x;
while (k < len && r < fmt->l_pac) {
int base = PAC_BASE(fmt->pac, r);
if (q[k] != base)
break;
++k;
++r;
}
mt.rm[j].qe = k;
mt.rm[j].reverse = 0;
// 向后扩展x位置之前的碱基
r -= k - x + 1;
k = x - 1;
while (k > -1 && r > -1) {
int base = PAC_BASE(fmt->pac, r);
if (q[k] != base)
break;
--k;
--r;
}
mt.rm[j].qs = k + 1;
mt.rm[j].rs = r + 1;
}
else // 匹配到了互补链
{
r -= i + 2 - x;
while (k < len && r > -1)
{
const int base = 3 - PAC_BASE(fmt->pac, r);
if (q[k] != base)
break;
++k;
--r;
}
mt.rm[j].qe = k;
mt.rm[j].reverse = 1;
// 扩展x之前的碱基
r += k - x + 1;
k = x - 1;
while (k > -1 && r < fmt->l_pac)
{
const int base = 3 - PAC_BASE(fmt->pac, r);
if (q[k] != base)
break;
--k;
++r;
}
mt.rm[j].qs = k + 1;
mt.rm[j].rs = r - 1;
}
max_qs = max_qs < mt.rm[j].qs ? mt.rm[j].qs : max_qs;
min_qe = min_qe > mt.rm[j].qe ? mt.rm[j].qe : min_qe;
}
if (min_intv > 1)
{
// 修正两个匹配的位置,使比对的碱基个数和位置一致
for (j = 0; j < min_intv; ++j)
{
if (mt.rm[j].reverse)
mt.rm[j].rs -= max_qs - mt.rm[j].qs;
else
mt.rm[j].rs += max_qs - mt.rm[j].qs;
mt.rm[j].qs = max_qs;
mt.rm[j].qe = min_qe;
}
}
mt.info = mt.rm[0].qs;
mt.info = mt.info << 32 | mt.rm[0].qe;
mt.x[2] = min_intv;
kv_push(bwtintv_t, *mem, mt);
ret = (uint32_t)mt.info;
if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
{
goto fmt_smem_end;
}
goto backward_search;
}
//if (min_intv == 1 && ok2.x[2] == min_intv)
//{
// direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
// kv_push(bwtintv_t, *mem, mt);
// ret = (uint32_t)mt.info;
// if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end;
// goto backward_search;
//}
} else if (q[i] < 4) // q[i+1] >= 4
{
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
@ -802,26 +604,30 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
backward_search:
fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
if (mt.num_match == 0)
ret = curr->a[0].info; // this will be the returned value扩展到的最远的位置
if (mt.num_match == 0) ret = curr->a[0].info; // this will be the returned value扩展到的最远的位置
// 按照种子进行遍历,反向扩展
#define CHECK_ADD_MEM(pos, intv, mem) \
if (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32) { (intv).info |= (uint64_t)(pos) << 32; kv_push(bwtintv_t, *mem, (intv)); }
if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) { \
(intv).info |= (uint64_t)(pos) << 32; kv_push(bwtintv_t, *mem, (intv)); \
}
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
if (ok.x[2] < min_intv) { CHECK_ADD_MEM(pos, intv, mem); break; }
int last_kmer_start = 0;
for (j = 0; j < curr->n; ++j)
{
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
__builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
uint64_t qbit = 0;
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
if (!only_forward && p->info - x < HASH_KMER_LEN) {
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer
else qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
last_kmer_start = p->info - 1;
i = 1;
do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv);
do { bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv);
if (i > 2) continue;
p->x[0] = ik.x[1]; p->x[1] = ik.x[0]; p->x[2] = ik.x[2];
i = p->info - (kmer_len - i + 3);
@ -833,8 +639,8 @@ backward_search:
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
{
fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
ok1.info = p->info;
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
@ -873,8 +679,6 @@ backward_search:
fmt_smem_end:
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
if (tmpvec == 0 || tmpvec[0] == 0)
free(a[0].a);
return ret;
}
@ -883,18 +687,18 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
int i, kmer_len;
bwtintv_t ik = {0}, ok1={0}, ok2={0};
uint64_t qbit;
memset(mem, 0, sizeof(bwtintv_t));
if (q[x] > 3)
return x + 1;
if (q[x] > 3) return x + 1;
qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
fmt_kmer_get(fmt, &ik, qbit, kmer_len-1); // 初始碱基位置
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len-1); // 初始碱基位置
ik.info = x + kmer_len;
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
//fmt_set_intv(fmt, q[x], ik);
//ik.info = x + 1;
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
@ -909,8 +713,8 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
if (q[i] < 4 && q[i + 1] < 4)
{
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
ik = ok2;
@ -938,7 +742,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_
{
uint32_t *p;
uint8_t base2;
// 第一步找到check point位置
p = fmt_occ_intv(fmt, k); // check point起始位置
p += 20; // bwt碱基起始位置
@ -949,7 +752,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_
// 第三步找到具体的uint32_t
p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基和8个倒数第二bwt碱基
// 第四步,获取碱基
base2 = *p >> ((~(k) & 0x7) << 2) & 0xf;
*b2 = base2 >> 2 & 3;
*b1 = base2 & 3;
@ -982,13 +784,6 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
++sa;
k = k2;
}
//#ifdef SHOW_PERF
// int64_t tmp_time = realtime_msec();
//#endif
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
//#ifdef SHOW_PERF
// tmp_time = realtime_msec() - tmp_time;
// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
//#endif
return sa;
}

View File

@ -49,33 +49,7 @@ Date : 2023/12/24
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
// sa存储的行间隔
#define SA_INTV 2
#define HASH_KMER_LEN 14
#define BIT_KMER_LEN 17
// 用来保存kmer对应的fmt的位置信息
typedef struct
{
// 40+40+32 14个byte这样好处理
uint8_t intv_arr[14]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
} KmerEntry;
typedef struct
{
uint8_t intv_arr[140]; // 保存长度为10的kmer每个碱基对应的bwt匹配信息
} KmerEntryArr;
// 保存各个位置对应的bwt匹配信息
typedef struct
{
KmerEntryArr *ke10;
KmerEntry *ke11;
KmerEntry *ke12;
KmerEntry *ke13;
KmerEntry *ke14;
} KmerHash;
#define SA_INTV 4
// fm-index, extend twice in one search step (one memory access)
typedef struct
@ -96,9 +70,6 @@ typedef struct
uint8_t *pac; // 保存2bit编码的参考序列
// 保存kmer对应的fmt位置信息
KmerHash kmer_hash;
// kmer_bit好像效果一般
uint16_t *kmer_bit; // 用来检测特定长度序列有没有fm-index匹配
int bit_kmer_len;
// suffix array
int sa_intv;
bwtint_t n_sa;
@ -113,8 +84,6 @@ FMTIndex *fmt_restore_fmt(const char *fn);
void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh);
// 从文件中读取kmer hash信息
KmerHash fmt_restore_kmer_idx(const char *fn);
// 读取kmer bit数据
uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len);
// 读取sa数据
void fmt_restore_sa(const char *fn, FMTIndex *fmt);
// 根据interval-bwt创建fmt-index
@ -133,7 +102,7 @@ void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos);
void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos);
// 找smemseed
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec);
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);

3
main.c
View File

@ -37,6 +37,7 @@ int bwa_fa2pac(int argc, char *argv[]);
int bwa_pac2bwt(int argc, char *argv[]);
int bwa_bwtupdate(int argc, char *argv[]);
int bwa_bwt2sa(int argc, char *argv[]);
int bwa_bwt2bytesa(int argc, char *argv[]);
int bwa_index(int argc, char *argv[]);
int bwt_bwtgen_main(int argc, char *argv[]);
@ -75,6 +76,7 @@ static int usage()
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
fprintf(stderr, " bwt2bytesa generate SA(using byte array) from BWT and Occ\n");
fprintf(stderr, "\n");
fprintf(stderr,
"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
@ -100,6 +102,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1);
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);

108
out.sam 100644
View File

@ -0,0 +1,108 @@
@SQ SN:1 LN:249250621
@SQ SN:2 LN:243199373
@SQ SN:3 LN:198022430
@SQ SN:4 LN:191154276
@SQ SN:5 LN:180915260
@SQ SN:6 LN:171115067
@SQ SN:7 LN:159138663
@SQ SN:8 LN:146364022
@SQ SN:9 LN:141213431
@SQ SN:10 LN:135534747
@SQ SN:11 LN:135006516
@SQ SN:12 LN:133851895
@SQ SN:13 LN:115169878
@SQ SN:14 LN:107349540
@SQ SN:15 LN:102531392
@SQ SN:16 LN:90354753
@SQ SN:17 LN:81195210
@SQ SN:18 LN:78077248
@SQ SN:19 LN:59128983
@SQ SN:20 LN:63025520
@SQ SN:21 LN:48129895
@SQ SN:22 LN:51304566
@SQ SN:X LN:155270560
@SQ SN:Y LN:59373566
@SQ SN:MT LN:16569
@SQ SN:GL000207.1 LN:4262
@SQ SN:GL000226.1 LN:15008
@SQ SN:GL000229.1 LN:19913
@SQ SN:GL000231.1 LN:27386
@SQ SN:GL000210.1 LN:27682
@SQ SN:GL000239.1 LN:33824
@SQ SN:GL000235.1 LN:34474
@SQ SN:GL000201.1 LN:36148
@SQ SN:GL000247.1 LN:36422
@SQ SN:GL000245.1 LN:36651
@SQ SN:GL000197.1 LN:37175
@SQ SN:GL000203.1 LN:37498
@SQ SN:GL000246.1 LN:38154
@SQ SN:GL000249.1 LN:38502
@SQ SN:GL000196.1 LN:38914
@SQ SN:GL000248.1 LN:39786
@SQ SN:GL000244.1 LN:39929
@SQ SN:GL000238.1 LN:39939
@SQ SN:GL000202.1 LN:40103
@SQ SN:GL000234.1 LN:40531
@SQ SN:GL000232.1 LN:40652
@SQ SN:GL000206.1 LN:41001
@SQ SN:GL000240.1 LN:41933
@SQ SN:GL000236.1 LN:41934
@SQ SN:GL000241.1 LN:42152
@SQ SN:GL000243.1 LN:43341
@SQ SN:GL000242.1 LN:43523
@SQ SN:GL000230.1 LN:43691
@SQ SN:GL000237.1 LN:45867
@SQ SN:GL000233.1 LN:45941
@SQ SN:GL000204.1 LN:81310
@SQ SN:GL000198.1 LN:90085
@SQ SN:GL000208.1 LN:92689
@SQ SN:GL000191.1 LN:106433
@SQ SN:GL000227.1 LN:128374
@SQ SN:GL000228.1 LN:129120
@SQ SN:GL000214.1 LN:137718
@SQ SN:GL000221.1 LN:155397
@SQ SN:GL000209.1 LN:159169
@SQ SN:GL000218.1 LN:161147
@SQ SN:GL000220.1 LN:161802
@SQ SN:GL000213.1 LN:164239
@SQ SN:GL000211.1 LN:166566
@SQ SN:GL000199.1 LN:169874
@SQ SN:GL000217.1 LN:172149
@SQ SN:GL000216.1 LN:172294
@SQ SN:GL000215.1 LN:172545
@SQ SN:GL000205.1 LN:174588
@SQ SN:GL000219.1 LN:179198
@SQ SN:GL000224.1 LN:179693
@SQ SN:GL000223.1 LN:180455
@SQ SN:GL000195.1 LN:182896
@SQ SN:GL000212.1 LN:186858
@SQ SN:GL000222.1 LN:186861
@SQ SN:GL000200.1 LN:187035
@SQ SN:GL000193.1 LN:189789
@SQ SN:GL000194.1 LN:191469
@SQ SN:GL000225.1 LN:211173
@SQ SN:GL000192.1 LN:547496
@SQ SN:NC_007605 LN:171823
@SQ SN:hs37d5 LN:35477943
@HD VN:1.5 SO:unsorted GO:query
@RG ID:normal SM:normal PL:illumina LB:normal PG:bwa
@PG ID:bwa PN:bwa VN:0.7.17-r1198-dirty CL:./bwa mem -t 1 -M -R @RG\tID:normal\tSM:normal\tPL:illumina\tLB:normal\tPG:bwa /home/zzh/reference/human_g1k_v37_decoy.fasta /home/zzh/fastq/diff_r1.fq /home/zzh/fastq/diff_r2.fq -o ./out.sam
A00289:748:HLCH3DSX3:4:1102:24749:34929 97 16 75642166 60 150M 12 66451372 0 AAAGCCATTGGCAGTAACATCCAAAGGCTGCTCAGGAACTGCACTCCAGCTGATGGCTATGAAAAGATAAGATTCTAGGGTTAAAACTGCTCAATTTCTTGTGAAAGTCAGAAGTATTTTAATAAAAATATTTCTCTGACTCTACAGACC FFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFF: NM:i:0 MD:Z:150 MC:Z:17S91M42S AS:i:150 XS:i:0 RG:Z:normal
A00289:748:HLCH3DSX3:4:1102:24749:34929 145 12 66451372 0 17S91M42S 16 75642166 0 TTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATAAATAATTCTTTGTGAAACCCCT F,F,FFF,:FFF,,F,F,,F,F:,FFF:F:FFF:FFFFFFF:F,F,F,FFFFFF::,,FFFFFFF,:F,:,F,::,,F:FF:,FF:F::F::FFFFFFFFFF,F:F,FFFF,FFFFF,FF,F,,F,F,,,,,,:,,,,,,,,,,,,,,,F NM:i:0 MD:Z:91 MC:Z:150M AS:i:91 XS:i:90 RG:Z:normal SA:Z:12,49106363,+,117S33M,0,0; XA:Z:12,-66451373,35S90M25S,0;12,-66451373,28S90M32S,0;6,-160521757,23S83M44S,0;15,-77910871,23S79M48S,0;
A00289:748:HLCH3DSX3:4:1102:24749:34929 385 12 49106363 0 117H33M 16 75642166 0 AAAAAAAAAAAAAAATAAAAAAAAAAAAAAAAA FFF:F:FFF,:F,F,,F,F,,FFF:,FFF,F,F NM:i:0 MD:Z:33 MC:Z:150M AS:i:33 XS:i:32 RG:Z:normal SA:Z:12,66451372,-,17S91M42S,0,0; XA:Z:4,-19079503,32M118S,0;15,-55218248,32M118S,0;X,-149551159,32M118S,0;
A00289:748:HLCH3DSX3:4:1104:6234:8656 113 12 66451373 8 12S91M47S 22 18873826 0 TTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTTGTTTTTTTTTTTTTTGGGGGGGGGGGGGGGTGAGGTGGTCTGTG ,,:FFF:,:,::,,,,,,FF,FFF,,FF,F,FFF:,,:FFFFFF:FFF,:,:FF:F:::::FFF::,::F:FFFFFF:,FFFF,,,:FFFFFFFF:,,::,,,,,:,,FFF:,,,FF:F,:,FFFF:F::,,FF:,:FFFFFF:FFFFFF NM:i:1 MD:Z:13T77 MC:Z:150M AS:i:86 XS:i:81 RG:Z:normal SA:Z:3,121668661,-,89S32M29S,0,0; XA:Z:12,-66451372,25S91M34S,2;6,-160521757,19S84M47S,1;15,-77910874,26S77M47S,0;7,-107410599,25M1I3M15D74M47S,16;
A00289:748:HLCH3DSX3:4:1104:6234:8656 369 3 121668661 0 89H32M29H 22 18873826 0 TTTTTTTTTTTTTGTTTGTTTTTTTTTTTTTT FFFFFF:,,::,,,,,:,,FFF:,,,FF:F,: NM:i:0 MD:Z:32 MC:Z:150M AS:i:32 XS:i:32 RG:Z:normal SA:Z:12,66451373,-,12S91M47S,8,1;
A00289:748:HLCH3DSX3:4:1104:6234:8656 177 22 18873826 22 150M 12 66451373 0 CTATACCCCACATCCTCATTCCCACCAGATGTCTTGGATCATGGAGGCTCTCCAGACAAAAGCCAGCAGTTAAGCTCCAGATTTCCTATAGAATCCTTTTCTAACAACCAGTGAGTGATTCCAGAATACGTACCATTGAATGTGCTCCCT FFF:FFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:12S91M47S AS:i:150 XS:i:140 RG:Z:normal
A00289:748:HLCH3DSX3:4:1104:1741:18129 97 11 113220686 60 150M 12 66451373 0 TATTTCACCCGGATGCTGACTCCTTTTTTGGTTTCCTTGCAAGGTTACCCTGTTGGGATGTTGTTAGTTGGAGGACAGCTACCTCTGAGGTTATTATGGTTTTTTGCAGATTATTGGAAGCGCTGGTGTCATTTCTTGATTTCTCGGATA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFF,FFFFFFFFFFFFFFFFFF:FFFFFFF:FFFF NM:i:0 MD:Z:150 MC:Z:90M60S AS:i:150 XS:i:19 RG:Z:normal
A00289:748:HLCH3DSX3:4:1104:1741:18129 145 12 66451373 7 90M60S 11 113220686 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTCCCAC FF:FFFF,F,,FFFFFFFF:FFFF:,F,,:F:F::::,F::::FFF::F::,,:FFF,:::FFFFFFFFFFF:FFFFFFFF,FFFF:FFF,:,:,,::FFFFFFFFFFF,FF:FFF:F:F:F:,F,,::F,,F,,FFFFF:F:,,,,,,, NM:i:1 MD:Z:81T8 MC:Z:150M AS:i:85 XS:i:81 RG:Z:normal SA:Z:8,112168232,-,79S59M12S,0,1; XA:Z:6,-160521757,81M69S,0;4,-82515382,13S69M68S,0;
A00289:748:HLCH3DSX3:4:1104:1741:18129 401 8 112168232 0 79H59M12H 11 113220686 0 TTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTT FF,FFFF:FFF,:,:,,::FFFFFFFFFFF,FF:FFF:F:F:F:,F,,::F,,F,,FFF NM:i:1 MD:Z:46T12 MC:Z:150M AS:i:54 XS:i:53 RG:Z:normal SA:Z:12,66451373,-,90M60S,7,1;
A00289:748:HLCH3DSX3:4:1105:27281:29700 113 X 117117442 60 150M 12 66451372 0 CCCTGAAGTTGATTTTTTGAATATATCCTTGAAATCGGCTTGTTTTAATGACAATTGCCTCCGTCTCCTGAAAGTGGAGTGCAAAAAGAAACCTGGTTAATTGTAGTTTTATTATATTTAGTTCAAATTTTTCAAAGCATACCCTGACAG FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:12S89M49S AS:i:150 XS:i:0 RG:Z:normal
A00289:748:HLCH3DSX3:4:1105:27281:29700 177 12 66451372 0 12S89M49S X 117117442 0 TTTTTTAAAAATATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTATTTATTTTTTTATTTTTTTATTATTTTTTTTTTTTTTTTTTTTTTTTTGGTGCCGCCCCC ,,,,:,,F:F,,,,:FF,F:FF,::F:F:F,,F::FFF,FFFFFFFFF,FFFFFF::FFF:F:FFFFFFF,FFFFFFF::FFFF:,:FF:,,F,:,FFFFF,:,,:F,,,,:,FFFFF:F,,FFF:FFFFF:,F,F,,,:,,,,,:FFFF NM:i:3 MD:Z:73T3T3T7 MC:Z:150M AS:i:74 XS:i:72 RG:Z:normal SA:Z:15,85942734,+,20S48M82S,0,1;
A00289:748:HLCH3DSX3:4:1105:27281:29700 417 15 85942734 0 20H48M82H X 117117442 0 AAAAAAAAAAAAAAAAATAATAAAAAAATAAAAAAATAAATAAATAAA FFFF:FFF,,F:FFFFF,:,,,,F:,,:,FFFFF,:,F,,:FF:,:FF NM:i:1 MD:Z:17A30 MC:Z:150M AS:i:43 XS:i:36 RG:Z:normal SA:Z:12,66451372,-,12S89M49S,0,3; XA:Z:11,-118049564,69S35M7D34M12S,11;2,+204499190,12S31M1D21M86S,3;
A00289:748:HLCH3DSX3:4:1106:2880:19288 97 4 169630630 0 29M1I4M1I68M9D22M25S 12 66451373 0 CAGCCTGGGTGACAGAGTGAGACTCCAGCTAAGGCAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAATGAAGGAAGGAAGGAGGGAGGGAGGGAGGGAGGGAGAGATCGGAAGAGCACACGTCTGAAC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFF:FFF:FFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFF NM:i:14 MD:Z:18C8T59G13^AAGGAAGGA22 MC:Z:36S91M23S AS:i:73 XS:i:72 RG:Z:normal
A00289:748:HLCH3DSX3:4:1106:2880:19288 145 12 66451373 9 36S91M23S 4 169630630 0 TATATACTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTGGGTGGGTGGGTGGGTTGGCAG :,,,,,,,:FF:FFFF:,:FF:,F,F,FFF,,,FFF:FFFFFFFFF,:,,F:F,FFFF,:FFFFFFFFFFFFFFFFFFFF:FF:FFFF:FFFFFFF:FFF:FFFFFFF:FF,FFF:,:::,:F::,,:,,:,:,,,,,:,::,,,:F,FF NM:i:1 MD:Z:10T80 MC:Z:29M1I4M1I68M9D22M25S AS:i:86 XS:i:80 RG:Z:normal SA:Z:15,55218228,-,9S52M89S,0,0; XA:Z:15,-77910871,47S80M23S,0;6,-160521761,47S80M23S,0;6,-160521757,47S79M24S,0;7,-107410599,25S28M15D74M23S,16;
A00289:748:HLCH3DSX3:4:1106:2880:19288 401 15 55218228 0 9H52M89H 4 169630630 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTT FF:FFFF:,:FF:,F,F,FFF,,,FFF:FFFFFFFFF,:,,F:F,FFFF,:F NM:i:0 MD:Z:52 MC:Z:29M1I4M1I68M9D22M25H AS:i:52 XS:i:50 RG:Z:normal SA:Z:12,66451373,-,36S91M23S,9,1;
A00289:748:HLCH3DSX3:4:1107:2365:20462 97 22 18737670 0 150M 12 66451372 0 GAGGACACCATACAAGCCGTCGCTAACAAGGACACTGTACACAACATCGCTAATGACGGCACCGTACAAGACATCACCAATGAGGGCGCTTTATACGACATTGCTAATGATACCGACAAGGCACGCTAACGTGGACGCTGTACACGACAT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:21S90M39S AS:i:150 XS:i:150 RG:Z:normal
A00289:748:HLCH3DSX3:4:1107:2365:20462 145 12 66451372 13 21S90M39S 22 18737670 0 TTTTATATATTTATATTTATTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTTGTTTTTTGTGGTGGGGGGGGGGTTTGGCATAGCAT F:F,,F,F,F:FFF,,,:,F:,:,,:,,,,:,F:,,FFFFFFFFFFFFF,F:FFFFFFFFF:FFFF,,FFFFF:FFF::FFFFF,F:F,,F:F,FFFFFFFF,:F:F::::,,:F,F,,F,:,,,,,::,,,,:,,:,,,,,,F,F::FF NM:i:0 MD:Z:90 MC:Z:150M AS:i:90 XS:i:84 RG:Z:normal XA:Z:6,-160521757,28S84M38S,0;15,-77910871,32S80M38S,0;7,-107410642,38S74M38S,0;
A00289:748:HLCH3DSX3:4:1107:26377:28385 113 GL000220.1 122227 60 150M 6 160521757 0 TGCTCTGTTGCTCACGCTGGTCTCAAACTCCTGGCCTTGACGCTTCTCCCGTCACATCCGCCGTCTGGTTGTTGAAATGAGCATCTCTCGTAAAATGGAAAAGATGAAAGAAATAAACACGAAGACGGAAAGCACGGTGTGAACGTTTCT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:85M65S AS:i:150 XS:i:113 RG:Z:normal
A00289:748:HLCH3DSX3:4:1107:26377:28385 177 6 160521757 0 85M65S GL000220.1 122227 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGTGTTTTTTTTTTGTTGGGGGTTGGGTGTTGGGGGGCGGGGGGGGGGGCGCCCCCCCCCAAGGAGA F:FF,F,,F,F,FFFFFF,:FF,:FF:F,:FF,FFFF:FFF:FF,FFFFFFFFFFFFFFFFFFFFFFFF:FF,F,F:F,:FF,,,FFFF,F,:,,,,,:::,,:,,,,::,,,,,,,,,F,,FF,:F:,,F,,,,,FF:FF:F,FFFF:: NM:i:0 MD:Z:85 MC:Z:150M AS:i:85 XS:i:84 RG:Z:normal XA:Z:12,-66451380,84M66S,0;12,-66451373,83M67S,0;15,-77910871,4S80M66S,0;

12
run.sh
View File

@ -6,17 +6,17 @@ thread=1
#reference=~/data/reference/human_g1k_v37_decoy.fasta
#n_r1=~/fastq/sn_r1.fq
#n_r2=~/fastq/sn_r2.fq
#n_r1=~/fastq/ssn_r1.fq
#n_r2=~/fastq/ssn_r2.fq
n_r1=~/fastq/ssn_r1.fq
n_r2=~/fastq/ssn_r2.fq
#n_r1=~/fastq/tiny_n_r1.fq
#n_r2=~/fastq/tiny_n_r2.fq
n_r1=~/fastq/diff_r1.fq
n_r2=~/fastq/diff_r2.fq
#n_r1=~/fastq/diff_r1.fq
#n_r2=~/fastq/diff_r2.fq
#n_r1=~/fastq/d_r1.fq
#n_r2=~/fastq/d_r2.fq
reference=~/reference/human_g1k_v37_decoy.fasta
#out=./ssn.sam
out=./out.sam
out=./ssn.sam
#out=./out.sam
#out=/dev/null
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \

16
utils.c
View File

@ -139,6 +139,22 @@ size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
return ret;
}
uint64_t fread_fix(FILE *fp, uint64_t size, void *a)
{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
// const int bufsize = 0x1000000; // 16M block
const int bufsize = 0x4000000; // 64M block
uint64_t offset = 0;
while (size)
{
int x = bufsize < size ? bufsize : size;
if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0)
break;
size -= x;
offset += x;
}
return offset;
}
int err_gzread(gzFile file, void *ptr, unsigned int len)
{
int ret = gzread(file, ptr, len);

View File

@ -48,7 +48,8 @@ extern int64_t time_ksw_extend2,
time_bns;
extern int64_t dn, n16, n17, n18, n19, nall, num_sa;
extern int64_t s1n, s2n, s3n, s1l, s2l, s3l;
extern int64_t g_num_smem2;
extern FILE *fp1;
#endif
@ -89,6 +90,7 @@ extern "C" {
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode);
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream);
uint64_t fread_fix(FILE *fp, uint64_t size, void *a);
int err_gzread(gzFile file, void *ptr, unsigned int len);
int err_fseek(FILE *stream, long offset, int whence);