做了一些代码清理,目前结果应该是完全一致的
This commit is contained in:
parent
17618ee5f2
commit
4bd0fd4f91
|
|
@ -18,8 +18,8 @@
|
|||
"-R",
|
||||
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||
"~/reference/human_g1k_v37_decoy.fasta",
|
||||
"~/fastq/tiny_n_r1.fq",
|
||||
"~/fastq/tiny_n_r2.fq",
|
||||
"~/fastq/diff_r1.fq",
|
||||
"~/fastq/diff_r2.fq",
|
||||
"-o",
|
||||
"/dev/null"
|
||||
],
|
||||
|
|
|
|||
20
bwa.c
20
bwa.c
|
|
@ -317,14 +317,6 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
|
|||
fprintf(stderr, "%s\n", kmer_idx_fn);
|
||||
fmt->kmer_hash = fmt_restore_kmer_idx(kmer_idx_fn);
|
||||
|
||||
// restore kmer bit
|
||||
// fmt->bit_kmer_len = 18;
|
||||
// sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len);
|
||||
// strcpy(kmer_bit_fn, hint);
|
||||
// strcpy(kmer_bit_fn + l_hint, suffix);
|
||||
// fprintf(stderr, "%s\n", kmer_bit_fn);
|
||||
// fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len);
|
||||
|
||||
strcpy(sa_fn, hint);
|
||||
sprintf(suffix, ".33.%d.sa", SA_INTV);
|
||||
strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA)
|
||||
|
|
@ -348,13 +340,9 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
|||
}
|
||||
idx = calloc(1, sizeof(bwaidx_t));
|
||||
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
||||
if (which & BWA_IDX_BWT) {
|
||||
idx->fmt = bwa_idx_load_fmt(hint);
|
||||
// 先和bwt共用sa
|
||||
//idx->fmt->sa = idx->bwt->sa;
|
||||
//idx->fmt->n_sa = idx->bwt->n_sa;
|
||||
//idx->fmt->sa_intv = idx->bwt->sa_intv;
|
||||
}
|
||||
if (which & BWA_IDX_BWT) idx->fmt = bwa_idx_load_fmt(hint);
|
||||
idx->bwt->kmer_hash = idx->fmt->kmer_hash;
|
||||
|
||||
if (which & BWA_IDX_BNS)
|
||||
{
|
||||
int i, c;
|
||||
|
|
@ -404,7 +392,7 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
|
|||
// generate idx->bwt
|
||||
x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
|
||||
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
|
||||
x = SA_BYTES(idx->bwt->n_sa); idx->bwt->sa = (uint8_t*)(mem + k); k += x;
|
||||
x = SA_BYTES(idx->bwt->n_sa); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
|
||||
|
||||
// generate idx->bns and idx->pac
|
||||
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
|
||||
|
|
|
|||
48
bwamem.c
48
bwamem.c
|
|
@ -148,18 +148,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
int max_seed_len = 0;
|
||||
int start_N_num = 0, start_flag = 1;
|
||||
a->mem.n = 0;
|
||||
// char *BASE = "ACGT";
|
||||
// for (i = 0; i < len; ++i)
|
||||
// if (seq[i] < 4)
|
||||
// fprintf(stderr, "%c", BASE[seq[i]]);
|
||||
// else
|
||||
// fprintf(stderr, "N");
|
||||
// fprintf(stderr, "\n");
|
||||
|
||||
// first pass: find all SMEMs
|
||||
fprintf(fp1, "seq: %ld\n", dn++);
|
||||
//fprintf(stderr, "seq: %ld\n", dn++);
|
||||
//dn ++;
|
||||
// fprintf(stderr, "seq: %ld\n", dn++);
|
||||
// dn ++;
|
||||
// goto third_seed;
|
||||
|
||||
while (x < len) {
|
||||
|
|
@ -169,8 +162,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
#if USE_FMT
|
||||
x = fmt_smem(fmt, len, seq, x, start_width, &a->mem1, a->tmpv);
|
||||
|
||||
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, &a->mem1, a->tmpv[0]);
|
||||
#else
|
||||
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
|
||||
#endif
|
||||
|
|
@ -178,15 +170,17 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_seed_1, tmp_time);
|
||||
#endif
|
||||
s1n += a->mem1.n;
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||
s1l += slen;
|
||||
max_seed_len = fmax(max_seed_len, slen);
|
||||
if (slen >= opt->min_seed_len) {
|
||||
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
kv_push(bwtintv_t, a->mem, *p);
|
||||
}
|
||||
}
|
||||
|
|
@ -210,7 +204,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
#if USE_FMT
|
||||
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
|
||||
fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, &a->mem1, a->tmpv[0]);
|
||||
#else
|
||||
bwt_smem1(bwt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv);
|
||||
#endif
|
||||
|
|
@ -218,13 +212,17 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_seed_2, tmp_time);
|
||||
#endif
|
||||
s2n += a->mem1.n;
|
||||
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
// fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
// fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
int slen = (uint32_t)p->info - (p->info >> 32);
|
||||
int slen = (uint32_t)p->info - (p->info >> 32);
|
||||
s2l += slen;
|
||||
if (slen >= opt->min_seed_len) {
|
||||
g_num_smem2 += 1;
|
||||
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||||
}
|
||||
|
|
@ -233,7 +231,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
//if (max_seed_len == len - start_N_num)
|
||||
// goto collect_intv_end;
|
||||
//goto collect_intv_end;
|
||||
//third_seed:
|
||||
third_seed:
|
||||
// third pass: LAST-like
|
||||
if (opt->max_mem_intv > 0) {
|
||||
x = 0;
|
||||
|
|
@ -254,13 +252,15 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_seed_3, tmp_time);
|
||||
#endif
|
||||
//bwtintv_t *p = &m;
|
||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
s3n += 1;
|
||||
s3l += (uint32_t)m.info - (m.info >> 32);
|
||||
// bwtintv_t *p = &m;
|
||||
// fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
|
||||
if (m.x[2] > 0) {
|
||||
kv_push(bwtintv_t, a->mem, m);
|
||||
bwtintv_t *p = &m;
|
||||
fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//bwtintv_t *p = &m;
|
||||
//fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
//fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info);
|
||||
}
|
||||
} else { // for now, we never come to this block which is slower
|
||||
|
|
@ -272,7 +272,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn
|
|||
}
|
||||
}
|
||||
|
||||
collect_intv_end:
|
||||
//collect_intv_end:
|
||||
// sort
|
||||
ks_introsort(mem_intv, a->mem.n, a->mem.a);
|
||||
}
|
||||
|
|
@ -438,10 +438,10 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm
|
|||
#endif
|
||||
#if USE_FMT
|
||||
s.rbeg = tmp.pos = fmt_sa(fmt, p->x[0] + k);
|
||||
uint64_t tpos = bwt_sa(bwt, p->x[0] + k);
|
||||
if (s.rbeg != tpos) {
|
||||
fprintf(stderr, "diff: %ld, %ld %ld\n", p->x[0] + k, tmp.pos, tpos);
|
||||
}
|
||||
// uint64_t tpos = bwt_sa(bwt, p->x[0] + k);
|
||||
// if (s.rbeg != tpos) {
|
||||
// fprintf(stderr, "diff: %ld, %ld %ld\n", p->x[0] + k, tmp.pos, tpos);
|
||||
// }
|
||||
#else
|
||||
s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
|
||||
#endif
|
||||
|
|
|
|||
224
bwt.c
224
bwt.c
|
|
@ -53,20 +53,10 @@ void bwt_gen_cnt_table(bwt_t *bwt)
|
|||
|
||||
static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
|
||||
{
|
||||
bwtint_t x = k - (k > bwt->primary); // bwt中不包含$,所以位置超过序列长度后,要减掉1,(因为后边加上了互补链)
|
||||
// x = bwt_B0(bwt, x); // 获取x位置对应的字符
|
||||
|
||||
uint32_t t1 = (bwt->bwt[((x) >> 7 << 4) + sizeof(bwtint_t) + (((x) & 0x7f) >> 4)]);
|
||||
uint32_t t2 = t1 >> ((~(x) & 0xf) << 1) & 3;
|
||||
|
||||
x = t2;
|
||||
|
||||
// bwtint_t cnt[4], cnt1;
|
||||
// bwt_occ4(bwt, k, cnt);
|
||||
// cnt1 = bwt_occ(bwt, k, x);
|
||||
|
||||
x = bwt->L2[x] + bwt_occ(bwt, k, x); // 获取x字符在k位置(后缀索引),
|
||||
return k == bwt->primary? 0 : x;
|
||||
bwtint_t x = k - (k > bwt->primary);
|
||||
x = bwt_B0(bwt, x);
|
||||
x = bwt->L2[x] + bwt_occ(bwt, k, x);
|
||||
return k == bwt->primary ? 0 : x;
|
||||
}
|
||||
|
||||
// 设置某一行的排序值,sa的索引有效值从1开始,(0设置为-1, 小端模式)
|
||||
|
|
@ -91,8 +81,55 @@ bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k)
|
|||
return val;
|
||||
}
|
||||
|
||||
// 获取kmer的fmt匹配信息
|
||||
inline void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos)
|
||||
{
|
||||
bwtint_t x0, x1, x2;
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = mem_addr + byte_idx;
|
||||
x0 = *arr;
|
||||
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x1 = *arr;
|
||||
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x2 = *((uint32_t *)arr);
|
||||
ok->x[0] = x0;
|
||||
ok->x[1] = x1;
|
||||
ok->x[2] = x2;
|
||||
}
|
||||
|
||||
// 设置kmer第pos个碱基对应的fmt匹配信息
|
||||
inline void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos)
|
||||
{
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = mem_addr + byte_idx;
|
||||
arr[0] = (uint8_t)(ik.x[0] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
|
||||
arr += 5;
|
||||
arr[0] = (uint8_t)(ik.x[1] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
|
||||
arr += 5;
|
||||
*((uint32_t *)arr) = (uint32_t)ik.x[2];
|
||||
}
|
||||
|
||||
// 获取kmer对应的fmt匹配信息, pos should be [0, 13]
|
||||
inline void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos)
|
||||
{
|
||||
if (pos == 13)
|
||||
kmer_getval_at(kmer_hash->ke14[qbit].intv_arr, ok, 0);
|
||||
else if (pos == 12)
|
||||
kmer_getval_at(kmer_hash->ke13[qbit >> 2].intv_arr, ok, 0);
|
||||
else if (pos == 11)
|
||||
kmer_getval_at(kmer_hash->ke12[qbit >> 4].intv_arr, ok, 0);
|
||||
else if (pos == 10)
|
||||
kmer_getval_at(kmer_hash->ke11[qbit >> 6].intv_arr, ok, 0);
|
||||
else
|
||||
kmer_getval_at(kmer_hash->ke10[qbit >> 8].intv_arr, ok, pos);
|
||||
}
|
||||
|
||||
// bwt->bwt and bwt->occ must be precalculated
|
||||
void bwt_cal_sa(bwt_t *bwt, int intv)
|
||||
void bwt_cal_byte_sa(bwt_t *bwt, int intv)
|
||||
{
|
||||
bwtint_t isa, sa, i, block_size; // S(isa) = sa isa是后缀数组的索引,sa表示位置
|
||||
double tmp_time, elapsed_time;
|
||||
|
|
@ -102,10 +139,10 @@ void bwt_cal_sa(bwt_t *bwt, int intv)
|
|||
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
|
||||
xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
|
||||
|
||||
if (bwt->sa) free(bwt->sa);
|
||||
if (bwt->byte_sa) free(bwt->byte_sa);
|
||||
bwt->sa_intv = intv;
|
||||
bwt->n_sa = (bwt->seq_len + intv) / intv;
|
||||
bwt->sa = (uint8_t *)calloc(SA_BYTES(bwt->n_sa), 1); // 用33位表示位置
|
||||
bwt->byte_sa = (uint8_t *)calloc(SA_BYTES(bwt->n_sa), 1); // 用33位表示位置
|
||||
fprintf(stderr, "bytes: %ld, sa size: %ld\n", SA_BYTES(bwt->n_sa), bwt->n_sa);
|
||||
// calculate SA value
|
||||
isa = 0; sa = bwt->seq_len;
|
||||
|
|
@ -118,19 +155,43 @@ void bwt_cal_sa(bwt_t *bwt, int intv)
|
|||
fprintf(stderr, "%ld%% percent complished. %f s elapsed.\n", i / block_size, elapsed_time);
|
||||
}
|
||||
if (isa % intv == 0) {
|
||||
bwt_set_sa(bwt->sa, isa / intv, sa); // 第一个位置是$,所以位置就是序列长度
|
||||
bwt_set_sa(bwt->byte_sa, isa / intv, sa); // 第一个位置是$,所以位置就是序列长度
|
||||
if (i % (block_size / 2) == 0)
|
||||
{
|
||||
fprintf(stderr, "%ld %ld\n", sa, bwt_get_sa(bwt->sa, isa / intv));
|
||||
fprintf(stderr, "%ld %ld\n", sa, bwt_get_sa(bwt->byte_sa, isa / intv));
|
||||
}
|
||||
}
|
||||
--sa; // 从后往前,一个位置一个位置的找对应的后缀数组,isa就是与sa对应的后缀数组排序后在sa数组中的相对位置
|
||||
isa = bwt_invPsi(bwt, isa); // 下一个后缀数组的相对位置
|
||||
}
|
||||
if (isa % intv == 0)
|
||||
bwt_set_sa(bwt->sa, isa / intv, sa);
|
||||
// bwt_set_sa(bwt->sa, 0, (bwtint_t)-1); // 赋值成-1也没问题,set_sa那里已经修正了
|
||||
bwt_set_sa(bwt->sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
|
||||
if (isa % intv == 0) bwt_set_sa(bwt->byte_sa, isa / intv, sa);
|
||||
// bwt_set_sa(bwt->byte_sa, 0, (bwtint_t)-1); // 赋值成-1也没问题,set_sa那里已经修正了
|
||||
bwt_set_sa(bwt->byte_sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len
|
||||
}
|
||||
|
||||
// bwt->bwt and bwt->occ must be precalculated
|
||||
void bwt_cal_sa(bwt_t *bwt, int intv)
|
||||
{
|
||||
bwtint_t isa, sa, i; // S(isa) = sa
|
||||
int intv_round = intv;
|
||||
|
||||
kv_roundup32(intv_round);
|
||||
xassert(intv_round == intv, "SA sample interval is not a power of 2.");
|
||||
xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
|
||||
|
||||
if (bwt->sa) free(bwt->sa);
|
||||
bwt->sa_intv = intv;
|
||||
bwt->n_sa = (bwt->seq_len + intv) / intv;
|
||||
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
|
||||
// calculate SA value
|
||||
isa = 0; sa = bwt->seq_len;
|
||||
for (i = 0; i < bwt->seq_len; ++i) {
|
||||
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
|
||||
--sa;
|
||||
isa = bwt_invPsi(bwt, isa);
|
||||
}
|
||||
if (isa % intv == 0) bwt->sa[isa/intv] = sa;
|
||||
bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
|
||||
}
|
||||
|
||||
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
|
||||
|
|
@ -141,17 +202,6 @@ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
|
|||
k = bwt_invPsi(bwt, k);
|
||||
}
|
||||
return sa + bwt->sa[k / bwt->sa_intv];
|
||||
/* without setting bwt->sa[0] = -1, the following line should be
|
||||
changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
|
||||
//#ifdef SHOW_PERF
|
||||
// int64_t tmp_time = realtime_msec();
|
||||
//#endif
|
||||
sa += bwt_get_sa(bwt->sa, k / bwt->sa_intv);
|
||||
//#ifdef SHOW_PERF
|
||||
// tmp_time = realtime_msec() - tmp_time;
|
||||
// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
|
||||
//#endif
|
||||
return sa;
|
||||
}
|
||||
|
||||
static inline int __occ_aux(uint64_t y, int c)
|
||||
|
|
@ -320,11 +370,6 @@ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t
|
|||
|
||||
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
|
||||
{
|
||||
#ifdef SHOW_PERF
|
||||
#if 0
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
#endif
|
||||
bwtint_t tk[4], tl[4];
|
||||
int i;
|
||||
bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
|
||||
|
|
@ -336,12 +381,34 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b
|
|||
ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
|
||||
ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
|
||||
ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
|
||||
#ifdef SHOW_PERF
|
||||
#if 0
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bwt_extend, tmp_time);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
// 创建正向的kmer
|
||||
inline uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
|
||||
{
|
||||
uint32_t qbit = 0, i;
|
||||
qlen = qlen < kmer_len ? qlen : kmer_len;
|
||||
for (i = 0; i < qlen; ++i) {
|
||||
if (q[i] > 3) break; // 要考虑碱基是N
|
||||
qbit |= q[i] << ((kmer_len - 1 - i) << 1);
|
||||
}
|
||||
*base_consumed = i;
|
||||
return qbit;
|
||||
}
|
||||
|
||||
// 创建f反向的kmer
|
||||
inline uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
|
||||
{
|
||||
uint32_t qbit = 0;
|
||||
int i, j, end_pos;
|
||||
end_pos = start_pos - kmer_len;
|
||||
end_pos = end_pos < 0 ? -1 : end_pos;
|
||||
for (i = start_pos, j = 0; i > end_pos; --i, ++j) {
|
||||
if (q[i] > 3) break; // 要考虑碱基是N
|
||||
qbit |= q[i] << ((kmer_len - 1 - j) << 1);
|
||||
}
|
||||
*base_consumed = start_pos - i;
|
||||
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
|
||||
}
|
||||
|
||||
static void bwt_reverse_intvs(bwtintv_v *p)
|
||||
|
|
@ -372,24 +439,8 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
||||
ik.info = x + 1;
|
||||
|
||||
// int flag = 0;
|
||||
// int small_intv_qlen = 0;
|
||||
// int small_intv_val = 0;
|
||||
// int intv_19 = 0;
|
||||
for (i = x + 1, curr->n = 0; i < len; ++i)
|
||||
{ // forward search
|
||||
//if (ik.x[2] <= 2)
|
||||
//{
|
||||
// if (!flag) {
|
||||
// small_intv_qlen = i - x;
|
||||
// small_intv_val = ik.x[2];
|
||||
// flag = 1;
|
||||
// }
|
||||
//}
|
||||
//if (i-x == 19)
|
||||
//{
|
||||
// intv_19 = ik.x[2];
|
||||
//}
|
||||
if (ik.x[2] < max_intv) { // an interval small enough
|
||||
kv_push(bwtintv_t, *curr, ik);
|
||||
break;
|
||||
|
|
@ -407,11 +458,6 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
break; // always terminate extension at an ambiguous base; in this case, i<len always stands
|
||||
}
|
||||
}
|
||||
//if (small_intv_qlen == 0)
|
||||
// fprintf(fp1, "stop_len: %d; final_len: %d; intval: %ld\n", i-x, i-x, ik.x[2]);
|
||||
//else
|
||||
// fprintf(fp1, "stop_len: %d; final_len: %d; intval: %d; final_intval: %ld\n", small_intv_qlen, i - x, small_intv_val, ik.x[2]);
|
||||
// fprintf(fp1, "19-intv: %d\n", intv_19);
|
||||
|
||||
if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
|
||||
bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
|
||||
|
|
@ -452,13 +498,21 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
|
||||
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
|
||||
{
|
||||
int i, c;
|
||||
int i = x + 1, c, kmer_len;
|
||||
bwtintv_t ik = {0}, ok[4] = {0};
|
||||
|
||||
memset(mem, 0, sizeof(bwtintv_t));
|
||||
if (q[x] > 3) return x + 1;
|
||||
bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
||||
for (i = x + 1; i < len; ++i) { // forward search
|
||||
|
||||
uint32_t qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
bwt_kmer_get(&bwt->kmer_hash, &ik, qbit, kmer_len - 1);
|
||||
ik.info = x + kmer_len;
|
||||
i = (int)ik.info;
|
||||
|
||||
//bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
|
||||
//i = x + 1;
|
||||
|
||||
for (; i < len; ++i) { // forward search
|
||||
if (q[i] < 4) { // an A/C/G/T base
|
||||
c = 3 - q[i]; // complement of q[i]
|
||||
bwt_extend(bwt, &ik, ok, 0);
|
||||
|
|
@ -496,22 +550,22 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt)
|
|||
err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
|
||||
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->sa, sizeof(bwtint_t), SA_BYTES(bwt->n_sa) >> 3, fp);
|
||||
err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
|
||||
{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
|
||||
//const int bufsize = 0x1000000; // 16M block
|
||||
const int bufsize = 0x4000000; //64M block
|
||||
bwtint_t offset = 0;
|
||||
while (size) {
|
||||
int x = bufsize < size? bufsize : size;
|
||||
if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break;
|
||||
size -= x; offset += x;
|
||||
}
|
||||
return offset;
|
||||
void bwt_dump_byte_sa(const char *fn, const bwt_t *bwt)
|
||||
{
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "wb");
|
||||
err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp);
|
||||
err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
|
||||
err_fwrite(bwt->byte_sa, sizeof(bwtint_t), SA_BYTES(bwt->n_sa) >> 3, fp);
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
||||
void bwt_restore_sa(const char *fn, bwt_t *bwt)
|
||||
|
|
@ -529,10 +583,6 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt)
|
|||
xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
|
||||
|
||||
bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
|
||||
|
||||
//bwt->sa = (uint8_t*)malloc(SA_BYTES(bwt->n_sa));
|
||||
//fprintf(stderr, "zzh-read-sa %ld\n", SA_BYTES(bwt->n_sa));
|
||||
//fread_fix(fp, SA_BYTES(bwt->n_sa), bwt->sa);
|
||||
|
||||
bwt->sa = (bwtint_t *)calloc(bwt->n_sa, sizeof(bwtint_t));
|
||||
bwt->sa[0] = -1;
|
||||
|
|
@ -565,6 +615,6 @@ bwt_t *bwt_restore_bwt(const char *fn)
|
|||
void bwt_destroy(bwt_t *bwt)
|
||||
{
|
||||
if (bwt == 0) return;
|
||||
//free(bwt->sa); free(bwt->bwt);
|
||||
//free(bwt);
|
||||
free(bwt->sa); free(bwt->bwt);
|
||||
free(bwt);
|
||||
}
|
||||
|
|
|
|||
37
bwt.h
37
bwt.h
|
|
@ -45,6 +45,30 @@ typedef unsigned char ubyte_t;
|
|||
|
||||
typedef uint64_t bwtint_t;
|
||||
|
||||
#define HASH_KMER_LEN 14
|
||||
|
||||
// 用来保存kmer对应的fmt的位置信息
|
||||
typedef struct
|
||||
{
|
||||
// 40+40+32 14个byte,这样好处理
|
||||
uint8_t intv_arr[14]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
|
||||
} KmerEntry;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint8_t intv_arr[140]; // 保存长度为10的kmer,每个碱基对应的bwt匹配信息
|
||||
} KmerEntryArr;
|
||||
|
||||
// 保存各个位置对应的bwt匹配信息
|
||||
typedef struct
|
||||
{
|
||||
KmerEntryArr *ke10;
|
||||
KmerEntry *ke11;
|
||||
KmerEntry *ke12;
|
||||
KmerEntry *ke13;
|
||||
KmerEntry *ke14;
|
||||
} KmerHash;
|
||||
|
||||
typedef struct {
|
||||
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
|
||||
bwtint_t L2[5]; // C(), cumulative count
|
||||
|
|
@ -53,11 +77,13 @@ typedef struct {
|
|||
uint32_t *bwt; // BWT
|
||||
// occurance array, separated to two parts
|
||||
uint32_t cnt_table[256];
|
||||
// 保存kmer对应的fmt位置信息
|
||||
KmerHash kmer_hash;
|
||||
// suffix array
|
||||
int sa_intv;
|
||||
bwtint_t n_sa;
|
||||
//uint8_t *sa;
|
||||
bwtint_t *sa;
|
||||
uint8_t *byte_sa;
|
||||
} bwt_t;
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -99,6 +125,7 @@ extern "C" {
|
|||
|
||||
void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
|
||||
void bwt_dump_sa(const char *fn, const bwt_t *bwt);
|
||||
void bwt_dump_byte_sa(const char *fn, const bwt_t *bwt);
|
||||
|
||||
bwt_t *bwt_restore_bwt(const char *fn);
|
||||
void bwt_restore_sa(const char *fn, bwt_t *bwt);
|
||||
|
|
@ -108,6 +135,7 @@ extern "C" {
|
|||
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
|
||||
void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
|
||||
void bwt_cal_sa(bwt_t *bwt, int intv);
|
||||
void bwt_cal_byte_sa(bwt_t *bwt, int intv);
|
||||
|
||||
void bwt_bwtupdate_core(bwt_t *bwt);
|
||||
|
||||
|
|
@ -118,6 +146,13 @@ extern "C" {
|
|||
void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val);
|
||||
bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k);
|
||||
|
||||
void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos);
|
||||
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
|
||||
void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos);
|
||||
|
||||
uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed);
|
||||
uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed);
|
||||
|
||||
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
|
||||
void bwt_gen_cnt_table(bwt_t *bwt);
|
||||
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
|
||||
|
|
|
|||
21
bwtindex.c
21
bwtindex.c
|
|
@ -206,6 +206,27 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
|
|||
return 0;
|
||||
}
|
||||
|
||||
int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command
|
||||
{
|
||||
bwt_t *bwt;
|
||||
int c, sa_intv = 32;
|
||||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'i': sa_intv = atoi(optarg); break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa bwt2bytesa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
bwt_cal_byte_sa(bwt, sa_intv);
|
||||
bwt_dump_byte_sa(argv[optind+1], bwt);
|
||||
bwt_destroy(bwt);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_index(int argc, char *argv[]) // the "index" command
|
||||
{
|
||||
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,38 @@
|
|||
thread=1
|
||||
#n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq
|
||||
#n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq
|
||||
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
|
||||
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
|
||||
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||
#n_r1=~/fastq/sn_r1.fq
|
||||
#n_r2=~/fastq/sn_r2.fq
|
||||
#n_r1=~/fastq/ssn_r1.fq
|
||||
#n_r2=~/fastq/ssn_r2.fq
|
||||
#n_r1=~/fastq/tiny_n_r1.fq
|
||||
#n_r2=~/fastq/tiny_n_r2.fq
|
||||
n_r1=~/fastq/diff_r1.fq
|
||||
n_r2=~/fastq/diff_r2.fq
|
||||
#n_r1=~/fastq/d_r1.fq
|
||||
#n_r2=~/fastq/d_r2.fq
|
||||
reference=~/reference/human_g1k_v37_decoy.fasta
|
||||
#out=./ssn.sam
|
||||
out=./out.sam
|
||||
#out=/dev/null
|
||||
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||
# /home/zzh/data/fastq/nm1.fq \
|
||||
# /home/zzh/data/fastq/nm2.fq -o /dev/null
|
||||
|
||||
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||
# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_1.fq.gz \
|
||||
# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \
|
||||
# -o /dev/null
|
||||
|
||||
time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
$reference \
|
||||
$n_r1 \
|
||||
$n_r2 \
|
||||
-o $out
|
||||
|
||||
|
||||
19
fastmap.c
19
fastmap.c
|
|
@ -59,7 +59,8 @@ int64_t time_ksw_extend2 = 0,
|
|||
time_bns = 0;
|
||||
|
||||
int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0, num_sa = 0;
|
||||
|
||||
int64_t s1n = 0, s2n = 0, s3n = 0, s1l = 0, s2l = 0, s3l = 0;
|
||||
int64_t g_num_smem2 = 0;
|
||||
FILE *fp1;
|
||||
|
||||
#endif
|
||||
|
|
@ -433,22 +434,20 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
#ifdef SHOW_PERF
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "time_bwt_smem1a: %f s\n", time_bwt_smem1a / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_seed_1: %f s\n", time_seed_1 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_seed_2: %f s\n", time_seed_2 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_seed_3: %f s\n", time_seed_3 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_fmt_smem_0: %f s\n", time_fmt_smem_0 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bwt_extend: %f s\n", time_bwt_extend / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bwt_sa_read: %f s\n", time_bwt_sa_read / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "time_bns: %f s\n", time_bns / 1000.0 / opt->n_threads);
|
||||
fprintf(stderr, "all seed num: %ld\n", nall);
|
||||
fprintf(stderr, "all seed num litter than 19: %ld\n", n19);
|
||||
fprintf(stderr, "all seed num litter than 18: %ld\n", n18);
|
||||
fprintf(stderr, "all seed num litter than 17: %ld\n", n17);
|
||||
fprintf(stderr, "all seed num litter than 16: %ld\n", n16);
|
||||
fprintf(stderr, "s1 num: %ld\n", s1n);
|
||||
fprintf(stderr, "s2 num: %ld\n", s2n);
|
||||
fprintf(stderr, "s3 num: %ld\n", s3n);
|
||||
fprintf(stderr, "s1 len: %ld\n", s1l / s1n);
|
||||
fprintf(stderr, "s2 len: %ld\n", s2l / s2n);
|
||||
fprintf(stderr, "s3 len: %ld\n", s3l / s3n);
|
||||
fprintf(stderr, "get sa num: %ld\n", num_sa);
|
||||
fprintf(stderr, "seed 2 num: %ld\n", g_num_smem2);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
fclose(fp1);
|
||||
|
|
|
|||
435
fmt_idx.c
435
fmt_idx.c
|
|
@ -18,19 +18,18 @@ Date : 2023/12/24
|
|||
|
||||
const static char BASE[4] = {'A', 'C', 'G', 'T'};
|
||||
|
||||
static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
|
||||
{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
|
||||
const int bufsize = 0x4000000; // 16 * 4M block
|
||||
bwtint_t offset = 0;
|
||||
while (size)
|
||||
// 生成所有KMER_LEN长度的序列,字符串表示
|
||||
void gen_all_seq(char **seq_arr, int kmer_len)
|
||||
{
|
||||
uint32_t seq_up_val = (1 << (kmer_len << 1));
|
||||
for (uint32_t i = 0; i < seq_up_val; ++i)
|
||||
{
|
||||
int x = bufsize < size ? bufsize : size;
|
||||
if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0)
|
||||
break;
|
||||
size -= x;
|
||||
offset += x;
|
||||
seq_arr[i] = (char *)malloc(kmer_len);
|
||||
for (int j = kmer_len - 1; j >= 0; --j)
|
||||
{
|
||||
seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
|
||||
}
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
// 生成occ,每个字节对应一个pattern
|
||||
|
|
@ -155,23 +154,12 @@ KmerHash fmt_restore_kmer_idx(const char *fn)
|
|||
return khash;
|
||||
}
|
||||
|
||||
uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len)
|
||||
{
|
||||
uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t));
|
||||
FILE *fp;
|
||||
fp = xopen(fn, "rb");
|
||||
fread_fix(fp, (1L << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit);
|
||||
err_fclose(fp);
|
||||
return kbit;
|
||||
}
|
||||
|
||||
// 读取sa数据
|
||||
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
|
||||
{
|
||||
char skipped[256];
|
||||
FILE *fp;
|
||||
bwtint_t primary;
|
||||
|
||||
fp = xopen(fn, "rb");
|
||||
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
|
||||
xassert(primary == fmt->primary, "SA-BWT inconsistency: primary is not the same.");
|
||||
|
|
@ -179,10 +167,8 @@ void fmt_restore_sa(const char *fn, FMTIndex *fmt)
|
|||
err_fread_noeof(&fmt->sa_intv, sizeof(bwtint_t), 1, fp);
|
||||
err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
|
||||
xassert(primary == fmt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
|
||||
|
||||
fmt->n_sa = (fmt->seq_len + fmt->sa_intv) / fmt->sa_intv;
|
||||
fmt->sa = (uint8_t *)malloc(SA_BYTES(fmt->n_sa));
|
||||
|
||||
fread_fix(fp, SA_BYTES(fmt->n_sa), fmt->sa);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
|
@ -224,7 +210,6 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
|
|||
#endif
|
||||
|
||||
buf = (uint32_t *)calloc(fmt->bwt_size, 4); // 开辟计算fmt用到的缓存
|
||||
|
||||
c[0] = c[1] = c[2] = c[3] = 0;
|
||||
// 首行的c2,应该是对应的ACGT对应的行,减去1的occ
|
||||
for (i = 0; i < 4; ++i)
|
||||
|
|
@ -382,7 +367,6 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
|
|||
cnt[3] = q[b2]; // b2的occ
|
||||
p += 20;
|
||||
|
||||
#ifdef FMT_MID_INTERVAL // 加入了middle checkpoint
|
||||
// 使用mid interval信息
|
||||
int mk = k % FMT_OCC_INTERVAL;
|
||||
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
|
||||
|
|
@ -401,35 +385,12 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
|
|||
x = 0;
|
||||
p += 4;
|
||||
}
|
||||
#if FMT_MID_INTERVAL == 16 // middle check point interval等于16时候,只需要判断一下是不是要计算两个uint32表示的碱基序列
|
||||
if ((mk & FMT_MID_INTV_MASK) >> 3)
|
||||
{
|
||||
x += __fmt_occ_e2_aux2(fmt, ti, *p);
|
||||
++p;
|
||||
}
|
||||
#elif FMT_MID_INTERVAL > 16 // 该地址是bwt和pre_bwt字符串数据的首地址
|
||||
uint32_t *end = p + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3));
|
||||
for (; p < end; ++p)
|
||||
{
|
||||
x += __fmt_occ_e2_aux2(fmt, ti, *p);
|
||||
}
|
||||
#endif
|
||||
#else // 没有加入middle check point interval
|
||||
#if FMT_OCC_INTERVAL > 16
|
||||
uint32_t *end = p + ((k >> 3) - ((k & ~FMT_OCC_INTV_MASK) >> 3));
|
||||
// p = end - (end - p) / 4;
|
||||
for (; p < end; ++p)
|
||||
{
|
||||
x += __fmt_occ_e2_aux2(fmt, ti, *p);
|
||||
}
|
||||
#else // FMT_OCC_INTERVAL等于16的时候,只需要判断一次就可以
|
||||
if ((k & FMT_OCC_INTV_MASK) >> 3)
|
||||
{
|
||||
x += __fmt_occ_e2_aux2(fmt, ti, *p);
|
||||
++p;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
tmp = *p & ~((1U << ((~k & 7) << 2)) - 1);
|
||||
x += __fmt_occ_e2_aux2(fmt, ti, tmp);
|
||||
|
||||
|
|
@ -456,11 +417,6 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t
|
|||
// 扩展两个碱基
|
||||
inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2)
|
||||
{
|
||||
#ifdef SHOW_PERF
|
||||
#if 0
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
#endif
|
||||
bwtint_t tk[4], tl[4];
|
||||
bwtintv_t intv = {0};
|
||||
// tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积
|
||||
|
|
@ -476,22 +432,11 @@ inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwti
|
|||
intv.x[!is_back] = fmt->L2[b2] + 1 + tk[3];
|
||||
intv.x[2] = tl[3] - tk[3];
|
||||
*ok2 = intv;
|
||||
#ifdef SHOW_PERF
|
||||
#if 0
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bwt_extend, tmp_time);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
// 扩展一个碱基
|
||||
inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1)
|
||||
{
|
||||
#ifdef SHOW_PERF
|
||||
#if 0
|
||||
int64_t tmp_time = realtime_msec();
|
||||
#endif
|
||||
#endif
|
||||
bwtint_t tk[4], tl[4];
|
||||
int b2 = 3; // 如果只扩展一次,那么第二个碱基设置成T,可以减小一些计算量,如计算大于b2的累积数量
|
||||
// tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积
|
||||
|
|
@ -502,90 +447,70 @@ inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int i
|
|||
ok->x[2] = tl[1] - tk[1];
|
||||
// 第一次正向扩展
|
||||
ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
#ifdef SHOW_PERF
|
||||
#if 0
|
||||
tmp_time = realtime_msec() - tmp_time;
|
||||
__sync_fetch_and_add(&time_bwt_extend, tmp_time);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
// 获取kmer的fmt匹配信息
|
||||
inline void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos)
|
||||
// 序列和参考基因直接对比
|
||||
static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt)
|
||||
{
|
||||
bwtint_t x0, x1, x2;
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = mem_addr + byte_idx;
|
||||
x0 = *arr;
|
||||
x0 = (x0 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x1 = *arr;
|
||||
x1 = (x1 << 32) | *((uint32_t *)(arr + 1));
|
||||
arr += 5;
|
||||
x2 = *((uint32_t *)arr);
|
||||
ok->x[0] = x0;
|
||||
ok->x[1] = x1;
|
||||
ok->x[2] = x2;
|
||||
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
|
||||
#define EXTEND_BASE_LOOP(qcond, rcond, qstep, rstep) \
|
||||
while (k != qcond && r != rcond) \
|
||||
{ \
|
||||
const int base = PAC_BASE(fmt->pac, r); \
|
||||
if (q[k] != base) break; \
|
||||
k += qstep; \
|
||||
r += rstep; \
|
||||
}
|
||||
#define EXTEND_BASE_LOOP_COMP(qcond, rcond, qstep, rstep) \
|
||||
while (k != qcond && r != rcond) \
|
||||
{ \
|
||||
const int base = 3 - PAC_BASE(fmt->pac, r); \
|
||||
if (q[k] != base) break; \
|
||||
k += qstep; \
|
||||
r += rstep; \
|
||||
}
|
||||
|
||||
int k;
|
||||
int64_t r, rp;
|
||||
mt->num_match = 1;
|
||||
rp = fmt_sa(fmt, mtx_line);
|
||||
r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
|
||||
k = right_pos;
|
||||
if (rp < fmt->l_pac) // 匹配到了正向链
|
||||
{
|
||||
// 向前继续扩展
|
||||
r += right_pos - left_pos;
|
||||
EXTEND_BASE_LOOP(len, fmt->l_pac, 1, 1);
|
||||
mt->rm[0].qe = k;
|
||||
mt->rm[0].reverse = 0;
|
||||
// 向后扩展,x位置之前的碱基
|
||||
r -= k - left_pos + 1;
|
||||
k = left_pos - 1;
|
||||
EXTEND_BASE_LOOP(-1, -1, -1, -1);
|
||||
mt->rm[0].qs = k + 1;
|
||||
mt->rm[0].rs = r + 1;
|
||||
}
|
||||
else // 匹配到了互补链
|
||||
{
|
||||
r -= right_pos - left_pos;
|
||||
EXTEND_BASE_LOOP_COMP(len, -1, 1, -1);
|
||||
mt->rm[0].qe = k;
|
||||
mt->rm[0].reverse = 1;
|
||||
// 扩展x之前的碱基
|
||||
r += k - left_pos + 1;
|
||||
k = left_pos - 1;
|
||||
EXTEND_BASE_LOOP_COMP(-1, fmt->l_pac, -1, 1);
|
||||
mt->rm[0].qs = k + 1;
|
||||
mt->rm[0].rs = r - 1;
|
||||
}
|
||||
mt->info = mt->rm[0].qs;
|
||||
mt->info = mt->info << 32 | mt->rm[0].qe;
|
||||
mt->x[2] = 1;
|
||||
}
|
||||
|
||||
// 设置kmer第pos个碱基对应的fmt匹配信息
|
||||
inline void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos)
|
||||
static inline void fmt_reverse_intvs(bwtintv_v *p)
|
||||
{
|
||||
int byte_idx = pos * 14;
|
||||
uint8_t *arr = mem_addr + byte_idx;
|
||||
arr[0] = (uint8_t)(ik.x[0] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0];
|
||||
arr += 5;
|
||||
arr[0] = (uint8_t)(ik.x[1] >> 32);
|
||||
*((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1];
|
||||
arr += 5;
|
||||
*((uint32_t *)arr) = (uint32_t)ik.x[2];
|
||||
}
|
||||
|
||||
// 获取kmer对应的fmt匹配信息, pos should be [0, 13]
|
||||
inline void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos)
|
||||
{
|
||||
|
||||
if (pos == 13)
|
||||
{
|
||||
kmer_getval_at(fmt->kmer_hash.ke14[qbit].intv_arr, ok, 0);
|
||||
}
|
||||
else if (pos == 12)
|
||||
{
|
||||
kmer_getval_at(fmt->kmer_hash.ke13[qbit >> 2].intv_arr, ok, 0);
|
||||
}
|
||||
else if (pos == 11)
|
||||
{
|
||||
kmer_getval_at(fmt->kmer_hash.ke12[qbit >> 4].intv_arr, ok, 0);
|
||||
}
|
||||
else if (pos == 10)
|
||||
{
|
||||
kmer_getval_at(fmt->kmer_hash.ke11[qbit >> 6].intv_arr, ok, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
kmer_getval_at(fmt->kmer_hash.ke10[qbit >> 8].intv_arr, ok, pos);
|
||||
}
|
||||
}
|
||||
|
||||
// 生成所有KMER_LEN长度的序列,字符串表示
|
||||
void gen_all_seq(char **seq_arr, int kmer_len)
|
||||
{
|
||||
uint32_t seq_up_val = (1 << (kmer_len << 1));
|
||||
for (uint32_t i = 0; i < seq_up_val; ++i)
|
||||
{
|
||||
seq_arr[i] = (char *)malloc(kmer_len);
|
||||
for (int j = kmer_len - 1; j >= 0; --j)
|
||||
{
|
||||
seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void fmt_reverse_intvs(bwtintv_v *p)
|
||||
{
|
||||
if (p->n > 1)
|
||||
{
|
||||
if (p->n > 1) {
|
||||
int j;
|
||||
for (j = 0; j < p->n >> 1; ++j)
|
||||
{
|
||||
|
|
@ -596,58 +521,23 @@ static void fmt_reverse_intvs(bwtintv_v *p)
|
|||
}
|
||||
}
|
||||
|
||||
// 创建正向的kmer
|
||||
inline static uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed)
|
||||
{
|
||||
uint64_t qbit = 0;
|
||||
int i;
|
||||
qlen = qlen < kmer_len ? qlen : kmer_len;
|
||||
for (i = 0; i < qlen; ++i)
|
||||
{
|
||||
if (q[i] > 3) // 要考虑碱基是N
|
||||
break;
|
||||
qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1);
|
||||
}
|
||||
*base_consumed = i;
|
||||
return qbit;
|
||||
}
|
||||
|
||||
// 创建f反向的kmer
|
||||
inline static uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed)
|
||||
{
|
||||
uint64_t qbit = 0;
|
||||
int i, j, end_pos;
|
||||
end_pos = start_pos - kmer_len;
|
||||
end_pos = end_pos < 0 ? -1 : end_pos;
|
||||
for (i = start_pos, j = 0; i > end_pos; --i, ++j)
|
||||
{
|
||||
if (q[i] > 3) // 要考虑碱基是N
|
||||
break;
|
||||
qbit |= ((uint64_t)q[i]) << ((kmer_len - 1 - j) << 1);
|
||||
}
|
||||
*base_consumed = start_pos - i;
|
||||
return (~qbit) & ((1L << (kmer_len << 1)) - 1);
|
||||
}
|
||||
|
||||
// 找smem(seed)
|
||||
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec)
|
||||
{
|
||||
int i, j, ret, kmer_len;
|
||||
bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0};
|
||||
bwtintv_t mt = {0};
|
||||
bwtintv_v a[1], *curr;
|
||||
bwtintv_v *curr;
|
||||
uint32_t qbit = 0;
|
||||
mem->n = 0;
|
||||
int only_forward = x == 0 || q[x - 1] > 3;
|
||||
|
||||
if (q[x] > 3) return x + 1;
|
||||
|
||||
if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
|
||||
kv_init(a[0]);
|
||||
curr = tmpvec && tmpvec[0] ? tmpvec[0] : &a[0]; // use the temporary vector if provided
|
||||
curr = tmpvec; // use the temporary vector if provided
|
||||
|
||||
qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
fmt_kmer_get(fmt, &ik, qbit, 0); // 初始碱基位置
|
||||
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置
|
||||
ik.info = x + 1;
|
||||
|
||||
// check change of the interval size and whether the interval size is too small to be extended further
|
||||
|
|
@ -657,125 +547,37 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
do { kv_push(bwtintv_t, *curr, iv); goto backward_search; } while(0)
|
||||
|
||||
// 处理kmer对应的匹配信息
|
||||
if (only_forward)
|
||||
j = kmer_len - 1;
|
||||
else
|
||||
j = 1;
|
||||
if (only_forward) j = kmer_len - 1;
|
||||
else j = 1;
|
||||
for (curr->n = 0; j < kmer_len; ++j)
|
||||
{
|
||||
fmt_kmer_get(fmt, &ok1, qbit, j);
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j);
|
||||
CHECK_INTV_CHANGE(ik, ok1, x + j + 1);
|
||||
}
|
||||
if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后
|
||||
PUSH_VAL_AND_SKIP(ik);
|
||||
|
||||
#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
|
||||
|
||||
// 扩展kmer之后的碱基
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
for (i = (int)ik.info; i + 1 < len; i += 2)
|
||||
{ // forward search
|
||||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
{
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
CHECK_INTV_CHANGE(ik, ok1, i + 1);
|
||||
CHECK_INTV_CHANGE(ik, ok2, i + 2);
|
||||
// 在这里进行判断是否只有一个候选了
|
||||
//if (0)
|
||||
if (min_intv < 2 && ok2.x[2] == min_intv)
|
||||
{ // 最多处理两个
|
||||
mt.num_match = min_intv;
|
||||
int64_t r, rp;
|
||||
int k;
|
||||
int max_qs = 0, min_qe = len;
|
||||
for (j = 0; j < ok2.x[2]; ++j)
|
||||
{
|
||||
rp = fmt_sa(fmt, ok2.x[0] + j);
|
||||
r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp;
|
||||
k = i + 2;
|
||||
if (rp < fmt->l_pac) // 匹配到了正向链
|
||||
{
|
||||
// 向前继续扩展
|
||||
r += i + 2 - x;
|
||||
while (k < len && r < fmt->l_pac) {
|
||||
int base = PAC_BASE(fmt->pac, r);
|
||||
if (q[k] != base)
|
||||
break;
|
||||
++k;
|
||||
++r;
|
||||
}
|
||||
mt.rm[j].qe = k;
|
||||
mt.rm[j].reverse = 0;
|
||||
// 向后扩展,x位置之前的碱基
|
||||
r -= k - x + 1;
|
||||
k = x - 1;
|
||||
while (k > -1 && r > -1) {
|
||||
int base = PAC_BASE(fmt->pac, r);
|
||||
if (q[k] != base)
|
||||
break;
|
||||
--k;
|
||||
--r;
|
||||
}
|
||||
mt.rm[j].qs = k + 1;
|
||||
mt.rm[j].rs = r + 1;
|
||||
}
|
||||
else // 匹配到了互补链
|
||||
{
|
||||
r -= i + 2 - x;
|
||||
while (k < len && r > -1)
|
||||
{
|
||||
const int base = 3 - PAC_BASE(fmt->pac, r);
|
||||
if (q[k] != base)
|
||||
break;
|
||||
++k;
|
||||
--r;
|
||||
}
|
||||
mt.rm[j].qe = k;
|
||||
mt.rm[j].reverse = 1;
|
||||
// 扩展x之前的碱基
|
||||
r += k - x + 1;
|
||||
k = x - 1;
|
||||
while (k > -1 && r < fmt->l_pac)
|
||||
{
|
||||
const int base = 3 - PAC_BASE(fmt->pac, r);
|
||||
if (q[k] != base)
|
||||
break;
|
||||
--k;
|
||||
++r;
|
||||
}
|
||||
mt.rm[j].qs = k + 1;
|
||||
mt.rm[j].rs = r - 1;
|
||||
}
|
||||
max_qs = max_qs < mt.rm[j].qs ? mt.rm[j].qs : max_qs;
|
||||
min_qe = min_qe > mt.rm[j].qe ? mt.rm[j].qe : min_qe;
|
||||
}
|
||||
if (min_intv > 1)
|
||||
{
|
||||
// 修正两个匹配的位置,使比对的碱基个数和位置一致
|
||||
for (j = 0; j < min_intv; ++j)
|
||||
{
|
||||
if (mt.rm[j].reverse)
|
||||
mt.rm[j].rs -= max_qs - mt.rm[j].qs;
|
||||
else
|
||||
mt.rm[j].rs += max_qs - mt.rm[j].qs;
|
||||
mt.rm[j].qs = max_qs;
|
||||
mt.rm[j].qe = min_qe;
|
||||
}
|
||||
}
|
||||
mt.info = mt.rm[0].qs;
|
||||
mt.info = mt.info << 32 | mt.rm[0].qe;
|
||||
mt.x[2] = min_intv;
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
ret = (uint32_t)mt.info;
|
||||
if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3)
|
||||
{
|
||||
goto fmt_smem_end;
|
||||
}
|
||||
goto backward_search;
|
||||
}
|
||||
//if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||
//{
|
||||
// direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
|
||||
// kv_push(bwtintv_t, *mem, mt);
|
||||
// ret = (uint32_t)mt.info;
|
||||
// if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end;
|
||||
// goto backward_search;
|
||||
//}
|
||||
} else if (q[i] < 4) // q[i+1] >= 4
|
||||
{
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]);
|
||||
|
|
@ -802,26 +604,30 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv
|
|||
|
||||
backward_search:
|
||||
fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
|
||||
if (mt.num_match == 0)
|
||||
ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置
|
||||
if (mt.num_match == 0) ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置
|
||||
|
||||
// 按照种子进行遍历,反向扩展
|
||||
#define CHECK_ADD_MEM(pos, intv, mem) \
|
||||
if (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32) { (intv).info |= (uint64_t)(pos) << 32; kv_push(bwtintv_t, *mem, (intv)); }
|
||||
if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) { \
|
||||
(intv).info |= (uint64_t)(pos) << 32; kv_push(bwtintv_t, *mem, (intv)); \
|
||||
}
|
||||
|
||||
#define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \
|
||||
if (ok.x[2] < min_intv) { CHECK_ADD_MEM(pos, intv, mem); break; }
|
||||
if (ok.x[2] < min_intv) { CHECK_ADD_MEM(pos, intv, mem); break; }
|
||||
|
||||
int last_kmer_start = 0;
|
||||
for (j = 0; j < curr->n; ++j)
|
||||
{
|
||||
bwtintv_t *p = &curr->a[j]; // 前向扩展的种子
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
|
||||
uint64_t qbit = 0;
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2);
|
||||
if (!only_forward && p->info - x < HASH_KMER_LEN) {
|
||||
qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
|
||||
if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4)
|
||||
qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer
|
||||
else qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer
|
||||
last_kmer_start = p->info - 1;
|
||||
i = 1;
|
||||
do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv);
|
||||
do { bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv);
|
||||
if (i > 2) continue;
|
||||
p->x[0] = ik.x[1]; p->x[1] = ik.x[0]; p->x[2] = ik.x[2];
|
||||
i = p->info - (kmer_len - i + 3);
|
||||
|
|
@ -833,8 +639,8 @@ backward_search:
|
|||
if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展
|
||||
{
|
||||
fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2);
|
||||
CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem);
|
||||
ok1.info = p->info;
|
||||
CHECK_INTV_ADD_MEM(ok2, i, ok1, mem);
|
||||
|
|
@ -873,8 +679,6 @@ backward_search:
|
|||
|
||||
fmt_smem_end:
|
||||
fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate
|
||||
if (tmpvec == 0 || tmpvec[0] == 0)
|
||||
free(a[0].a);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
@ -883,18 +687,18 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
int i, kmer_len;
|
||||
bwtintv_t ik = {0}, ok1={0}, ok2={0};
|
||||
uint64_t qbit;
|
||||
|
||||
|
||||
memset(mem, 0, sizeof(bwtintv_t));
|
||||
if (q[x] > 3)
|
||||
return x + 1;
|
||||
if (q[x] > 3) return x + 1;
|
||||
|
||||
qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
|
||||
fmt_kmer_get(fmt, &ik, qbit, kmer_len-1); // 初始碱基位置
|
||||
qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len);
|
||||
bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len-1); // 初始碱基位置
|
||||
ik.info = x + kmer_len;
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
|
||||
//fmt_set_intv(fmt, q[x], ik);
|
||||
//ik.info = x + 1;
|
||||
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2);
|
||||
|
||||
#define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \
|
||||
if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \
|
||||
|
|
@ -909,8 +713,8 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in
|
|||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
{
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
__builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2);
|
||||
// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2);
|
||||
COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len);
|
||||
COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len);
|
||||
ik = ok2;
|
||||
|
|
@ -938,7 +742,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_
|
|||
{
|
||||
uint32_t *p;
|
||||
uint8_t base2;
|
||||
|
||||
// 第一步,找到check point位置
|
||||
p = fmt_occ_intv(fmt, k); // check point起始位置
|
||||
p += 20; // bwt碱基起始位置
|
||||
|
|
@ -949,7 +752,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_
|
|||
// 第三步,找到具体的uint32_t
|
||||
p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基(和8个倒数第二bwt碱基)
|
||||
// 第四步,获取碱基
|
||||
|
||||
base2 = *p >> ((~(k) & 0x7) << 2) & 0xf;
|
||||
*b2 = base2 >> 2 & 3;
|
||||
*b1 = base2 & 3;
|
||||
|
|
@ -982,13 +784,6 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
|
|||
++sa;
|
||||
k = k2;
|
||||
}
|
||||
//#ifdef SHOW_PERF
|
||||
// int64_t tmp_time = realtime_msec();
|
||||
//#endif
|
||||
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
|
||||
//#ifdef SHOW_PERF
|
||||
// tmp_time = realtime_msec() - tmp_time;
|
||||
// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time);
|
||||
//#endif
|
||||
return sa;
|
||||
}
|
||||
35
fmt_idx.h
35
fmt_idx.h
|
|
@ -49,33 +49,7 @@ Date : 2023/12/24
|
|||
((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff)
|
||||
|
||||
// sa存储的行间隔
|
||||
#define SA_INTV 2
|
||||
|
||||
#define HASH_KMER_LEN 14
|
||||
|
||||
#define BIT_KMER_LEN 17
|
||||
|
||||
// 用来保存kmer对应的fmt的位置信息
|
||||
typedef struct
|
||||
{
|
||||
// 40+40+32 14个byte,这样好处理
|
||||
uint8_t intv_arr[14]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据
|
||||
} KmerEntry;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint8_t intv_arr[140]; // 保存长度为10的kmer,每个碱基对应的bwt匹配信息
|
||||
} KmerEntryArr;
|
||||
|
||||
// 保存各个位置对应的bwt匹配信息
|
||||
typedef struct
|
||||
{
|
||||
KmerEntryArr *ke10;
|
||||
KmerEntry *ke11;
|
||||
KmerEntry *ke12;
|
||||
KmerEntry *ke13;
|
||||
KmerEntry *ke14;
|
||||
} KmerHash;
|
||||
#define SA_INTV 4
|
||||
|
||||
// fm-index, extend twice in one search step (one memory access)
|
||||
typedef struct
|
||||
|
|
@ -96,9 +70,6 @@ typedef struct
|
|||
uint8_t *pac; // 保存2bit编码的参考序列
|
||||
// 保存kmer对应的fmt位置信息
|
||||
KmerHash kmer_hash;
|
||||
// kmer_bit好像效果一般
|
||||
uint16_t *kmer_bit; // 用来检测特定长度序列有没有fm-index匹配
|
||||
int bit_kmer_len;
|
||||
// suffix array
|
||||
int sa_intv;
|
||||
bwtint_t n_sa;
|
||||
|
|
@ -113,8 +84,6 @@ FMTIndex *fmt_restore_fmt(const char *fn);
|
|||
void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh);
|
||||
// 从文件中读取kmer hash信息
|
||||
KmerHash fmt_restore_kmer_idx(const char *fn);
|
||||
// 读取kmer bit数据
|
||||
uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len);
|
||||
// 读取sa数据
|
||||
void fmt_restore_sa(const char *fn, FMTIndex *fmt);
|
||||
// 根据interval-bwt创建fmt-index
|
||||
|
|
@ -133,7 +102,7 @@ void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
|
|||
void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos);
|
||||
void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos);
|
||||
// 找smem(seed)
|
||||
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec);
|
||||
|
||||
int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
|
||||
|
||||
|
|
|
|||
3
main.c
3
main.c
|
|
@ -37,6 +37,7 @@ int bwa_fa2pac(int argc, char *argv[]);
|
|||
int bwa_pac2bwt(int argc, char *argv[]);
|
||||
int bwa_bwtupdate(int argc, char *argv[]);
|
||||
int bwa_bwt2sa(int argc, char *argv[]);
|
||||
int bwa_bwt2bytesa(int argc, char *argv[]);
|
||||
int bwa_index(int argc, char *argv[]);
|
||||
int bwt_bwtgen_main(int argc, char *argv[]);
|
||||
|
||||
|
|
@ -75,6 +76,7 @@ static int usage()
|
|||
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
|
||||
fprintf(stderr, " bwtupdate update .bwt to the new format\n");
|
||||
fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
|
||||
fprintf(stderr, " bwt2bytesa generate SA(using byte array) from BWT and Occ\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr,
|
||||
"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
|
||||
|
|
@ -100,6 +102,7 @@ int main(int argc, char *argv[])
|
|||
else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,108 @@
|
|||
@SQ SN:1 LN:249250621
|
||||
@SQ SN:2 LN:243199373
|
||||
@SQ SN:3 LN:198022430
|
||||
@SQ SN:4 LN:191154276
|
||||
@SQ SN:5 LN:180915260
|
||||
@SQ SN:6 LN:171115067
|
||||
@SQ SN:7 LN:159138663
|
||||
@SQ SN:8 LN:146364022
|
||||
@SQ SN:9 LN:141213431
|
||||
@SQ SN:10 LN:135534747
|
||||
@SQ SN:11 LN:135006516
|
||||
@SQ SN:12 LN:133851895
|
||||
@SQ SN:13 LN:115169878
|
||||
@SQ SN:14 LN:107349540
|
||||
@SQ SN:15 LN:102531392
|
||||
@SQ SN:16 LN:90354753
|
||||
@SQ SN:17 LN:81195210
|
||||
@SQ SN:18 LN:78077248
|
||||
@SQ SN:19 LN:59128983
|
||||
@SQ SN:20 LN:63025520
|
||||
@SQ SN:21 LN:48129895
|
||||
@SQ SN:22 LN:51304566
|
||||
@SQ SN:X LN:155270560
|
||||
@SQ SN:Y LN:59373566
|
||||
@SQ SN:MT LN:16569
|
||||
@SQ SN:GL000207.1 LN:4262
|
||||
@SQ SN:GL000226.1 LN:15008
|
||||
@SQ SN:GL000229.1 LN:19913
|
||||
@SQ SN:GL000231.1 LN:27386
|
||||
@SQ SN:GL000210.1 LN:27682
|
||||
@SQ SN:GL000239.1 LN:33824
|
||||
@SQ SN:GL000235.1 LN:34474
|
||||
@SQ SN:GL000201.1 LN:36148
|
||||
@SQ SN:GL000247.1 LN:36422
|
||||
@SQ SN:GL000245.1 LN:36651
|
||||
@SQ SN:GL000197.1 LN:37175
|
||||
@SQ SN:GL000203.1 LN:37498
|
||||
@SQ SN:GL000246.1 LN:38154
|
||||
@SQ SN:GL000249.1 LN:38502
|
||||
@SQ SN:GL000196.1 LN:38914
|
||||
@SQ SN:GL000248.1 LN:39786
|
||||
@SQ SN:GL000244.1 LN:39929
|
||||
@SQ SN:GL000238.1 LN:39939
|
||||
@SQ SN:GL000202.1 LN:40103
|
||||
@SQ SN:GL000234.1 LN:40531
|
||||
@SQ SN:GL000232.1 LN:40652
|
||||
@SQ SN:GL000206.1 LN:41001
|
||||
@SQ SN:GL000240.1 LN:41933
|
||||
@SQ SN:GL000236.1 LN:41934
|
||||
@SQ SN:GL000241.1 LN:42152
|
||||
@SQ SN:GL000243.1 LN:43341
|
||||
@SQ SN:GL000242.1 LN:43523
|
||||
@SQ SN:GL000230.1 LN:43691
|
||||
@SQ SN:GL000237.1 LN:45867
|
||||
@SQ SN:GL000233.1 LN:45941
|
||||
@SQ SN:GL000204.1 LN:81310
|
||||
@SQ SN:GL000198.1 LN:90085
|
||||
@SQ SN:GL000208.1 LN:92689
|
||||
@SQ SN:GL000191.1 LN:106433
|
||||
@SQ SN:GL000227.1 LN:128374
|
||||
@SQ SN:GL000228.1 LN:129120
|
||||
@SQ SN:GL000214.1 LN:137718
|
||||
@SQ SN:GL000221.1 LN:155397
|
||||
@SQ SN:GL000209.1 LN:159169
|
||||
@SQ SN:GL000218.1 LN:161147
|
||||
@SQ SN:GL000220.1 LN:161802
|
||||
@SQ SN:GL000213.1 LN:164239
|
||||
@SQ SN:GL000211.1 LN:166566
|
||||
@SQ SN:GL000199.1 LN:169874
|
||||
@SQ SN:GL000217.1 LN:172149
|
||||
@SQ SN:GL000216.1 LN:172294
|
||||
@SQ SN:GL000215.1 LN:172545
|
||||
@SQ SN:GL000205.1 LN:174588
|
||||
@SQ SN:GL000219.1 LN:179198
|
||||
@SQ SN:GL000224.1 LN:179693
|
||||
@SQ SN:GL000223.1 LN:180455
|
||||
@SQ SN:GL000195.1 LN:182896
|
||||
@SQ SN:GL000212.1 LN:186858
|
||||
@SQ SN:GL000222.1 LN:186861
|
||||
@SQ SN:GL000200.1 LN:187035
|
||||
@SQ SN:GL000193.1 LN:189789
|
||||
@SQ SN:GL000194.1 LN:191469
|
||||
@SQ SN:GL000225.1 LN:211173
|
||||
@SQ SN:GL000192.1 LN:547496
|
||||
@SQ SN:NC_007605 LN:171823
|
||||
@SQ SN:hs37d5 LN:35477943
|
||||
@HD VN:1.5 SO:unsorted GO:query
|
||||
@RG ID:normal SM:normal PL:illumina LB:normal PG:bwa
|
||||
@PG ID:bwa PN:bwa VN:0.7.17-r1198-dirty CL:./bwa mem -t 1 -M -R @RG\tID:normal\tSM:normal\tPL:illumina\tLB:normal\tPG:bwa /home/zzh/reference/human_g1k_v37_decoy.fasta /home/zzh/fastq/diff_r1.fq /home/zzh/fastq/diff_r2.fq -o ./out.sam
|
||||
A00289:748:HLCH3DSX3:4:1102:24749:34929 97 16 75642166 60 150M 12 66451372 0 AAAGCCATTGGCAGTAACATCCAAAGGCTGCTCAGGAACTGCACTCCAGCTGATGGCTATGAAAAGATAAGATTCTAGGGTTAAAACTGCTCAATTTCTTGTGAAAGTCAGAAGTATTTTAATAAAAATATTTCTCTGACTCTACAGACC FFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFF: NM:i:0 MD:Z:150 MC:Z:17S91M42S AS:i:150 XS:i:0 RG:Z:normal
|
||||
A00289:748:HLCH3DSX3:4:1102:24749:34929 145 12 66451372 0 17S91M42S 16 75642166 0 TTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATAAATAATTCTTTGTGAAACCCCT F,F,FFF,:FFF,,F,F,,F,F:,FFF:F:FFF:FFFFFFF:F,F,F,FFFFFF::,,FFFFFFF,:F,:,F,::,,F:FF:,FF:F::F::FFFFFFFFFF,F:F,FFFF,FFFFF,FF,F,,F,F,,,,,,:,,,,,,,,,,,,,,,F NM:i:0 MD:Z:91 MC:Z:150M AS:i:91 XS:i:90 RG:Z:normal SA:Z:12,49106363,+,117S33M,0,0; XA:Z:12,-66451373,35S90M25S,0;12,-66451373,28S90M32S,0;6,-160521757,23S83M44S,0;15,-77910871,23S79M48S,0;
|
||||
A00289:748:HLCH3DSX3:4:1102:24749:34929 385 12 49106363 0 117H33M 16 75642166 0 AAAAAAAAAAAAAAATAAAAAAAAAAAAAAAAA FFF:F:FFF,:F,F,,F,F,,FFF:,FFF,F,F NM:i:0 MD:Z:33 MC:Z:150M AS:i:33 XS:i:32 RG:Z:normal SA:Z:12,66451372,-,17S91M42S,0,0; XA:Z:4,-19079503,32M118S,0;15,-55218248,32M118S,0;X,-149551159,32M118S,0;
|
||||
A00289:748:HLCH3DSX3:4:1104:6234:8656 113 12 66451373 8 12S91M47S 22 18873826 0 TTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTTGTTTTTTTTTTTTTTGGGGGGGGGGGGGGGTGAGGTGGTCTGTG ,,:FFF:,:,::,,,,,,FF,FFF,,FF,F,FFF:,,:FFFFFF:FFF,:,:FF:F:::::FFF::,::F:FFFFFF:,FFFF,,,:FFFFFFFF:,,::,,,,,:,,FFF:,,,FF:F,:,FFFF:F::,,FF:,:FFFFFF:FFFFFF NM:i:1 MD:Z:13T77 MC:Z:150M AS:i:86 XS:i:81 RG:Z:normal SA:Z:3,121668661,-,89S32M29S,0,0; XA:Z:12,-66451372,25S91M34S,2;6,-160521757,19S84M47S,1;15,-77910874,26S77M47S,0;7,-107410599,25M1I3M15D74M47S,16;
|
||||
A00289:748:HLCH3DSX3:4:1104:6234:8656 369 3 121668661 0 89H32M29H 22 18873826 0 TTTTTTTTTTTTTGTTTGTTTTTTTTTTTTTT FFFFFF:,,::,,,,,:,,FFF:,,,FF:F,: NM:i:0 MD:Z:32 MC:Z:150M AS:i:32 XS:i:32 RG:Z:normal SA:Z:12,66451373,-,12S91M47S,8,1;
|
||||
A00289:748:HLCH3DSX3:4:1104:6234:8656 177 22 18873826 22 150M 12 66451373 0 CTATACCCCACATCCTCATTCCCACCAGATGTCTTGGATCATGGAGGCTCTCCAGACAAAAGCCAGCAGTTAAGCTCCAGATTTCCTATAGAATCCTTTTCTAACAACCAGTGAGTGATTCCAGAATACGTACCATTGAATGTGCTCCCT FFF:FFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:12S91M47S AS:i:150 XS:i:140 RG:Z:normal
|
||||
A00289:748:HLCH3DSX3:4:1104:1741:18129 97 11 113220686 60 150M 12 66451373 0 TATTTCACCCGGATGCTGACTCCTTTTTTGGTTTCCTTGCAAGGTTACCCTGTTGGGATGTTGTTAGTTGGAGGACAGCTACCTCTGAGGTTATTATGGTTTTTTGCAGATTATTGGAAGCGCTGGTGTCATTTCTTGATTTCTCGGATA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFF,FFFFFFFFFFFFFFFFFF:FFFFFFF:FFFF NM:i:0 MD:Z:150 MC:Z:90M60S AS:i:150 XS:i:19 RG:Z:normal
|
||||
A00289:748:HLCH3DSX3:4:1104:1741:18129 145 12 66451373 7 90M60S 11 113220686 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTCCCAC FF:FFFF,F,,FFFFFFFF:FFFF:,F,,:F:F::::,F::::FFF::F::,,:FFF,:::FFFFFFFFFFF:FFFFFFFF,FFFF:FFF,:,:,,::FFFFFFFFFFF,FF:FFF:F:F:F:,F,,::F,,F,,FFFFF:F:,,,,,,, NM:i:1 MD:Z:81T8 MC:Z:150M AS:i:85 XS:i:81 RG:Z:normal SA:Z:8,112168232,-,79S59M12S,0,1; XA:Z:6,-160521757,81M69S,0;4,-82515382,13S69M68S,0;
|
||||
A00289:748:HLCH3DSX3:4:1104:1741:18129 401 8 112168232 0 79H59M12H 11 113220686 0 TTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTT FF,FFFF:FFF,:,:,,::FFFFFFFFFFF,FF:FFF:F:F:F:,F,,::F,,F,,FFF NM:i:1 MD:Z:46T12 MC:Z:150M AS:i:54 XS:i:53 RG:Z:normal SA:Z:12,66451373,-,90M60S,7,1;
|
||||
A00289:748:HLCH3DSX3:4:1105:27281:29700 113 X 117117442 60 150M 12 66451372 0 CCCTGAAGTTGATTTTTTGAATATATCCTTGAAATCGGCTTGTTTTAATGACAATTGCCTCCGTCTCCTGAAAGTGGAGTGCAAAAAGAAACCTGGTTAATTGTAGTTTTATTATATTTAGTTCAAATTTTTCAAAGCATACCCTGACAG FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:12S89M49S AS:i:150 XS:i:0 RG:Z:normal
|
||||
A00289:748:HLCH3DSX3:4:1105:27281:29700 177 12 66451372 0 12S89M49S X 117117442 0 TTTTTTAAAAATATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTATTTATTTTTTTATTTTTTTATTATTTTTTTTTTTTTTTTTTTTTTTTTGGTGCCGCCCCC ,,,,:,,F:F,,,,:FF,F:FF,::F:F:F,,F::FFF,FFFFFFFFF,FFFFFF::FFF:F:FFFFFFF,FFFFFFF::FFFF:,:FF:,,F,:,FFFFF,:,,:F,,,,:,FFFFF:F,,FFF:FFFFF:,F,F,,,:,,,,,:FFFF NM:i:3 MD:Z:73T3T3T7 MC:Z:150M AS:i:74 XS:i:72 RG:Z:normal SA:Z:15,85942734,+,20S48M82S,0,1;
|
||||
A00289:748:HLCH3DSX3:4:1105:27281:29700 417 15 85942734 0 20H48M82H X 117117442 0 AAAAAAAAAAAAAAAAATAATAAAAAAATAAAAAAATAAATAAATAAA FFFF:FFF,,F:FFFFF,:,,,,F:,,:,FFFFF,:,F,,:FF:,:FF NM:i:1 MD:Z:17A30 MC:Z:150M AS:i:43 XS:i:36 RG:Z:normal SA:Z:12,66451372,-,12S89M49S,0,3; XA:Z:11,-118049564,69S35M7D34M12S,11;2,+204499190,12S31M1D21M86S,3;
|
||||
A00289:748:HLCH3DSX3:4:1106:2880:19288 97 4 169630630 0 29M1I4M1I68M9D22M25S 12 66451373 0 CAGCCTGGGTGACAGAGTGAGACTCCAGCTAAGGCAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAATGAAGGAAGGAAGGAGGGAGGGAGGGAGGGAGGGAGAGATCGGAAGAGCACACGTCTGAAC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFF:FFF:FFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFF NM:i:14 MD:Z:18C8T59G13^AAGGAAGGA22 MC:Z:36S91M23S AS:i:73 XS:i:72 RG:Z:normal
|
||||
A00289:748:HLCH3DSX3:4:1106:2880:19288 145 12 66451373 9 36S91M23S 4 169630630 0 TATATACTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTGGGTGGGTGGGTGGGTTGGCAG :,,,,,,,:FF:FFFF:,:FF:,F,F,FFF,,,FFF:FFFFFFFFF,:,,F:F,FFFF,:FFFFFFFFFFFFFFFFFFFF:FF:FFFF:FFFFFFF:FFF:FFFFFFF:FF,FFF:,:::,:F::,,:,,:,:,,,,,:,::,,,:F,FF NM:i:1 MD:Z:10T80 MC:Z:29M1I4M1I68M9D22M25S AS:i:86 XS:i:80 RG:Z:normal SA:Z:15,55218228,-,9S52M89S,0,0; XA:Z:15,-77910871,47S80M23S,0;6,-160521761,47S80M23S,0;6,-160521757,47S79M24S,0;7,-107410599,25S28M15D74M23S,16;
|
||||
A00289:748:HLCH3DSX3:4:1106:2880:19288 401 15 55218228 0 9H52M89H 4 169630630 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTT FF:FFFF:,:FF:,F,F,FFF,,,FFF:FFFFFFFFF,:,,F:F,FFFF,:F NM:i:0 MD:Z:52 MC:Z:29M1I4M1I68M9D22M25H AS:i:52 XS:i:50 RG:Z:normal SA:Z:12,66451373,-,36S91M23S,9,1;
|
||||
A00289:748:HLCH3DSX3:4:1107:2365:20462 97 22 18737670 0 150M 12 66451372 0 GAGGACACCATACAAGCCGTCGCTAACAAGGACACTGTACACAACATCGCTAATGACGGCACCGTACAAGACATCACCAATGAGGGCGCTTTATACGACATTGCTAATGATACCGACAAGGCACGCTAACGTGGACGCTGTACACGACAT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:21S90M39S AS:i:150 XS:i:150 RG:Z:normal
|
||||
A00289:748:HLCH3DSX3:4:1107:2365:20462 145 12 66451372 13 21S90M39S 22 18737670 0 TTTTATATATTTATATTTATTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTTGTTTTTTGTGGTGGGGGGGGGGTTTGGCATAGCAT F:F,,F,F,F:FFF,,,:,F:,:,,:,,,,:,F:,,FFFFFFFFFFFFF,F:FFFFFFFFF:FFFF,,FFFFF:FFF::FFFFF,F:F,,F:F,FFFFFFFF,:F:F::::,,:F,F,,F,:,,,,,::,,,,:,,:,,,,,,F,F::FF NM:i:0 MD:Z:90 MC:Z:150M AS:i:90 XS:i:84 RG:Z:normal XA:Z:6,-160521757,28S84M38S,0;15,-77910871,32S80M38S,0;7,-107410642,38S74M38S,0;
|
||||
A00289:748:HLCH3DSX3:4:1107:26377:28385 113 GL000220.1 122227 60 150M 6 160521757 0 TGCTCTGTTGCTCACGCTGGTCTCAAACTCCTGGCCTTGACGCTTCTCCCGTCACATCCGCCGTCTGGTTGTTGAAATGAGCATCTCTCGTAAAATGGAAAAGATGAAAGAAATAAACACGAAGACGGAAAGCACGGTGTGAACGTTTCT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:85M65S AS:i:150 XS:i:113 RG:Z:normal
|
||||
A00289:748:HLCH3DSX3:4:1107:26377:28385 177 6 160521757 0 85M65S GL000220.1 122227 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGTGTTTTTTTTTTGTTGGGGGTTGGGTGTTGGGGGGCGGGGGGGGGGGCGCCCCCCCCCAAGGAGA F:FF,F,,F,F,FFFFFF,:FF,:FF:F,:FF,FFFF:FFF:FF,FFFFFFFFFFFFFFFFFFFFFFFF:FF,F,F:F,:FF,,,FFFF,F,:,,,,,:::,,:,,,,::,,,,,,,,,F,,FF,:F:,,F,,,,,FF:FF:F,FFFF:: NM:i:0 MD:Z:85 MC:Z:150M AS:i:85 XS:i:84 RG:Z:normal XA:Z:12,-66451380,84M66S,0;12,-66451373,83M67S,0;15,-77910871,4S80M66S,0;
|
||||
12
run.sh
12
run.sh
|
|
@ -6,17 +6,17 @@ thread=1
|
|||
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||
#n_r1=~/fastq/sn_r1.fq
|
||||
#n_r2=~/fastq/sn_r2.fq
|
||||
#n_r1=~/fastq/ssn_r1.fq
|
||||
#n_r2=~/fastq/ssn_r2.fq
|
||||
n_r1=~/fastq/ssn_r1.fq
|
||||
n_r2=~/fastq/ssn_r2.fq
|
||||
#n_r1=~/fastq/tiny_n_r1.fq
|
||||
#n_r2=~/fastq/tiny_n_r2.fq
|
||||
n_r1=~/fastq/diff_r1.fq
|
||||
n_r2=~/fastq/diff_r2.fq
|
||||
#n_r1=~/fastq/diff_r1.fq
|
||||
#n_r2=~/fastq/diff_r2.fq
|
||||
#n_r1=~/fastq/d_r1.fq
|
||||
#n_r2=~/fastq/d_r2.fq
|
||||
reference=~/reference/human_g1k_v37_decoy.fasta
|
||||
#out=./ssn.sam
|
||||
out=./out.sam
|
||||
out=./ssn.sam
|
||||
#out=./out.sam
|
||||
#out=/dev/null
|
||||
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||
|
|
|
|||
16
utils.c
16
utils.c
|
|
@ -139,6 +139,22 @@ size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
|
|||
return ret;
|
||||
}
|
||||
|
||||
uint64_t fread_fix(FILE *fp, uint64_t size, void *a)
|
||||
{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
|
||||
// const int bufsize = 0x1000000; // 16M block
|
||||
const int bufsize = 0x4000000; // 64M block
|
||||
uint64_t offset = 0;
|
||||
while (size)
|
||||
{
|
||||
int x = bufsize < size ? bufsize : size;
|
||||
if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0)
|
||||
break;
|
||||
size -= x;
|
||||
offset += x;
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
int err_gzread(gzFile file, void *ptr, unsigned int len)
|
||||
{
|
||||
int ret = gzread(file, ptr, len);
|
||||
|
|
|
|||
4
utils.h
4
utils.h
|
|
@ -48,7 +48,8 @@ extern int64_t time_ksw_extend2,
|
|||
time_bns;
|
||||
|
||||
extern int64_t dn, n16, n17, n18, n19, nall, num_sa;
|
||||
|
||||
extern int64_t s1n, s2n, s3n, s1l, s2l, s3l;
|
||||
extern int64_t g_num_smem2;
|
||||
extern FILE *fp1;
|
||||
|
||||
#endif
|
||||
|
|
@ -89,6 +90,7 @@ extern "C" {
|
|||
gzFile err_xzopen_core(const char *func, const char *fn, const char *mode);
|
||||
size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
|
||||
size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream);
|
||||
uint64_t fread_fix(FILE *fp, uint64_t size, void *a);
|
||||
|
||||
int err_gzread(gzFile file, void *ptr, unsigned int len);
|
||||
int err_fseek(FILE *stream, long offset, int whence);
|
||||
|
|
|
|||
Loading…
Reference in New Issue