diff --git a/.vscode/launch.json b/.vscode/launch.json index 170c3c5..2fef6c8 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -18,8 +18,8 @@ "-R", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", "~/reference/human_g1k_v37_decoy.fasta", - "~/fastq/tiny_n_r1.fq", - "~/fastq/tiny_n_r2.fq", + "~/fastq/diff_r1.fq", + "~/fastq/diff_r2.fq", "-o", "/dev/null" ], diff --git a/bwa.c b/bwa.c index 066edd0..ba14350 100644 --- a/bwa.c +++ b/bwa.c @@ -317,14 +317,6 @@ FMTIndex *bwa_idx_load_fmt(const char *hint) fprintf(stderr, "%s\n", kmer_idx_fn); fmt->kmer_hash = fmt_restore_kmer_idx(kmer_idx_fn); - // restore kmer bit -// fmt->bit_kmer_len = 18; -// sprintf(suffix, ".%d.kmer.bit", fmt->bit_kmer_len); -// strcpy(kmer_bit_fn, hint); -// strcpy(kmer_bit_fn + l_hint, suffix); -// fprintf(stderr, "%s\n", kmer_bit_fn); -// fmt->kmer_bit = fmt_restore_kmer_bit(kmer_bit_fn, fmt->bit_kmer_len); - strcpy(sa_fn, hint); sprintf(suffix, ".33.%d.sa", SA_INTV); strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA) @@ -348,13 +340,9 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which) } idx = calloc(1, sizeof(bwaidx_t)); if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); - if (which & BWA_IDX_BWT) { - idx->fmt = bwa_idx_load_fmt(hint); - // 先和bwt共用sa - //idx->fmt->sa = idx->bwt->sa; - //idx->fmt->n_sa = idx->bwt->n_sa; - //idx->fmt->sa_intv = idx->bwt->sa_intv; - } + if (which & BWA_IDX_BWT) idx->fmt = bwa_idx_load_fmt(hint); + idx->bwt->kmer_hash = idx->fmt->kmer_hash; + if (which & BWA_IDX_BNS) { int i, c; @@ -404,7 +392,7 @@ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx) // generate idx->bwt x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x; x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x; - x = SA_BYTES(idx->bwt->n_sa); idx->bwt->sa = (uint8_t*)(mem + k); k += x; + x = SA_BYTES(idx->bwt->n_sa); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; // generate idx->bns and idx->pac x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x; diff --git a/bwamem.c b/bwamem.c index d1025c1..3a264ff 100644 --- a/bwamem.c +++ b/bwamem.c @@ -148,18 +148,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn int max_seed_len = 0; int start_N_num = 0, start_flag = 1; a->mem.n = 0; - // char *BASE = "ACGT"; - // for (i = 0; i < len; ++i) - // if (seq[i] < 4) - // fprintf(stderr, "%c", BASE[seq[i]]); - // else - // fprintf(stderr, "N"); - // fprintf(stderr, "\n"); // first pass: find all SMEMs fprintf(fp1, "seq: %ld\n", dn++); - //fprintf(stderr, "seq: %ld\n", dn++); - //dn ++; + // fprintf(stderr, "seq: %ld\n", dn++); + // dn ++; // goto third_seed; while (x < len) { @@ -169,8 +162,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn int64_t tmp_time = realtime_msec(); #endif #if USE_FMT - x = fmt_smem(fmt, len, seq, x, start_width, &a->mem1, a->tmpv); - + x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, &a->mem1, a->tmpv[0]); #else x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); #endif @@ -178,15 +170,17 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_seed_1, tmp_time); #endif + s1n += a->mem1.n; for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); int slen = (uint32_t)p->info - (p->info >> 32); // seed length + s1l += slen; max_seed_len = fmax(max_seed_len, slen); if (slen >= opt->min_seed_len) { - fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); + //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); kv_push(bwtintv_t, a->mem, *p); } } @@ -210,7 +204,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn int64_t tmp_time = realtime_msec(); #endif #if USE_FMT - fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv); + fmt_smem(fmt, len, seq, (start + end) >> 1, p->x[2] + 1, opt->min_seed_len, &a->mem1, a->tmpv[0]); #else bwt_smem1(bwt, len, seq, (start + end) >> 1, p->x[2] + 1, &a->mem1, a->tmpv); #endif @@ -218,13 +212,17 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_seed_2, tmp_time); #endif + s2n += a->mem1.n; + for (i = 0; i < a->mem1.n; ++i) { bwtintv_t *p = &a->mem1.a[i]; //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); // fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); // fprintf(stderr, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); - int slen = (uint32_t)p->info - (p->info >> 32); + int slen = (uint32_t)p->info - (p->info >> 32); + s2l += slen; if (slen >= opt->min_seed_len) { + g_num_smem2 += 1; fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); kv_push(bwtintv_t, a->mem, a->mem1.a[i]); } @@ -233,7 +231,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn //if (max_seed_len == len - start_N_num) // goto collect_intv_end; //goto collect_intv_end; -//third_seed: +third_seed: // third pass: LAST-like if (opt->max_mem_intv > 0) { x = 0; @@ -254,13 +252,15 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn tmp_time = realtime_msec() - tmp_time; __sync_fetch_and_add(&time_seed_3, tmp_time); #endif - //bwtintv_t *p = &m; - //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); + s3n += 1; + s3l += (uint32_t)m.info - (m.info >> 32); + // bwtintv_t *p = &m; + // fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); if (m.x[2] > 0) { kv_push(bwtintv_t, a->mem, m); - bwtintv_t *p = &m; - fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); + //bwtintv_t *p = &m; + //fprintf(fp1, "%ld %ld %d\n", p->x[2], p->info >> 32, (uint32_t)p->info); //fprintf(fp1, "%ld %ld %ld %ld %d\n", p->x[0], p->x[1], p->x[2], p->info >> 32, (uint32_t)p->info); } } else { // for now, we never come to this block which is slower @@ -272,7 +272,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, const FMTIn } } -collect_intv_end: +//collect_intv_end: // sort ks_introsort(mem_intv, a->mem.n, a->mem.a); } @@ -438,10 +438,10 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const FMTIndex *fm #endif #if USE_FMT s.rbeg = tmp.pos = fmt_sa(fmt, p->x[0] + k); - uint64_t tpos = bwt_sa(bwt, p->x[0] + k); - if (s.rbeg != tpos) { - fprintf(stderr, "diff: %ld, %ld %ld\n", p->x[0] + k, tmp.pos, tpos); - } + // uint64_t tpos = bwt_sa(bwt, p->x[0] + k); + // if (s.rbeg != tpos) { + // fprintf(stderr, "diff: %ld, %ld %ld\n", p->x[0] + k, tmp.pos, tpos); + // } #else s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference #endif diff --git a/bwt.c b/bwt.c index 0797f02..e3b59d1 100644 --- a/bwt.c +++ b/bwt.c @@ -53,20 +53,10 @@ void bwt_gen_cnt_table(bwt_t *bwt) static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA { - bwtint_t x = k - (k > bwt->primary); // bwt中不包含$,所以位置超过序列长度后,要减掉1,(因为后边加上了互补链) - // x = bwt_B0(bwt, x); // 获取x位置对应的字符 - - uint32_t t1 = (bwt->bwt[((x) >> 7 << 4) + sizeof(bwtint_t) + (((x) & 0x7f) >> 4)]); - uint32_t t2 = t1 >> ((~(x) & 0xf) << 1) & 3; - - x = t2; - -// bwtint_t cnt[4], cnt1; -// bwt_occ4(bwt, k, cnt); -// cnt1 = bwt_occ(bwt, k, x); - - x = bwt->L2[x] + bwt_occ(bwt, k, x); // 获取x字符在k位置(后缀索引), - return k == bwt->primary? 0 : x; + bwtint_t x = k - (k > bwt->primary); + x = bwt_B0(bwt, x); + x = bwt->L2[x] + bwt_occ(bwt, k, x); + return k == bwt->primary ? 0 : x; } // 设置某一行的排序值,sa的索引有效值从1开始,(0设置为-1, 小端模式) @@ -91,8 +81,55 @@ bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k) return val; } +// 获取kmer的fmt匹配信息 +inline void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos) +{ + bwtint_t x0, x1, x2; + int byte_idx = pos * 14; + uint8_t *arr = mem_addr + byte_idx; + x0 = *arr; + x0 = (x0 << 32) | *((uint32_t *)(arr + 1)); + arr += 5; + x1 = *arr; + x1 = (x1 << 32) | *((uint32_t *)(arr + 1)); + arr += 5; + x2 = *((uint32_t *)arr); + ok->x[0] = x0; + ok->x[1] = x1; + ok->x[2] = x2; +} + +// 设置kmer第pos个碱基对应的fmt匹配信息 +inline void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos) +{ + int byte_idx = pos * 14; + uint8_t *arr = mem_addr + byte_idx; + arr[0] = (uint8_t)(ik.x[0] >> 32); + *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0]; + arr += 5; + arr[0] = (uint8_t)(ik.x[1] >> 32); + *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1]; + arr += 5; + *((uint32_t *)arr) = (uint32_t)ik.x[2]; +} + +// 获取kmer对应的fmt匹配信息, pos should be [0, 13] +inline void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos) +{ + if (pos == 13) + kmer_getval_at(kmer_hash->ke14[qbit].intv_arr, ok, 0); + else if (pos == 12) + kmer_getval_at(kmer_hash->ke13[qbit >> 2].intv_arr, ok, 0); + else if (pos == 11) + kmer_getval_at(kmer_hash->ke12[qbit >> 4].intv_arr, ok, 0); + else if (pos == 10) + kmer_getval_at(kmer_hash->ke11[qbit >> 6].intv_arr, ok, 0); + else + kmer_getval_at(kmer_hash->ke10[qbit >> 8].intv_arr, ok, pos); +} + // bwt->bwt and bwt->occ must be precalculated -void bwt_cal_sa(bwt_t *bwt, int intv) +void bwt_cal_byte_sa(bwt_t *bwt, int intv) { bwtint_t isa, sa, i, block_size; // S(isa) = sa isa是后缀数组的索引,sa表示位置 double tmp_time, elapsed_time; @@ -102,10 +139,10 @@ void bwt_cal_sa(bwt_t *bwt, int intv) xassert(intv_round == intv, "SA sample interval is not a power of 2."); xassert(bwt->bwt, "bwt_t::bwt is not initialized."); - if (bwt->sa) free(bwt->sa); + if (bwt->byte_sa) free(bwt->byte_sa); bwt->sa_intv = intv; bwt->n_sa = (bwt->seq_len + intv) / intv; - bwt->sa = (uint8_t *)calloc(SA_BYTES(bwt->n_sa), 1); // 用33位表示位置 + bwt->byte_sa = (uint8_t *)calloc(SA_BYTES(bwt->n_sa), 1); // 用33位表示位置 fprintf(stderr, "bytes: %ld, sa size: %ld\n", SA_BYTES(bwt->n_sa), bwt->n_sa); // calculate SA value isa = 0; sa = bwt->seq_len; @@ -118,19 +155,43 @@ void bwt_cal_sa(bwt_t *bwt, int intv) fprintf(stderr, "%ld%% percent complished. %f s elapsed.\n", i / block_size, elapsed_time); } if (isa % intv == 0) { - bwt_set_sa(bwt->sa, isa / intv, sa); // 第一个位置是$,所以位置就是序列长度 + bwt_set_sa(bwt->byte_sa, isa / intv, sa); // 第一个位置是$,所以位置就是序列长度 if (i % (block_size / 2) == 0) { - fprintf(stderr, "%ld %ld\n", sa, bwt_get_sa(bwt->sa, isa / intv)); + fprintf(stderr, "%ld %ld\n", sa, bwt_get_sa(bwt->byte_sa, isa / intv)); } } --sa; // 从后往前,一个位置一个位置的找对应的后缀数组,isa就是与sa对应的后缀数组排序后在sa数组中的相对位置 isa = bwt_invPsi(bwt, isa); // 下一个后缀数组的相对位置 } - if (isa % intv == 0) - bwt_set_sa(bwt->sa, isa / intv, sa); - // bwt_set_sa(bwt->sa, 0, (bwtint_t)-1); // 赋值成-1也没问题,set_sa那里已经修正了 - bwt_set_sa(bwt->sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len + if (isa % intv == 0) bwt_set_sa(bwt->byte_sa, isa / intv, sa); + // bwt_set_sa(bwt->byte_sa, 0, (bwtint_t)-1); // 赋值成-1也没问题,set_sa那里已经修正了 + bwt_set_sa(bwt->byte_sa, 0, 8589934591); // before this line, bwt->sa[0] = bwt->seq_len +} + +// bwt->bwt and bwt->occ must be precalculated +void bwt_cal_sa(bwt_t *bwt, int intv) +{ + bwtint_t isa, sa, i; // S(isa) = sa + int intv_round = intv; + + kv_roundup32(intv_round); + xassert(intv_round == intv, "SA sample interval is not a power of 2."); + xassert(bwt->bwt, "bwt_t::bwt is not initialized."); + + if (bwt->sa) free(bwt->sa); + bwt->sa_intv = intv; + bwt->n_sa = (bwt->seq_len + intv) / intv; + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + // calculate SA value + isa = 0; sa = bwt->seq_len; + for (i = 0; i < bwt->seq_len; ++i) { + if (isa % intv == 0) bwt->sa[isa/intv] = sa; + --sa; + isa = bwt_invPsi(bwt, isa); + } + if (isa % intv == 0) bwt->sa[isa/intv] = sa; + bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len } bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) @@ -141,17 +202,6 @@ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) k = bwt_invPsi(bwt, k); } return sa + bwt->sa[k / bwt->sa_intv]; - /* without setting bwt->sa[0] = -1, the following line should be - changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */ -//#ifdef SHOW_PERF -// int64_t tmp_time = realtime_msec(); -//#endif - sa += bwt_get_sa(bwt->sa, k / bwt->sa_intv); -//#ifdef SHOW_PERF -// tmp_time = realtime_msec() - tmp_time; -// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time); -//#endif - return sa; } static inline int __occ_aux(uint64_t y, int c) @@ -320,11 +370,6 @@ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) { -#ifdef SHOW_PERF -#if 0 - int64_t tmp_time = realtime_msec(); -#endif -#endif bwtint_t tk[4], tl[4]; int i; bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); @@ -336,12 +381,34 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; -#ifdef SHOW_PERF -#if 0 - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bwt_extend, tmp_time); -#endif -#endif +} + +// 创建正向的kmer +inline uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed) +{ + uint32_t qbit = 0, i; + qlen = qlen < kmer_len ? qlen : kmer_len; + for (i = 0; i < qlen; ++i) { + if (q[i] > 3) break; // 要考虑碱基是N + qbit |= q[i] << ((kmer_len - 1 - i) << 1); + } + *base_consumed = i; + return qbit; +} + +// 创建f反向的kmer +inline uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed) +{ + uint32_t qbit = 0; + int i, j, end_pos; + end_pos = start_pos - kmer_len; + end_pos = end_pos < 0 ? -1 : end_pos; + for (i = start_pos, j = 0; i > end_pos; --i, ++j) { + if (q[i] > 3) break; // 要考虑碱基是N + qbit |= q[i] << ((kmer_len - 1 - j) << 1); + } + *base_consumed = start_pos - i; + return (~qbit) & ((1L << (kmer_len << 1)) - 1); } static void bwt_reverse_intvs(bwtintv_v *p) @@ -372,24 +439,8 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base ik.info = x + 1; - // int flag = 0; - // int small_intv_qlen = 0; - // int small_intv_val = 0; - // int intv_19 = 0; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - //if (ik.x[2] <= 2) - //{ - // if (!flag) { - // small_intv_qlen = i - x; - // small_intv_val = ik.x[2]; - // flag = 1; - // } - //} - //if (i-x == 19) - //{ - // intv_19 = ik.x[2]; - //} if (ik.x[2] < max_intv) { // an interval small enough kv_push(bwtintv_t, *curr, ik); break; @@ -407,11 +458,6 @@ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, break; // always terminate extension at an ambiguous base; in this case, i 3) return x + 1; - bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base - for (i = x + 1; i < len; ++i) { // forward search + + uint32_t qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); + bwt_kmer_get(&bwt->kmer_hash, &ik, qbit, kmer_len - 1); + ik.info = x + kmer_len; + i = (int)ik.info; + + //bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base + //i = x + 1; + + for (; i < len; ++i) { // forward search if (q[i] < 4) { // an A/C/G/T base c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); @@ -496,22 +550,22 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt) err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); - err_fwrite(bwt->sa, sizeof(bwtint_t), SA_BYTES(bwt->n_sa) >> 3, fp); + err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); err_fflush(fp); err_fclose(fp); } -static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) -{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks - //const int bufsize = 0x1000000; // 16M block - const int bufsize = 0x4000000; //64M block - bwtint_t offset = 0; - while (size) { - int x = bufsize < size? bufsize : size; - if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break; - size -= x; offset += x; - } - return offset; +void bwt_dump_byte_sa(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->L2 + 1, sizeof(bwtint_t), 4, fp); + err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->byte_sa, sizeof(bwtint_t), SA_BYTES(bwt->n_sa) >> 3, fp); + err_fflush(fp); + err_fclose(fp); } void bwt_restore_sa(const char *fn, bwt_t *bwt) @@ -529,10 +583,6 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt) xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; - - //bwt->sa = (uint8_t*)malloc(SA_BYTES(bwt->n_sa)); - //fprintf(stderr, "zzh-read-sa %ld\n", SA_BYTES(bwt->n_sa)); - //fread_fix(fp, SA_BYTES(bwt->n_sa), bwt->sa); bwt->sa = (bwtint_t *)calloc(bwt->n_sa, sizeof(bwtint_t)); bwt->sa[0] = -1; @@ -565,6 +615,6 @@ bwt_t *bwt_restore_bwt(const char *fn) void bwt_destroy(bwt_t *bwt) { if (bwt == 0) return; - //free(bwt->sa); free(bwt->bwt); - //free(bwt); + free(bwt->sa); free(bwt->bwt); + free(bwt); } diff --git a/bwt.h b/bwt.h index 7c71380..39b3e46 100644 --- a/bwt.h +++ b/bwt.h @@ -45,6 +45,30 @@ typedef unsigned char ubyte_t; typedef uint64_t bwtint_t; +#define HASH_KMER_LEN 14 + +// 用来保存kmer对应的fmt的位置信息 +typedef struct +{ + // 40+40+32 14个byte,这样好处理 + uint8_t intv_arr[14]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据 +} KmerEntry; + +typedef struct +{ + uint8_t intv_arr[140]; // 保存长度为10的kmer,每个碱基对应的bwt匹配信息 +} KmerEntryArr; + +// 保存各个位置对应的bwt匹配信息 +typedef struct +{ + KmerEntryArr *ke10; + KmerEntry *ke11; + KmerEntry *ke12; + KmerEntry *ke13; + KmerEntry *ke14; +} KmerHash; + typedef struct { bwtint_t primary; // S^{-1}(0), or the primary index of BWT bwtint_t L2[5]; // C(), cumulative count @@ -53,11 +77,13 @@ typedef struct { uint32_t *bwt; // BWT // occurance array, separated to two parts uint32_t cnt_table[256]; + // 保存kmer对应的fmt位置信息 + KmerHash kmer_hash; // suffix array int sa_intv; bwtint_t n_sa; - //uint8_t *sa; bwtint_t *sa; + uint8_t *byte_sa; } bwt_t; typedef struct { @@ -99,6 +125,7 @@ extern "C" { void bwt_dump_bwt(const char *fn, const bwt_t *bwt); void bwt_dump_sa(const char *fn, const bwt_t *bwt); + void bwt_dump_byte_sa(const char *fn, const bwt_t *bwt); bwt_t *bwt_restore_bwt(const char *fn); void bwt_restore_sa(const char *fn, bwt_t *bwt); @@ -108,6 +135,7 @@ extern "C" { void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW void bwt_cal_sa(bwt_t *bwt, int intv); + void bwt_cal_byte_sa(bwt_t *bwt, int intv); void bwt_bwtupdate_core(bwt_t *bwt); @@ -118,6 +146,13 @@ extern "C" { void bwt_set_sa(uint8_t *sa_arr, bwtint_t k, bwtint_t val); bwtint_t bwt_get_sa(uint8_t *sa_arr, bwtint_t k); + void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos); + void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos); + void bwt_kmer_get(const KmerHash *kmer_hash, bwtintv_t *ok, uint32_t qbit, int pos); + + uint32_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed); + uint32_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed); + // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values void bwt_gen_cnt_table(bwt_t *bwt); void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); diff --git a/bwtindex.c b/bwtindex.c index 6a27ae1..70379d8 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -206,6 +206,27 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command return 0; } +int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command +{ + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2bytesa [-i %d] \n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_byte_sa(bwt, sa_intv); + bwt_dump_byte_sa(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + int bwa_index(int argc, char *argv[]) // the "index" command { int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000; diff --git a/debug.sh b/debug.sh new file mode 100755 index 0000000..b906d4e --- /dev/null +++ b/debug.sh @@ -0,0 +1,38 @@ +thread=1 +#n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq +#n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq +#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq +#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq +#reference=~/data/reference/human_g1k_v37_decoy.fasta +#n_r1=~/fastq/sn_r1.fq +#n_r2=~/fastq/sn_r2.fq +#n_r1=~/fastq/ssn_r1.fq +#n_r2=~/fastq/ssn_r2.fq +#n_r1=~/fastq/tiny_n_r1.fq +#n_r2=~/fastq/tiny_n_r2.fq +n_r1=~/fastq/diff_r1.fq +n_r2=~/fastq/diff_r2.fq +#n_r1=~/fastq/d_r1.fq +#n_r2=~/fastq/d_r2.fq +reference=~/reference/human_g1k_v37_decoy.fasta +#out=./ssn.sam +out=./out.sam +#out=/dev/null +#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ +# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \ +# /home/zzh/data/fastq/nm1.fq \ +# /home/zzh/data/fastq/nm2.fq -o /dev/null + +#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ +# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \ +# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_1.fq.gz \ +# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \ +# -o /dev/null + +time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ + $reference \ + $n_r1 \ + $n_r2 \ + -o $out + + diff --git a/fastmap.c b/fastmap.c index 888b0bf..6a3aa35 100644 --- a/fastmap.c +++ b/fastmap.c @@ -59,7 +59,8 @@ int64_t time_ksw_extend2 = 0, time_bns = 0; int64_t dn = 0, n16 = 0, n17 = 0, n18 = 0, n19 = 0, nall = 0, num_sa = 0; - +int64_t s1n = 0, s2n = 0, s3n = 0, s1l = 0, s2l = 0, s3l = 0; +int64_t g_num_smem2 = 0; FILE *fp1; #endif @@ -433,22 +434,20 @@ int main_mem(int argc, char *argv[]) #ifdef SHOW_PERF fprintf(stderr, "\n"); - fprintf(stderr, "time_bwt_smem1a: %f s\n", time_bwt_smem1a / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_1: %f s\n", time_seed_1 / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_2: %f s\n", time_seed_2 / 1000.0 / opt->n_threads); fprintf(stderr, "time_seed_3: %f s\n", time_seed_3 / 1000.0 / opt->n_threads); - fprintf(stderr, "time_fmt_smem_0: %f s\n", time_fmt_smem_0 / 1000.0 / opt->n_threads); - fprintf(stderr, "time_bwt_extend: %f s\n", time_bwt_extend / 1000.0 / opt->n_threads); fprintf(stderr, "time_bwt_sa: %f s\n", time_bwt_sa / 1000.0 / opt->n_threads); fprintf(stderr, "time_ksw_extend2: %f s\n", time_ksw_extend2 / 1000.0 / opt->n_threads); - fprintf(stderr, "time_bwt_sa_read: %f s\n", time_bwt_sa_read / 1000.0 / opt->n_threads); fprintf(stderr, "time_bns: %f s\n", time_bns / 1000.0 / opt->n_threads); - fprintf(stderr, "all seed num: %ld\n", nall); - fprintf(stderr, "all seed num litter than 19: %ld\n", n19); - fprintf(stderr, "all seed num litter than 18: %ld\n", n18); - fprintf(stderr, "all seed num litter than 17: %ld\n", n17); - fprintf(stderr, "all seed num litter than 16: %ld\n", n16); + fprintf(stderr, "s1 num: %ld\n", s1n); + fprintf(stderr, "s2 num: %ld\n", s2n); + fprintf(stderr, "s3 num: %ld\n", s3n); + fprintf(stderr, "s1 len: %ld\n", s1l / s1n); + fprintf(stderr, "s2 len: %ld\n", s2l / s2n); + fprintf(stderr, "s3 len: %ld\n", s3l / s3n); fprintf(stderr, "get sa num: %ld\n", num_sa); + fprintf(stderr, "seed 2 num: %ld\n", g_num_smem2); fprintf(stderr, "\n"); fclose(fp1); diff --git a/fmt_idx.c b/fmt_idx.c index 2e16625..95578b4 100644 --- a/fmt_idx.c +++ b/fmt_idx.c @@ -18,19 +18,18 @@ Date : 2023/12/24 const static char BASE[4] = {'A', 'C', 'G', 'T'}; -static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) -{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks - const int bufsize = 0x4000000; // 16 * 4M block - bwtint_t offset = 0; - while (size) +// 生成所有KMER_LEN长度的序列,字符串表示 +void gen_all_seq(char **seq_arr, int kmer_len) +{ + uint32_t seq_up_val = (1 << (kmer_len << 1)); + for (uint32_t i = 0; i < seq_up_val; ++i) { - int x = bufsize < size ? bufsize : size; - if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) - break; - size -= x; - offset += x; + seq_arr[i] = (char *)malloc(kmer_len); + for (int j = kmer_len - 1; j >= 0; --j) + { + seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3]; + } } - return offset; } // 生成occ,每个字节对应一个pattern @@ -155,23 +154,12 @@ KmerHash fmt_restore_kmer_idx(const char *fn) return khash; } -uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len) -{ - uint16_t *kbit = (uint16_t *)calloc(1L << ((kmer_len << 1) - 4), sizeof(uint16_t)); - FILE *fp; - fp = xopen(fn, "rb"); - fread_fix(fp, (1L << ((kmer_len << 1) - 4)) * sizeof(uint16_t), kbit); - err_fclose(fp); - return kbit; -} - // 读取sa数据 void fmt_restore_sa(const char *fn, FMTIndex *fmt) { char skipped[256]; FILE *fp; bwtint_t primary; - fp = xopen(fn, "rb"); err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); xassert(primary == fmt->primary, "SA-BWT inconsistency: primary is not the same."); @@ -179,10 +167,8 @@ void fmt_restore_sa(const char *fn, FMTIndex *fmt) err_fread_noeof(&fmt->sa_intv, sizeof(bwtint_t), 1, fp); err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); xassert(primary == fmt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); - fmt->n_sa = (fmt->seq_len + fmt->sa_intv) / fmt->sa_intv; fmt->sa = (uint8_t *)malloc(SA_BYTES(fmt->n_sa)); - fread_fix(fp, SA_BYTES(fmt->n_sa), fmt->sa); err_fclose(fp); } @@ -224,7 +210,6 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt) #endif buf = (uint32_t *)calloc(fmt->bwt_size, 4); // 开辟计算fmt用到的缓存 - c[0] = c[1] = c[2] = c[3] = 0; // 首行的c2,应该是对应的ACGT对应的行,减去1的occ for (i = 0; i < 4; ++i) @@ -382,7 +367,6 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t cnt[3] = q[b2]; // b2的occ p += 20; -#ifdef FMT_MID_INTERVAL // 加入了middle checkpoint // 使用mid interval信息 int mk = k % FMT_OCC_INTERVAL; int n_mintv = mk >> FMT_MID_INTV_SHIFT; @@ -401,35 +385,12 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t x = 0; p += 4; } -#if FMT_MID_INTERVAL == 16 // middle check point interval等于16时候,只需要判断一下是不是要计算两个uint32表示的碱基序列 - if ((mk & FMT_MID_INTV_MASK) >> 3) - { - x += __fmt_occ_e2_aux2(fmt, ti, *p); - ++p; - } -#elif FMT_MID_INTERVAL > 16 // 该地址是bwt和pre_bwt字符串数据的首地址 uint32_t *end = p + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3)); for (; p < end; ++p) { x += __fmt_occ_e2_aux2(fmt, ti, *p); } -#endif -#else // 没有加入middle check point interval -#if FMT_OCC_INTERVAL > 16 - uint32_t *end = p + ((k >> 3) - ((k & ~FMT_OCC_INTV_MASK) >> 3)); - // p = end - (end - p) / 4; - for (; p < end; ++p) - { - x += __fmt_occ_e2_aux2(fmt, ti, *p); - } -#else // FMT_OCC_INTERVAL等于16的时候,只需要判断一次就可以 - if ((k & FMT_OCC_INTV_MASK) >> 3) - { - x += __fmt_occ_e2_aux2(fmt, ti, *p); - ++p; - } -#endif -#endif + tmp = *p & ~((1U << ((~k & 7) << 2)) - 1); x += __fmt_occ_e2_aux2(fmt, ti, tmp); @@ -456,11 +417,6 @@ inline void fmt_e2_occ(const FMTIndex *fmt, bwtint_t k, int b1, int b2, bwtint_t // 扩展两个碱基 inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *ok2, int is_back, int b1, int b2) { -#ifdef SHOW_PERF -#if 0 - int64_t tmp_time = realtime_msec(); -#endif -#endif bwtint_t tk[4], tl[4]; bwtintv_t intv = {0}; // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 @@ -476,22 +432,11 @@ inline void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwti intv.x[!is_back] = fmt->L2[b2] + 1 + tk[3]; intv.x[2] = tl[3] - tk[3]; *ok2 = intv; -#ifdef SHOW_PERF -#if 0 - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bwt_extend, tmp_time); -#endif -#endif } // 扩展一个碱基 inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1) { -#ifdef SHOW_PERF -#if 0 - int64_t tmp_time = realtime_msec(); -#endif -#endif bwtint_t tk[4], tl[4]; int b2 = 3; // 如果只扩展一次,那么第二个碱基设置成T,可以减小一些计算量,如计算大于b2的累积数量 // tk表示在k行之前所有各个碱基累积出现次数,tl表示在l行之前的累积 @@ -502,90 +447,70 @@ inline void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int i ok->x[2] = tl[1] - tk[1]; // 第一次正向扩展 ok->x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= fmt->primary && ik->x[!is_back] + ik->x[2] - 1 >= fmt->primary) + tl[0] - tk[0]; -#ifdef SHOW_PERF -#if 0 - tmp_time = realtime_msec() - tmp_time; - __sync_fetch_and_add(&time_bwt_extend, tmp_time); -#endif -#endif } -// 获取kmer的fmt匹配信息 -inline void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos) +// 序列和参考基因直接对比 +static void direct_extend(const FMTIndex *fmt, int len, const uint8_t *q, int left_pos, int right_pos, bwtint_t mtx_line, bwtintv_t *mt) { - bwtint_t x0, x1, x2; - int byte_idx = pos * 14; - uint8_t *arr = mem_addr + byte_idx; - x0 = *arr; - x0 = (x0 << 32) | *((uint32_t *)(arr + 1)); - arr += 5; - x1 = *arr; - x1 = (x1 << 32) | *((uint32_t *)(arr + 1)); - arr += 5; - x2 = *((uint32_t *)arr); - ok->x[0] = x0; - ok->x[1] = x1; - ok->x[2] = x2; +#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3) +#define EXTEND_BASE_LOOP(qcond, rcond, qstep, rstep) \ + while (k != qcond && r != rcond) \ + { \ + const int base = PAC_BASE(fmt->pac, r); \ + if (q[k] != base) break; \ + k += qstep; \ + r += rstep; \ + } +#define EXTEND_BASE_LOOP_COMP(qcond, rcond, qstep, rstep) \ + while (k != qcond && r != rcond) \ + { \ + const int base = 3 - PAC_BASE(fmt->pac, r); \ + if (q[k] != base) break; \ + k += qstep; \ + r += rstep; \ + } + + int k; + int64_t r, rp; + mt->num_match = 1; + rp = fmt_sa(fmt, mtx_line); + r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp; + k = right_pos; + if (rp < fmt->l_pac) // 匹配到了正向链 + { + // 向前继续扩展 + r += right_pos - left_pos; + EXTEND_BASE_LOOP(len, fmt->l_pac, 1, 1); + mt->rm[0].qe = k; + mt->rm[0].reverse = 0; + // 向后扩展,x位置之前的碱基 + r -= k - left_pos + 1; + k = left_pos - 1; + EXTEND_BASE_LOOP(-1, -1, -1, -1); + mt->rm[0].qs = k + 1; + mt->rm[0].rs = r + 1; + } + else // 匹配到了互补链 + { + r -= right_pos - left_pos; + EXTEND_BASE_LOOP_COMP(len, -1, 1, -1); + mt->rm[0].qe = k; + mt->rm[0].reverse = 1; + // 扩展x之前的碱基 + r += k - left_pos + 1; + k = left_pos - 1; + EXTEND_BASE_LOOP_COMP(-1, fmt->l_pac, -1, 1); + mt->rm[0].qs = k + 1; + mt->rm[0].rs = r - 1; + } + mt->info = mt->rm[0].qs; + mt->info = mt->info << 32 | mt->rm[0].qe; + mt->x[2] = 1; } -// 设置kmer第pos个碱基对应的fmt匹配信息 -inline void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos) +static inline void fmt_reverse_intvs(bwtintv_v *p) { - int byte_idx = pos * 14; - uint8_t *arr = mem_addr + byte_idx; - arr[0] = (uint8_t)(ik.x[0] >> 32); - *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[0]; - arr += 5; - arr[0] = (uint8_t)(ik.x[1] >> 32); - *((uint32_t *)(arr + 1)) = (uint32_t)ik.x[1]; - arr += 5; - *((uint32_t *)arr) = (uint32_t)ik.x[2]; -} - -// 获取kmer对应的fmt匹配信息, pos should be [0, 13] -inline void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos) -{ - - if (pos == 13) - { - kmer_getval_at(fmt->kmer_hash.ke14[qbit].intv_arr, ok, 0); - } - else if (pos == 12) - { - kmer_getval_at(fmt->kmer_hash.ke13[qbit >> 2].intv_arr, ok, 0); - } - else if (pos == 11) - { - kmer_getval_at(fmt->kmer_hash.ke12[qbit >> 4].intv_arr, ok, 0); - } - else if (pos == 10) - { - kmer_getval_at(fmt->kmer_hash.ke11[qbit >> 6].intv_arr, ok, 0); - } - else - { - kmer_getval_at(fmt->kmer_hash.ke10[qbit >> 8].intv_arr, ok, pos); - } -} - -// 生成所有KMER_LEN长度的序列,字符串表示 -void gen_all_seq(char **seq_arr, int kmer_len) -{ - uint32_t seq_up_val = (1 << (kmer_len << 1)); - for (uint32_t i = 0; i < seq_up_val; ++i) - { - seq_arr[i] = (char *)malloc(kmer_len); - for (int j = kmer_len - 1; j >= 0; --j) - { - seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3]; - } - } -} - -static void fmt_reverse_intvs(bwtintv_v *p) -{ - if (p->n > 1) - { + if (p->n > 1) { int j; for (j = 0; j < p->n >> 1; ++j) { @@ -596,58 +521,23 @@ static void fmt_reverse_intvs(bwtintv_v *p) } } -// 创建正向的kmer -inline static uint64_t build_forward_kmer(const uint8_t *q, int qlen, int kmer_len, int *base_consumed) -{ - uint64_t qbit = 0; - int i; - qlen = qlen < kmer_len ? qlen : kmer_len; - for (i = 0; i < qlen; ++i) - { - if (q[i] > 3) // 要考虑碱基是N - break; - qbit |= (uint64_t)q[i] << ((kmer_len - 1 - i) << 1); - } - *base_consumed = i; - return qbit; -} - -// 创建f反向的kmer -inline static uint64_t build_backward_kmer(const uint8_t *q, int start_pos, int kmer_len, int *base_consumed) -{ - uint64_t qbit = 0; - int i, j, end_pos; - end_pos = start_pos - kmer_len; - end_pos = end_pos < 0 ? -1 : end_pos; - for (i = start_pos, j = 0; i > end_pos; --i, ++j) - { - if (q[i] > 3) // 要考虑碱基是N - break; - qbit |= ((uint64_t)q[i]) << ((kmer_len - 1 - j) << 1); - } - *base_consumed = start_pos - i; - return (~qbit) & ((1L << (kmer_len << 1)) - 1); -} - // 找smem(seed) -int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec) { int i, j, ret, kmer_len; bwtintv_t ik = {0}, ok1 = {0}, ok2 = {0}; bwtintv_t mt = {0}; - bwtintv_v a[1], *curr; + bwtintv_v *curr; uint32_t qbit = 0; mem->n = 0; int only_forward = x == 0 || q[x - 1] > 3; - if (q[x] > 3) return x + 1; if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 - kv_init(a[0]); - curr = tmpvec && tmpvec[0] ? tmpvec[0] : &a[0]; // use the temporary vector if provided + curr = tmpvec; // use the temporary vector if provided - qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); - fmt_kmer_get(fmt, &ik, qbit, 0); // 初始碱基位置 + qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); + bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, 0); // 初始碱基位置 ik.info = x + 1; // check change of the interval size and whether the interval size is too small to be extended further @@ -657,125 +547,37 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv do { kv_push(bwtintv_t, *curr, iv); goto backward_search; } while(0) // 处理kmer对应的匹配信息 - if (only_forward) - j = kmer_len - 1; - else - j = 1; + if (only_forward) j = kmer_len - 1; + else j = 1; for (curr->n = 0; j < kmer_len; ++j) { - fmt_kmer_get(fmt, &ok1, qbit, j); + bwt_kmer_get(&fmt->kmer_hash, &ok1, qbit, j); CHECK_INTV_CHANGE(ik, ok1, x + j + 1); } if (kmer_len != HASH_KMER_LEN) // 遇到了N或者到了序列最后 PUSH_VAL_AND_SKIP(ik); -#define PAC_BASE(pac, l) ((pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3) - // 扩展kmer之后的碱基 - __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); - __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); for (i = (int)ik.info; i + 1 < len; i += 2) { // forward search if (q[i] < 4 && q[i + 1] < 4) { fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); CHECK_INTV_CHANGE(ik, ok1, i + 1); CHECK_INTV_CHANGE(ik, ok2, i + 2); // 在这里进行判断是否只有一个候选了 - //if (0) - if (min_intv < 2 && ok2.x[2] == min_intv) - { // 最多处理两个 - mt.num_match = min_intv; - int64_t r, rp; - int k; - int max_qs = 0, min_qe = len; - for (j = 0; j < ok2.x[2]; ++j) - { - rp = fmt_sa(fmt, ok2.x[0] + j); - r = rp >= fmt->l_pac ? (fmt->l_pac << 1) - 1 - rp : rp; - k = i + 2; - if (rp < fmt->l_pac) // 匹配到了正向链 - { - // 向前继续扩展 - r += i + 2 - x; - while (k < len && r < fmt->l_pac) { - int base = PAC_BASE(fmt->pac, r); - if (q[k] != base) - break; - ++k; - ++r; - } - mt.rm[j].qe = k; - mt.rm[j].reverse = 0; - // 向后扩展,x位置之前的碱基 - r -= k - x + 1; - k = x - 1; - while (k > -1 && r > -1) { - int base = PAC_BASE(fmt->pac, r); - if (q[k] != base) - break; - --k; - --r; - } - mt.rm[j].qs = k + 1; - mt.rm[j].rs = r + 1; - } - else // 匹配到了互补链 - { - r -= i + 2 - x; - while (k < len && r > -1) - { - const int base = 3 - PAC_BASE(fmt->pac, r); - if (q[k] != base) - break; - ++k; - --r; - } - mt.rm[j].qe = k; - mt.rm[j].reverse = 1; - // 扩展x之前的碱基 - r += k - x + 1; - k = x - 1; - while (k > -1 && r < fmt->l_pac) - { - const int base = 3 - PAC_BASE(fmt->pac, r); - if (q[k] != base) - break; - --k; - ++r; - } - mt.rm[j].qs = k + 1; - mt.rm[j].rs = r - 1; - } - max_qs = max_qs < mt.rm[j].qs ? mt.rm[j].qs : max_qs; - min_qe = min_qe > mt.rm[j].qe ? mt.rm[j].qe : min_qe; - } - if (min_intv > 1) - { - // 修正两个匹配的位置,使比对的碱基个数和位置一致 - for (j = 0; j < min_intv; ++j) - { - if (mt.rm[j].reverse) - mt.rm[j].rs -= max_qs - mt.rm[j].qs; - else - mt.rm[j].rs += max_qs - mt.rm[j].qs; - mt.rm[j].qs = max_qs; - mt.rm[j].qe = min_qe; - } - } - mt.info = mt.rm[0].qs; - mt.info = mt.info << 32 | mt.rm[0].qe; - mt.x[2] = min_intv; - kv_push(bwtintv_t, *mem, mt); - ret = (uint32_t)mt.info; - if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) - { - goto fmt_smem_end; - } - goto backward_search; - } + //if (min_intv == 1 && ok2.x[2] == min_intv) + //{ + // direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt); + // kv_push(bwtintv_t, *mem, mt); + // ret = (uint32_t)mt.info; + // if (only_forward || mt.rm[0].qs == 0 || q[mt.rm[0].qs - 1] > 3) goto fmt_smem_end; + // goto backward_search; + //} } else if (q[i] < 4) // q[i+1] >= 4 { fmt_extend1(fmt, &ik, &ok1, 0, 3 - q[i]); @@ -802,26 +604,30 @@ int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv backward_search: fmt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first - if (mt.num_match == 0) - ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置 + if (mt.num_match == 0) ret = curr->a[0].info; // this will be the returned value,扩展到的最远的位置 // 按照种子进行遍历,反向扩展 #define CHECK_ADD_MEM(pos, intv, mem) \ - if (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32) { (intv).info |= (uint64_t)(pos) << 32; kv_push(bwtintv_t, *mem, (intv)); } + if (((int)((intv).info) - (pos) >= min_seed_len) && (mem->n == 0 || (pos) < mem->a[mem->n - 1].info >> 32)) { \ + (intv).info |= (uint64_t)(pos) << 32; kv_push(bwtintv_t, *mem, (intv)); \ + } #define CHECK_INTV_ADD_MEM(ok, pos, intv, mem) \ - if (ok.x[2] < min_intv) { CHECK_ADD_MEM(pos, intv, mem); break; } + if (ok.x[2] < min_intv) { CHECK_ADD_MEM(pos, intv, mem); break; } + int last_kmer_start = 0; for (j = 0; j < curr->n; ++j) { bwtintv_t *p = &curr->a[j]; // 前向扩展的种子 - __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); - __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); - uint64_t qbit = 0; +// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, p->x[0] - 1 + p->x[2]), 0, 2); if (!only_forward && p->info - x < HASH_KMER_LEN) { - qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer + if (last_kmer_start && kmer_len == HASH_KMER_LEN && p->info == last_kmer_start && p->info - kmer_len > 0 && q[p->info - kmer_len] < 4) + qbit = ((qbit << 2) | (3 - q[p->info - kmer_len])) & ((1L << (kmer_len << 1)) - 1); // 创建反向kmer + else qbit = build_backward_kmer(q, p->info - 1, HASH_KMER_LEN, &kmer_len); // 创建反向kmer + last_kmer_start = p->info - 1; i = 1; - do { fmt_kmer_get(fmt, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv); + do { bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len - i++); } while (ik.x[2] < min_intv); if (i > 2) continue; p->x[0] = ik.x[1]; p->x[1] = ik.x[0]; p->x[2] = ik.x[2]; i = p->info - (kmer_len - i + 3); @@ -833,8 +639,8 @@ backward_search: if (q[i] < 4 && q[i - 1] < 4) // 两个都可以扩展 { fmt_extend2(fmt, p, &ok1, &ok2, 1, q[i], q[i - 1]); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[0] - 1 + ok2.x[2]), 0, 2); CHECK_INTV_ADD_MEM(ok1, i + 1, *p, mem); ok1.info = p->info; CHECK_INTV_ADD_MEM(ok2, i, ok1, mem); @@ -873,8 +679,6 @@ backward_search: fmt_smem_end: fmt_reverse_intvs(mem); // s.t. sorted by the start coordinate - if (tmpvec == 0 || tmpvec[0] == 0) - free(a[0].a); return ret; } @@ -883,18 +687,18 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in int i, kmer_len; bwtintv_t ik = {0}, ok1={0}, ok2={0}; uint64_t qbit; - - memset(mem, 0, sizeof(bwtintv_t)); - if (q[x] > 3) - return x + 1; + if (q[x] > 3) return x + 1; - qbit = (uint32_t)build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); - - fmt_kmer_get(fmt, &ik, qbit, kmer_len-1); // 初始碱基位置 + qbit = build_forward_kmer(&q[x], len - x, HASH_KMER_LEN, &kmer_len); + bwt_kmer_get(&fmt->kmer_hash, &ik, qbit, kmer_len-1); // 初始碱基位置 ik.info = x + kmer_len; - __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); - __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); + + //fmt_set_intv(fmt, q[x], ik); + //ik.info = x + 1; + +// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, ik.x[1] - 1 + ik.x[2]), 0, 2); #define COND_SET_RETURN(iv, ov, start_pos, end_pos, max_intv, min_len) \ if (iv.x[2] < max_intv && end_pos - start_pos >= min_len) \ @@ -909,8 +713,8 @@ int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, in if (q[i] < 4 && q[i + 1] < 4) { fmt_extend2(fmt, &ik, &ok1, &ok2, 0, 3 - q[i], 3 - q[i + 1]); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); - __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1), 0, 2); +// __builtin_prefetch(fmt_occ_intv(fmt, ok2.x[1] - 1 + ok2.x[2]), 0, 2); COND_SET_RETURN(ok1, *mem, x, i, max_intv, min_len); COND_SET_RETURN(ok2, *mem, x, i + 1, max_intv, min_len); ik = ok2; @@ -938,7 +742,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_ { uint32_t *p; uint8_t base2; - // 第一步,找到check point位置 p = fmt_occ_intv(fmt, k); // check point起始位置 p += 20; // bwt碱基起始位置 @@ -949,7 +752,6 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_ // 第三步,找到具体的uint32_t p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基(和8个倒数第二bwt碱基) // 第四步,获取碱基 - base2 = *p >> ((~(k) & 0x7) << 2) & 0xf; *b2 = base2 >> 2 & 3; *b1 = base2 & 3; @@ -982,13 +784,6 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k) ++sa; k = k2; } -//#ifdef SHOW_PERF -// int64_t tmp_time = realtime_msec(); -//#endif sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv); -//#ifdef SHOW_PERF -// tmp_time = realtime_msec() - tmp_time; -// __sync_fetch_and_add(&time_bwt_sa_read, tmp_time); -//#endif return sa; } \ No newline at end of file diff --git a/fmt_idx.h b/fmt_idx.h index 0252c8e..bc55b74 100644 --- a/fmt_idx.h +++ b/fmt_idx.h @@ -49,33 +49,7 @@ Date : 2023/12/24 ((x) >> 24 & 0xff) + ((x) >> 16 & 0xff) + ((x) >> 8 & 0xff) + ((x) & 0xff) // sa存储的行间隔 -#define SA_INTV 2 - -#define HASH_KMER_LEN 14 - -#define BIT_KMER_LEN 17 - -// 用来保存kmer对应的fmt的位置信息 -typedef struct -{ - // 40+40+32 14个byte,这样好处理 - uint8_t intv_arr[14]; // 保存kmer中每扩展一个碱基对应的bwtintv_t数据 -} KmerEntry; - -typedef struct -{ - uint8_t intv_arr[140]; // 保存长度为10的kmer,每个碱基对应的bwt匹配信息 -} KmerEntryArr; - -// 保存各个位置对应的bwt匹配信息 -typedef struct -{ - KmerEntryArr *ke10; - KmerEntry *ke11; - KmerEntry *ke12; - KmerEntry *ke13; - KmerEntry *ke14; -} KmerHash; +#define SA_INTV 4 // fm-index, extend twice in one search step (one memory access) typedef struct @@ -96,9 +70,6 @@ typedef struct uint8_t *pac; // 保存2bit编码的参考序列 // 保存kmer对应的fmt位置信息 KmerHash kmer_hash; - // kmer_bit好像效果一般 - uint16_t *kmer_bit; // 用来检测特定长度序列有没有fm-index匹配 - int bit_kmer_len; // suffix array int sa_intv; bwtint_t n_sa; @@ -113,8 +84,6 @@ FMTIndex *fmt_restore_fmt(const char *fn); void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh); // 从文件中读取kmer hash信息 KmerHash fmt_restore_kmer_idx(const char *fn); -// 读取kmer bit数据 -uint16_t *fmt_restore_kmer_bit(const char *fn, int kmer_len); // 读取sa数据 void fmt_restore_sa(const char *fn, FMTIndex *fmt); // 根据interval-bwt创建fmt-index @@ -133,7 +102,7 @@ void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos); void kmer_getval_at(uint8_t *mem_addr, bwtintv_t *ok, int pos); void fmt_kmer_get(const FMTIndex *fmt, bwtintv_t *ok, uint32_t qbit, int pos); // 找smem(seed) -int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); +int fmt_smem(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_intv, int min_seed_len, bwtintv_v *mem, bwtintv_v *tmpvec); int fmt_seed_strategy1(const FMTIndex *fmt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem); diff --git a/main.c b/main.c index 22ccd40..25d5dc7 100644 --- a/main.c +++ b/main.c @@ -37,6 +37,7 @@ int bwa_fa2pac(int argc, char *argv[]); int bwa_pac2bwt(int argc, char *argv[]); int bwa_bwtupdate(int argc, char *argv[]); int bwa_bwt2sa(int argc, char *argv[]); +int bwa_bwt2bytesa(int argc, char *argv[]); int bwa_index(int argc, char *argv[]); int bwt_bwtgen_main(int argc, char *argv[]); @@ -75,6 +76,7 @@ static int usage() fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); + fprintf(stderr, " bwt2bytesa generate SA(using byte array) from BWT and Occ\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: To use BWA, you need to first index the genome with `bwa index'.\n" @@ -100,6 +102,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1); else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); + else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); diff --git a/out.sam b/out.sam new file mode 100644 index 0000000..7a61bcc --- /dev/null +++ b/out.sam @@ -0,0 +1,108 @@ +@SQ SN:1 LN:249250621 +@SQ SN:2 LN:243199373 +@SQ SN:3 LN:198022430 +@SQ SN:4 LN:191154276 +@SQ SN:5 LN:180915260 +@SQ SN:6 LN:171115067 +@SQ SN:7 LN:159138663 +@SQ SN:8 LN:146364022 +@SQ SN:9 LN:141213431 +@SQ SN:10 LN:135534747 +@SQ SN:11 LN:135006516 +@SQ SN:12 LN:133851895 +@SQ SN:13 LN:115169878 +@SQ SN:14 LN:107349540 +@SQ SN:15 LN:102531392 +@SQ SN:16 LN:90354753 +@SQ SN:17 LN:81195210 +@SQ SN:18 LN:78077248 +@SQ SN:19 LN:59128983 +@SQ SN:20 LN:63025520 +@SQ SN:21 LN:48129895 +@SQ SN:22 LN:51304566 +@SQ SN:X LN:155270560 +@SQ SN:Y LN:59373566 +@SQ SN:MT LN:16569 +@SQ SN:GL000207.1 LN:4262 +@SQ SN:GL000226.1 LN:15008 +@SQ SN:GL000229.1 LN:19913 +@SQ SN:GL000231.1 LN:27386 +@SQ SN:GL000210.1 LN:27682 +@SQ SN:GL000239.1 LN:33824 +@SQ SN:GL000235.1 LN:34474 +@SQ SN:GL000201.1 LN:36148 +@SQ SN:GL000247.1 LN:36422 +@SQ SN:GL000245.1 LN:36651 +@SQ SN:GL000197.1 LN:37175 +@SQ SN:GL000203.1 LN:37498 +@SQ SN:GL000246.1 LN:38154 +@SQ SN:GL000249.1 LN:38502 +@SQ SN:GL000196.1 LN:38914 +@SQ SN:GL000248.1 LN:39786 +@SQ SN:GL000244.1 LN:39929 +@SQ SN:GL000238.1 LN:39939 +@SQ SN:GL000202.1 LN:40103 +@SQ SN:GL000234.1 LN:40531 +@SQ SN:GL000232.1 LN:40652 +@SQ SN:GL000206.1 LN:41001 +@SQ SN:GL000240.1 LN:41933 +@SQ SN:GL000236.1 LN:41934 +@SQ SN:GL000241.1 LN:42152 +@SQ SN:GL000243.1 LN:43341 +@SQ SN:GL000242.1 LN:43523 +@SQ SN:GL000230.1 LN:43691 +@SQ SN:GL000237.1 LN:45867 +@SQ SN:GL000233.1 LN:45941 +@SQ SN:GL000204.1 LN:81310 +@SQ SN:GL000198.1 LN:90085 +@SQ SN:GL000208.1 LN:92689 +@SQ SN:GL000191.1 LN:106433 +@SQ SN:GL000227.1 LN:128374 +@SQ SN:GL000228.1 LN:129120 +@SQ SN:GL000214.1 LN:137718 +@SQ SN:GL000221.1 LN:155397 +@SQ SN:GL000209.1 LN:159169 +@SQ SN:GL000218.1 LN:161147 +@SQ SN:GL000220.1 LN:161802 +@SQ SN:GL000213.1 LN:164239 +@SQ SN:GL000211.1 LN:166566 +@SQ SN:GL000199.1 LN:169874 +@SQ SN:GL000217.1 LN:172149 +@SQ SN:GL000216.1 LN:172294 +@SQ SN:GL000215.1 LN:172545 +@SQ SN:GL000205.1 LN:174588 +@SQ SN:GL000219.1 LN:179198 +@SQ SN:GL000224.1 LN:179693 +@SQ SN:GL000223.1 LN:180455 +@SQ SN:GL000195.1 LN:182896 +@SQ SN:GL000212.1 LN:186858 +@SQ SN:GL000222.1 LN:186861 +@SQ SN:GL000200.1 LN:187035 +@SQ SN:GL000193.1 LN:189789 +@SQ SN:GL000194.1 LN:191469 +@SQ SN:GL000225.1 LN:211173 +@SQ SN:GL000192.1 LN:547496 +@SQ SN:NC_007605 LN:171823 +@SQ SN:hs37d5 LN:35477943 +@HD VN:1.5 SO:unsorted GO:query +@RG ID:normal SM:normal PL:illumina LB:normal PG:bwa +@PG ID:bwa PN:bwa VN:0.7.17-r1198-dirty CL:./bwa mem -t 1 -M -R @RG\tID:normal\tSM:normal\tPL:illumina\tLB:normal\tPG:bwa /home/zzh/reference/human_g1k_v37_decoy.fasta /home/zzh/fastq/diff_r1.fq /home/zzh/fastq/diff_r2.fq -o ./out.sam +A00289:748:HLCH3DSX3:4:1102:24749:34929 97 16 75642166 60 150M 12 66451372 0 AAAGCCATTGGCAGTAACATCCAAAGGCTGCTCAGGAACTGCACTCCAGCTGATGGCTATGAAAAGATAAGATTCTAGGGTTAAAACTGCTCAATTTCTTGTGAAAGTCAGAAGTATTTTAATAAAAATATTTCTCTGACTCTACAGACC FFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFF: NM:i:0 MD:Z:150 MC:Z:17S91M42S AS:i:150 XS:i:0 RG:Z:normal +A00289:748:HLCH3DSX3:4:1102:24749:34929 145 12 66451372 0 17S91M42S 16 75642166 0 TTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATAAATAATTCTTTGTGAAACCCCT F,F,FFF,:FFF,,F,F,,F,F:,FFF:F:FFF:FFFFFFF:F,F,F,FFFFFF::,,FFFFFFF,:F,:,F,::,,F:FF:,FF:F::F::FFFFFFFFFF,F:F,FFFF,FFFFF,FF,F,,F,F,,,,,,:,,,,,,,,,,,,,,,F NM:i:0 MD:Z:91 MC:Z:150M AS:i:91 XS:i:90 RG:Z:normal SA:Z:12,49106363,+,117S33M,0,0; XA:Z:12,-66451373,35S90M25S,0;12,-66451373,28S90M32S,0;6,-160521757,23S83M44S,0;15,-77910871,23S79M48S,0; +A00289:748:HLCH3DSX3:4:1102:24749:34929 385 12 49106363 0 117H33M 16 75642166 0 AAAAAAAAAAAAAAATAAAAAAAAAAAAAAAAA FFF:F:FFF,:F,F,,F,F,,FFF:,FFF,F,F NM:i:0 MD:Z:33 MC:Z:150M AS:i:33 XS:i:32 RG:Z:normal SA:Z:12,66451372,-,17S91M42S,0,0; XA:Z:4,-19079503,32M118S,0;15,-55218248,32M118S,0;X,-149551159,32M118S,0; +A00289:748:HLCH3DSX3:4:1104:6234:8656 113 12 66451373 8 12S91M47S 22 18873826 0 TTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTTGTTTTTTTTTTTTTTGGGGGGGGGGGGGGGTGAGGTGGTCTGTG ,,:FFF:,:,::,,,,,,FF,FFF,,FF,F,FFF:,,:FFFFFF:FFF,:,:FF:F:::::FFF::,::F:FFFFFF:,FFFF,,,:FFFFFFFF:,,::,,,,,:,,FFF:,,,FF:F,:,FFFF:F::,,FF:,:FFFFFF:FFFFFF NM:i:1 MD:Z:13T77 MC:Z:150M AS:i:86 XS:i:81 RG:Z:normal SA:Z:3,121668661,-,89S32M29S,0,0; XA:Z:12,-66451372,25S91M34S,2;6,-160521757,19S84M47S,1;15,-77910874,26S77M47S,0;7,-107410599,25M1I3M15D74M47S,16; +A00289:748:HLCH3DSX3:4:1104:6234:8656 369 3 121668661 0 89H32M29H 22 18873826 0 TTTTTTTTTTTTTGTTTGTTTTTTTTTTTTTT FFFFFF:,,::,,,,,:,,FFF:,,,FF:F,: NM:i:0 MD:Z:32 MC:Z:150M AS:i:32 XS:i:32 RG:Z:normal SA:Z:12,66451373,-,12S91M47S,8,1; +A00289:748:HLCH3DSX3:4:1104:6234:8656 177 22 18873826 22 150M 12 66451373 0 CTATACCCCACATCCTCATTCCCACCAGATGTCTTGGATCATGGAGGCTCTCCAGACAAAAGCCAGCAGTTAAGCTCCAGATTTCCTATAGAATCCTTTTCTAACAACCAGTGAGTGATTCCAGAATACGTACCATTGAATGTGCTCCCT FFF:FFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:12S91M47S AS:i:150 XS:i:140 RG:Z:normal +A00289:748:HLCH3DSX3:4:1104:1741:18129 97 11 113220686 60 150M 12 66451373 0 TATTTCACCCGGATGCTGACTCCTTTTTTGGTTTCCTTGCAAGGTTACCCTGTTGGGATGTTGTTAGTTGGAGGACAGCTACCTCTGAGGTTATTATGGTTTTTTGCAGATTATTGGAAGCGCTGGTGTCATTTCTTGATTTCTCGGATA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFF,FFFFFFFFFFFFFFFFFF:FFFFFFF:FFFF NM:i:0 MD:Z:150 MC:Z:90M60S AS:i:150 XS:i:19 RG:Z:normal +A00289:748:HLCH3DSX3:4:1104:1741:18129 145 12 66451373 7 90M60S 11 113220686 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTCCCAC FF:FFFF,F,,FFFFFFFF:FFFF:,F,,:F:F::::,F::::FFF::F::,,:FFF,:::FFFFFFFFFFF:FFFFFFFF,FFFF:FFF,:,:,,::FFFFFFFFFFF,FF:FFF:F:F:F:,F,,::F,,F,,FFFFF:F:,,,,,,, NM:i:1 MD:Z:81T8 MC:Z:150M AS:i:85 XS:i:81 RG:Z:normal SA:Z:8,112168232,-,79S59M12S,0,1; XA:Z:6,-160521757,81M69S,0;4,-82515382,13S69M68S,0; +A00289:748:HLCH3DSX3:4:1104:1741:18129 401 8 112168232 0 79H59M12H 11 113220686 0 TTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTT FF,FFFF:FFF,:,:,,::FFFFFFFFFFF,FF:FFF:F:F:F:,F,,::F,,F,,FFF NM:i:1 MD:Z:46T12 MC:Z:150M AS:i:54 XS:i:53 RG:Z:normal SA:Z:12,66451373,-,90M60S,7,1; +A00289:748:HLCH3DSX3:4:1105:27281:29700 113 X 117117442 60 150M 12 66451372 0 CCCTGAAGTTGATTTTTTGAATATATCCTTGAAATCGGCTTGTTTTAATGACAATTGCCTCCGTCTCCTGAAAGTGGAGTGCAAAAAGAAACCTGGTTAATTGTAGTTTTATTATATTTAGTTCAAATTTTTCAAAGCATACCCTGACAG FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:12S89M49S AS:i:150 XS:i:0 RG:Z:normal +A00289:748:HLCH3DSX3:4:1105:27281:29700 177 12 66451372 0 12S89M49S X 117117442 0 TTTTTTAAAAATATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTATTTATTTTTTTATTTTTTTATTATTTTTTTTTTTTTTTTTTTTTTTTTGGTGCCGCCCCC ,,,,:,,F:F,,,,:FF,F:FF,::F:F:F,,F::FFF,FFFFFFFFF,FFFFFF::FFF:F:FFFFFFF,FFFFFFF::FFFF:,:FF:,,F,:,FFFFF,:,,:F,,,,:,FFFFF:F,,FFF:FFFFF:,F,F,,,:,,,,,:FFFF NM:i:3 MD:Z:73T3T3T7 MC:Z:150M AS:i:74 XS:i:72 RG:Z:normal SA:Z:15,85942734,+,20S48M82S,0,1; +A00289:748:HLCH3DSX3:4:1105:27281:29700 417 15 85942734 0 20H48M82H X 117117442 0 AAAAAAAAAAAAAAAAATAATAAAAAAATAAAAAAATAAATAAATAAA FFFF:FFF,,F:FFFFF,:,,,,F:,,:,FFFFF,:,F,,:FF:,:FF NM:i:1 MD:Z:17A30 MC:Z:150M AS:i:43 XS:i:36 RG:Z:normal SA:Z:12,66451372,-,12S89M49S,0,3; XA:Z:11,-118049564,69S35M7D34M12S,11;2,+204499190,12S31M1D21M86S,3; +A00289:748:HLCH3DSX3:4:1106:2880:19288 97 4 169630630 0 29M1I4M1I68M9D22M25S 12 66451373 0 CAGCCTGGGTGACAGAGTGAGACTCCAGCTAAGGCAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAATGAAGGAAGGAAGGAGGGAGGGAGGGAGGGAGGGAGAGATCGGAAGAGCACACGTCTGAAC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFF:FFF:FFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFF NM:i:14 MD:Z:18C8T59G13^AAGGAAGGA22 MC:Z:36S91M23S AS:i:73 XS:i:72 RG:Z:normal +A00289:748:HLCH3DSX3:4:1106:2880:19288 145 12 66451373 9 36S91M23S 4 169630630 0 TATATACTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTGGGTGGGTGGGTGGGTTGGCAG :,,,,,,,:FF:FFFF:,:FF:,F,F,FFF,,,FFF:FFFFFFFFF,:,,F:F,FFFF,:FFFFFFFFFFFFFFFFFFFF:FF:FFFF:FFFFFFF:FFF:FFFFFFF:FF,FFF:,:::,:F::,,:,,:,:,,,,,:,::,,,:F,FF NM:i:1 MD:Z:10T80 MC:Z:29M1I4M1I68M9D22M25S AS:i:86 XS:i:80 RG:Z:normal SA:Z:15,55218228,-,9S52M89S,0,0; XA:Z:15,-77910871,47S80M23S,0;6,-160521761,47S80M23S,0;6,-160521757,47S79M24S,0;7,-107410599,25S28M15D74M23S,16; +A00289:748:HLCH3DSX3:4:1106:2880:19288 401 15 55218228 0 9H52M89H 4 169630630 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTT FF:FFFF:,:FF:,F,F,FFF,,,FFF:FFFFFFFFF,:,,F:F,FFFF,:F NM:i:0 MD:Z:52 MC:Z:29M1I4M1I68M9D22M25H AS:i:52 XS:i:50 RG:Z:normal SA:Z:12,66451373,-,36S91M23S,9,1; +A00289:748:HLCH3DSX3:4:1107:2365:20462 97 22 18737670 0 150M 12 66451372 0 GAGGACACCATACAAGCCGTCGCTAACAAGGACACTGTACACAACATCGCTAATGACGGCACCGTACAAGACATCACCAATGAGGGCGCTTTATACGACATTGCTAATGATACCGACAAGGCACGCTAACGTGGACGCTGTACACGACAT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:21S90M39S AS:i:150 XS:i:150 RG:Z:normal +A00289:748:HLCH3DSX3:4:1107:2365:20462 145 12 66451372 13 21S90M39S 22 18737670 0 TTTTATATATTTATATTTATTATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTTGTTTTTTGTGGTGGGGGGGGGGTTTGGCATAGCAT F:F,,F,F,F:FFF,,,:,F:,:,,:,,,,:,F:,,FFFFFFFFFFFFF,F:FFFFFFFFF:FFFF,,FFFFF:FFF::FFFFF,F:F,,F:F,FFFFFFFF,:F:F::::,,:F,F,,F,:,,,,,::,,,,:,,:,,,,,,F,F::FF NM:i:0 MD:Z:90 MC:Z:150M AS:i:90 XS:i:84 RG:Z:normal XA:Z:6,-160521757,28S84M38S,0;15,-77910871,32S80M38S,0;7,-107410642,38S74M38S,0; +A00289:748:HLCH3DSX3:4:1107:26377:28385 113 GL000220.1 122227 60 150M 6 160521757 0 TGCTCTGTTGCTCACGCTGGTCTCAAACTCCTGGCCTTGACGCTTCTCCCGTCACATCCGCCGTCTGGTTGTTGAAATGAGCATCTCTCGTAAAATGGAAAAGATGAAAGAAATAAACACGAAGACGGAAAGCACGGTGTGAACGTTTCT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NM:i:0 MD:Z:150 MC:Z:85M65S AS:i:150 XS:i:113 RG:Z:normal +A00289:748:HLCH3DSX3:4:1107:26377:28385 177 6 160521757 0 85M65S GL000220.1 122227 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGTGTTTTTTTTTTGTTGGGGGTTGGGTGTTGGGGGGCGGGGGGGGGGGCGCCCCCCCCCAAGGAGA F:FF,F,,F,F,FFFFFF,:FF,:FF:F,:FF,FFFF:FFF:FF,FFFFFFFFFFFFFFFFFFFFFFFF:FF,F,F:F,:FF,,,FFFF,F,:,,,,,:::,,:,,,,::,,,,,,,,,F,,FF,:F:,,F,,,,,FF:FF:F,FFFF:: NM:i:0 MD:Z:85 MC:Z:150M AS:i:85 XS:i:84 RG:Z:normal XA:Z:12,-66451380,84M66S,0;12,-66451373,83M67S,0;15,-77910871,4S80M66S,0; diff --git a/run.sh b/run.sh index b906d4e..961dc81 100755 --- a/run.sh +++ b/run.sh @@ -6,17 +6,17 @@ thread=1 #reference=~/data/reference/human_g1k_v37_decoy.fasta #n_r1=~/fastq/sn_r1.fq #n_r2=~/fastq/sn_r2.fq -#n_r1=~/fastq/ssn_r1.fq -#n_r2=~/fastq/ssn_r2.fq +n_r1=~/fastq/ssn_r1.fq +n_r2=~/fastq/ssn_r2.fq #n_r1=~/fastq/tiny_n_r1.fq #n_r2=~/fastq/tiny_n_r2.fq -n_r1=~/fastq/diff_r1.fq -n_r2=~/fastq/diff_r2.fq +#n_r1=~/fastq/diff_r1.fq +#n_r2=~/fastq/diff_r2.fq #n_r1=~/fastq/d_r1.fq #n_r2=~/fastq/d_r2.fq reference=~/reference/human_g1k_v37_decoy.fasta -#out=./ssn.sam -out=./out.sam +out=./ssn.sam +#out=./out.sam #out=/dev/null #time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ # /home/zzh/data/reference/human_g1k_v37_decoy.fasta \ diff --git a/utils.c b/utils.c index 46181db..b957de8 100644 --- a/utils.c +++ b/utils.c @@ -139,6 +139,22 @@ size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream) return ret; } +uint64_t fread_fix(FILE *fp, uint64_t size, void *a) +{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks + // const int bufsize = 0x1000000; // 16M block + const int bufsize = 0x4000000; // 64M block + uint64_t offset = 0; + while (size) + { + int x = bufsize < size ? bufsize : size; + if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) + break; + size -= x; + offset += x; + } + return offset; +} + int err_gzread(gzFile file, void *ptr, unsigned int len) { int ret = gzread(file, ptr, len); diff --git a/utils.h b/utils.h index b2c8b81..5722787 100644 --- a/utils.h +++ b/utils.h @@ -48,7 +48,8 @@ extern int64_t time_ksw_extend2, time_bns; extern int64_t dn, n16, n17, n18, n19, nall, num_sa; - +extern int64_t s1n, s2n, s3n, s1l, s2l, s3l; +extern int64_t g_num_smem2; extern FILE *fp1; #endif @@ -89,6 +90,7 @@ extern "C" { gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream); + uint64_t fread_fix(FILE *fp, uint64_t size, void *a); int err_gzread(gzFile file, void *ptr, unsigned int len); int err_fseek(FILE *stream, long offset, int whence);