#include "hyb_idx.h" #include "profiling.h" #define CALC_STAT 0 static void get_seed_end_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int seed_end, uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) { uint8_t cmp_ref_val = 0; int is_head_node = 1; uint8_t* cmp_ref = &cmp_ref_val; uint8_t* prev_addr = idata; uint32_t prev_seq_pos = *seq_pos_p; uint32_t prev_hits = *hits_p; uint64_t prev_sa_start = *sa_start_p; uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); #if CALC_STAT if (next_addr != NULL) { // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); uint64_t dist = next_addr - prev_addr; if (dist < 32) gdat[0]++; else if (dist < 64) gdat[1]++; else if (dist < 128) gdat[2]++; else gdat[3]++; } #endif while (next_addr != NULL && *hits_p > 1 && (*seq_pos_p < seed_end || *hits_p >= max_hits)) { prev_addr = next_addr; prev_seq_pos = *seq_pos_p; prev_hits = *hits_p; prev_sa_start = *sa_start_p; next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); is_head_node = 0; #if CALC_STAT if (next_addr != NULL) { // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); uint64_t dist = next_addr - prev_addr; if (dist < 32) gdat[0]++; else if (dist < 64) gdat[1]++; else if (dist < 128) gdat[2]++; else gdat[3]++; } #endif } uint32_t hold_seq_pos = *seq_pos_p; uint32_t hold_hits = *hits_p; uint64_t hold_sa_start = *sa_start_p; if (*seq_pos_p > seed_end && *hits_p < max_hits) { // 检查前一个节点 *seq_pos_p = prev_seq_pos; *hits_p = prev_hits; *sa_start_p = prev_sa_start; next_addr = prev_addr; parse_one_hyb_node_max_hits(next_addr, seq_bits, seq_bp, seq_end, max_hits, seed_end - prev_seq_pos, is_head_node, seq_pos_p, sa_start_p, hits_p, tid); if (*hits_p >= max_hits) { *seq_pos_p = hold_seq_pos; *hits_p = hold_hits; *sa_start_p = hold_sa_start; } } } // assume max_hits > 2 void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range, const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid) { if (seeds_range->start == seeds_range->end) { return; } uint64_t seeds_m = seeds->m; uint64_t* seeds_cap = &seeds_m; int new_seed_len = min_seed_len + 1; int i = 0; int right_match_arr[2] = {0}; Range ff = *read_range; uint8_t type_hits = 0; uint64_t offset = 0; int seeds_i = seeds_range->start; int x = read_range->start; int x_end = x + new_seed_len; int flag_found_x_end = 0; int flag_i = 0; // PROF_START(seed_3); HybSeed s = kv_A(*seeds, seeds_i); if (s.first_len > 0 && s.first_len < new_seed_len && s.seed_start == x && s.seed_end >= x_end && s.ref_pos_arr.n <= 2) { __add_seed_one_pos(seed, s.ref_pos_arr.a[0], x, x_end); if (s.ref_pos_arr.n == 2) kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1]); x = x_end; } while (x + min_seed_len < read_range->end) { while (seeds_i < seeds_range->end && kv_A(*seeds, seeds_i).seed_end < x) ++seeds_i; if (seeds_i == seeds_range->end) break; if (seeds->a[seeds_i].seed_start > x) { x += new_seed_len; continue; } x_end = x + new_seed_len; flag_found_x_end = 0; flag_i = 0; for (i = seeds_i; i < seeds_range->end; ++i) { HybSeed* s = &kv_A(*seeds, i); if (s->seed_start >= x_end) break; if (s->seed_start <= x && s->seed_end >= x_end) { flag_found_x_end = 1; // x_end点存在seed flag_i = i; break; } } if (!flag_found_x_end) { x = x_end; continue; } // PROF_START(seed_3); get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset); // PROF_END(tprof[T_SEED_3_0][tid], seed_3); if (type_hits == 0) { x += new_seed_len; } else if (type_hits == 1) { // PROF_START(seed_3); __add_seed_one_pos(seed, offset, x, x_end); x = x_end; // PROF_END(tprof[T_SEED_3_1][tid], seed_3); } else if (type_hits == 2) { // PROF_START(seed_3); HybSeed s = kv_A(*seeds, flag_i); if (s.ref_pos_arr.n == 2) { __add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end); kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1] + x - s.seed_start); } else { // 只有一个ref_pos ff.end = x_end; uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)}; for (i = 0; i < 2; ++i) { if (s.ref_pos_arr.a[0] + x - s.seed_start == ref_pos_arr[i]) { right_match_arr[i] = MIN(s.seed_end - x, new_seed_len); } else { right_end_match(hyb, read_seq->len, &ff, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, ref_pos_arr[i], &right_match_arr[i]); } } if (right_match_arr[0] == right_match_arr[1]) { if (right_match_arr[0] == new_seed_len) { __add_seed_one_pos(seed, ref_pos_arr[0], x, x_end); kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); } } else { if (right_match_arr[0] == new_seed_len) { __add_seed_one_pos(seed, ref_pos_arr[0], x, x_end); } else if (right_match_arr[1] == new_seed_len) { __add_seed_one_pos(seed, ref_pos_arr[1], x, x_end); } } } x = x_end; // PROF_END(tprof[T_SEED_3_2][tid], seed_3); } else { uint32_t seq_pos = x + HYB_KMER_LEN; uint32_t hits = type_hits; uint64_t sa_start = 0; // PROF_START(seed_3); get_seed_end_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, max_hits, x + new_seed_len, &seq_pos, &hits, &sa_start, tid); // PROF_END(tprof[T_SEED_3_3_0][tid], seed_3); // tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++; if (seq_pos - x < new_seed_len) { // PROF_START(seed_3); HybSeed s = kv_A(*seeds, flag_i); __add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end); x = x_end; // PROF_END(tprof[T_SEED_3_3_1][tid], seed_3); } else { // PROF_START(seed_3); if (hits < max_hits) { __add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, seq_pos); int i = 0; for (i = 1; i < hits; ++i) { kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i)); } x = seq_pos; } else { x = seq_pos + 1; } // PROF_END(tprof[T_SEED_3_3_2][tid], seed_3); } // PROF_END(tprof[T_SEED_3_3][tid], seed_3); } } // PROF_END(tprof[T_SEED_3_ALL][tid], seed_3); }