#include "hyb_idx.h" #include "profiling.h" static int handle_hits_1(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x, int rx, int init_match_len, uint64_t ref_pos, const int min_seed_len, HybSeedArr* seeds, uint64_t* seeds_cap) { int left_match = 0, right_match = 0; both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos, &left_match, &right_match); if (left_match + right_match >= min_seed_len) { ref_pos = _rev_ref(hyb, ref_pos); __add_seed_one_pos(seed, ref_pos - right_match + 1, x - right_match + 1, x + left_match + 1); } return MAX(x + left_match + 1, x - right_match + 1 + min_seed_len); } static int handle_hits_2(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x, int rx, int init_match_len, uint64_t sa_pos, const int min_seed_len, HybSeedArr* seeds, uint64_t* seeds_cap, int tid) { int left_match_arr[2] = {0}, right_match_arr[2] = {0}; Range mr_arr[2] = {0}; uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, sa_pos), hyb_sa_to_ref_pos(hyb->sa, sa_pos + 1)}; int i = 0; for (i = 0; i < 2; ++i) { both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos_arr[i], &left_match_arr[i], &right_match_arr[i]); _set_range(mr_arr[i], x - right_match_arr[i] + 1, x + left_match_arr[i] + 1); } if (_range_equal(mr_arr[0], mr_arr[1])) { // 相等 if (mr_arr[0].end - mr_arr[0].start >= min_seed_len) { // 正向搜索确定ref_pos的先后顺序 uint8_t type_hits = 0; uint64_t offset = 0; get_kmer_data(hyb, read_seq->for_bits, mr_arr[0].start, &type_hits, &offset); if (type_hits == 2) { ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset); ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, offset + 1); __add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end); kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); } else { // 需要反向搜索确定ref uint32_t seq_pos = mr_arr[0].start + HYB_KMER_LEN; uint32_t hits = type_hits; uint64_t sa_start = 0; uint8_t cmp_ref = 0; // PROF_START(seed_1); get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start, &cmp_ref, tid); // PROF_END(tprof[T_SEED_1_3_1][tid], seed_1); ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, sa_start); ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, sa_start + 1); __add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end); kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); } } } else if (_range_cross(mr_arr[0], mr_arr[1])) { // 交叉 if (mr_arr[0].start < mr_arr[1].start) { __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start, mr_arr[0].end); __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start, mr_arr[1].end); } else { __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start, mr_arr[1].end); __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start, mr_arr[0].end); } } else { // 包含 if (mr_arr[0].start < mr_arr[1].start || mr_arr[0].end > mr_arr[1].end) { __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start, mr_arr[0].end); } else { __check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start, mr_arr[1].end); } } return MAX(MAX(mr_arr[0].end, mr_arr[1].end), MIN(mr_arr[0].start, mr_arr[1].start) + min_seed_len); } static int handle_hits_much(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, int x, const int min_seed_len, HybSeedArr* seeds, uint64_t* seeds_cap, int tid) { int max_reach = x + HYB_KMER_LEN; int right_match = 0; uint8_t type_hits = 0; uint64_t offset = 0; uint64_t ref_pos = 0; uint32_t seq_pos = x + HYB_KMER_LEN; uint32_t hits = 0; uint64_t sa_start = 0; uint8_t cmp_ref = 0; int i = 0; get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset); if (type_hits == 2) { int match_end[2] = {0}; uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)}; for (i = 0; i < 2; ++i) { right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, ref_pos_arr[i], &right_match); match_end[i] = x + right_match; } max_reach = MAX(match_end[0], match_end[1]); if (max_reach - x >= min_seed_len) { if (match_end[0] == match_end[1]) { __add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach); kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); } else if (match_end[0] > match_end[1]) { __add_seed_one_pos(seed, ref_pos_arr[0], x, match_end[0]); } else { __add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]); } } } else { hits = type_hits; // PROF_START(seed_1); get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start, &cmp_ref, tid); // PROF_END(tprof[T_SEED_1_3_1][tid], seed_1); // tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++; if (seq_pos == read_range->end || !cmp_ref) { max_reach = seq_pos; if (max_reach - x >= min_seed_len) { __add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach); int i = 0; for (i = 1; i < hits; ++i) { kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i)); } } } else { ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start); right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos, &right_match); max_reach = x + right_match; if (right_match >= min_seed_len) { __add_seed_one_pos(seed, ref_pos, x, max_reach); } } } return MAX(max_reach, x + min_seed_len); } int seeding_from_start(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len, HybSeedArr* seeds, int tid) { // PROF_START(seed_1); uint64_t seeds_m = seeds->m; uint64_t* seeds_cap = &seeds_m; // 记录当前seeds的长度, 如果扩容,则需要初始化ref_pos_arr.m, n, a 为0 int max_reach = read_range->start + HYB_KMER_LEN; // 返回的结果,最远匹配的read的位置 int x = read_range->start; // 从read_range的起始位置开始匹配 uint64_t i = 0; int right_match = 0; uint8_t type_hits = 0; uint64_t offset = 0; uint64_t ref_pos = 0; get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset); // PROF_END(tprof[T_SEED_1_0][tid], seed_1); if (type_hits == 0) { // tdat[TD_SEED_1_0][tid]++; } else if (type_hits == 1) { // tdat[TD_SEED_1_1][tid]++; // PROF_START(seed_1); right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, offset, &right_match); max_reach = x + right_match; if (max_reach - x >= min_seed_len) { __add_seed_one_pos(seed, offset, x, max_reach); seed->first_len = HYB_KMER_LEN; } // PROF_END(tprof[T_SEED_1_1][tid], seed_1); } else if (type_hits == 2) { // tdat[TD_SEED_1_2][tid]++; // PROF_START(seed_1); int match_end[2] = {0}; uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)}; for (i = 0; i < 2; ++i) { right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, ref_pos_arr[i], &right_match); match_end[i] = x + right_match; } max_reach = MAX(match_end[0], match_end[1]); if (max_reach - x >= min_seed_len) { if (match_end[0] == match_end[1]) { __add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach); kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); seed->first_len = match_end[0]; // seed->first_len = HYB_KMER_LEN; } else if (match_end[0] > match_end[1]) { __add_seed_one_pos(seed, ref_pos_arr[0], x, match_end[0]); seed->first_len = match_end[1]; // seed->first_len = HYB_KMER_LEN; } else { __add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]); seed->first_len = match_end[0]; // seed->first_len = HYB_KMER_LEN; } } // PROF_END(tprof[T_SEED_1_2][tid], seed_1); } else { // tdat[TD_SEED_1_3][tid]++; uint32_t seq_pos = x + HYB_KMER_LEN; uint32_t hits = type_hits; uint64_t sa_start = 0; uint8_t cmp_ref = 0; // PROF_START(seed_1); get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start, &cmp_ref, tid); // PROF_END(tprof[T_SEED_1_3_1][tid], seed_1); if (seq_pos == read_range->end || !cmp_ref) { max_reach = seq_pos; if (max_reach - x >= min_seed_len) { __add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach); if (hits == 1) seed->first_len = seq_pos - x; for (i = 1; i < hits; ++i) { kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i)); } } } else { ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start); right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos, &right_match); max_reach = x + right_match; if (right_match >= min_seed_len) { __add_seed_one_pos(seed, ref_pos, x, max_reach); seed->first_len = seq_pos - x; } } // PROF_END(tprof[T_SEED_1_3][tid], seed_1); } // PROF_END(tprof[T_SEED_1_ALL][tid], seed_1); // PROF_END(tprof[T_SEED_1_1][tid], seed_1); return MAX(max_reach, x + min_seed_len); } ////////////// // 用hybrid-index来寻找smem(seeding-1),要求种子 hits >= min_hits_thres(>0) void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len, HybSeedArr* seeds, int tid) { int x = seeding_from_start(hyb, read_seq, read_range, min_seed_len, seeds, tid); int rx = 0; // 对应的反向位置 Range rr = {read_seq->len - read_range->end, read_seq->len - read_range->start}; uint64_t seeds_m = seeds->m; uint64_t* seeds_cap = &seeds_m; // 记录当前seeds的长度, 如果扩容,则需要初始化ref_pos_arr.m, n, a 为0 uint8_t type_hits = 0; uint64_t offset = 0; int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1; // PROF_START(seed_1); while (x < read_range->end) { // 反向搜索, 此时x距离start超过16 rx = read_seq->len - x - 1; // 反向位置, 因为正向包含x, 所以这里需要减1 // PROF_START(seed_1); get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset); // PROF_END(tprof[T_SEED_1_0][tid], seed_1); if (type_hits == 0) { x += extra_tend; // tdat[TD_SEED_1_0][tid]++; } else if (type_hits == 1) { // tdat[TD_SEED_1_1][tid]++; // PROF_START(seed_1); x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap); // PROF_END(tprof[T_SEED_1_1][tid], seed_1); } else if (type_hits == 2) { // tdat[TD_SEED_1_2][tid]++; // PROF_START(seed_1); x = handle_hits_2(hyb, read_seq, read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap, tid); // PROF_END(tprof[T_SEED_1_2][tid], seed_1); } else { // tdat[TD_SEED_1_3][tid]++; // PROF_START(seed_1); uint32_t seq_pos = rx + HYB_KMER_LEN; uint32_t hits = type_hits; uint64_t sa_start = 0; uint8_t cmp_ref = 0; get_leaf_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, &seq_pos, &hits, &sa_start, &cmp_ref, tid); // PROF_END(tprof[T_SEED_1_3_1][tid], seed_1); // tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++; // tdat[TD_SEED_1_0][tid]++; // if (hits == 1) { // tdat[TD_SEED_1_1][tid]++; // } else if (hits == 2) { // tdat[TD_SEED_1_2][tid]++; // } else if (hits == 3) { // tdat[TD_SEED_1_3][tid]++; // } else if (hits == 4) { // tdat[TD_SEED_1_4][tid]++; // } else { // tdat[TD_SEED_1_5][tid]++; // } if (seq_pos == rr.end || !cmp_ref) { if (hits == 1) { // PROF_START(seed_1); x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start), min_seed_len, seeds, seeds_cap); // PROF_END(tprof[T_SEED_1_3_2][tid], seed_1); } else if (hits == 2) { // PROF_START(seed_1); x = handle_hits_2(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, sa_start, min_seed_len, seeds, seeds_cap, tid); // PROF_END(tprof[T_SEED_1_3_3][tid], seed_1); } else { // PROF_START(seed_1); x = handle_hits_much(hyb, read_seq, read_range, x + rx - seq_pos + 1, min_seed_len, seeds, seeds_cap, tid); // PROF_END(tprof[T_SEED_1_3_4][tid], seed_1); } } else { // hits == 1 // PROF_START(seed_1); x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start), min_seed_len, seeds, seeds_cap); // PROF_END(tprof[T_SEED_1_3_5][tid], seed_1); } // PROF_END(tprof[T_SEED_1_3][tid], seed_1); } } // PROF_END(tprof[T_SEED_1_ALL][tid], seed_1); // PROF_END(tprof[T_SEED_1_0][tid], seed_1); }