#include "hyb_idx.h" #include "profiling.h" #define CALC_STAT 0 // 需要给定初始化的hits和seq_pos static void get_min_hits_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits, uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) { uint8_t cmp_ref_val = 0; int is_head_node = 1; uint8_t* cmp_ref = &cmp_ref_val; uint8_t* prev_addr = idata; uint32_t prev_seq_pos = *seq_pos_p; uint32_t prev_hits = *hits_p; uint64_t prev_sa_start = *sa_start_p; uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); #if CALC_STAT if (next_addr != NULL) { // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); uint64_t dist = next_addr - prev_addr; if (dist < 32) gdat[0]++; else if (dist < 64) gdat[1]++; else if (dist < 128) gdat[2]++; else gdat[3]++; } #endif while (next_addr != NULL && *hits_p >= min_hits && *seq_pos_p < seq_end) { prev_addr = next_addr; prev_seq_pos = *seq_pos_p; prev_hits = *hits_p; prev_sa_start = *sa_start_p; next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid); is_head_node = 0; #if CALC_STAT if (next_addr != NULL) { // fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr); uint64_t dist = next_addr - prev_addr; if (dist < 32) gdat[0]++; else if (dist < 64) gdat[1]++; else if (dist < 128) gdat[2]++; else gdat[3]++; } #endif } if (*hits_p < min_hits) { *seq_pos_p = prev_seq_pos; *hits_p = prev_hits; *sa_start_p = prev_sa_start; next_addr = prev_addr; parse_one_hyb_node_min_hits(next_addr, seq_bits, seq_bp, seq_end, min_hits, is_head_node, seq_pos_p, sa_start_p, hits_p, tid); } } // for seeding-2 , 先反向,后正向 int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start, int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end, const int min_seed_len, HybSeedArr* seeds, int tid) { uint64_t seeds_m = seeds->m; uint64_t* seeds_cap = &seeds_m; int pivot = (seed_start + seed_end) >> 1; int x = MAX(MAX(pivot, read_start + min_seed_len - 1), pre_pivot); int rx = 0; Range fr = {read_start, read_end}; Range rr = {read_seq->len - read_end, read_seq->len - read_start}; Range* read_range = &fr; uint8_t type_hits = 0; uint64_t offset = 0; int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1; int next_pivot = x; int cur_left = 0; int old_n = seeds->n; int i = 0; // PROF_START(seed_2); #if 1 if (pre_end > pre_start && seeds->a[pre_end - 1].seed_end > pivot) { for (i = pre_start; i < pre_end; ++i) { HybSeed* seed = &kv_A(*seeds, i); if (seed->seed_end > pivot) { __check_add_seed(new_seed); seed = &kv_A(*seeds, i); __copy_seed(*seed, *new_seed); } } } // PROF_END(tprof[T_SEED_2_0][tid], seed_2); #endif while (cur_left <= pivot && x < fr.end) { next_pivot = x; rx = read_seq->len - x - 1; // 反向位置, 因为正向包含x, 所以这里需要减1 // PROF_START(seed_2); get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset); // PROF_END(tprof[T_SEED_2_0][tid], seed_2); if (type_hits == 0) { cur_left = x - HYB_KMER_LEN + 2; x += extra_tend; } else if (type_hits == 1) { // min_hits肯定大于1 cur_left = x - HYB_KMER_LEN + 2; x += extra_tend; } else if (type_hits == 2) { // PROF_START(seed_2); if (min_hits > 2) { cur_left = x - HYB_KMER_LEN + 2; x += extra_tend; } else { uint64_t ref_pos_arr[2]; int left_match_arr[2] = {0}, right_match_arr[2] = {0}; Range mr_arr[2] = {0}; int new_x = x - HYB_KMER_LEN + 1; get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset); ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset); ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, offset + 1); for (i = 0; i < 2; ++i) { if (first_ref + new_x - seed_start == ref_pos_arr[i]) { left_match_arr[i] = new_x - seed_start; right_match_arr[i] = seed_end - new_x; } else { both_end_match(hyb, read_seq->len, &fr, read_seq->for_bits, read_seq->back_bits, new_x, HYB_KMER_LEN, ref_pos_arr[i], &left_match_arr[i], &right_match_arr[i]); } _set_range(mr_arr[i], new_x - left_match_arr[i], new_x + right_match_arr[i]); } Range sr = {MAX(mr_arr[0].start, mr_arr[1].start), MIN(mr_arr[0].end, mr_arr[1].end)}; if (sr.end - sr.start >= min_seed_len && sr.start <= pivot) { __add_seed_one_pos(seed, ref_pos_arr[0] - new_x + sr.start, sr.start, sr.end); kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1] - new_x + sr.start); } cur_left = sr.start; x = MAX(x + 1, MAX(sr.end, cur_left + min_seed_len)); } // PROF_END(tprof[T_SEED_2_1][tid], seed_2); } else { // PROF_START(seed_2); if (type_hits <= HYB_HIT_THRESH && type_hits < min_hits) { cur_left = x - HYB_KMER_LEN + 2; x += extra_tend; } else { uint32_t seq_pos = rx + HYB_KMER_LEN; uint32_t hits = type_hits; uint64_t sa_start = 0; // PROF_START(seed_2); get_min_hits_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, min_hits, &seq_pos, &hits, &sa_start, tid); // PROF_END(tprof[T_SEED_2_2_0][tid], seed_2); // tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++; // forward search int new_x = x - (seq_pos - rx) + 1; if (new_x <= pivot) { // PROF_START(seed_2); get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset); // PROF_END(tprof[T_SEED_2_2_1][tid], seed_2); if (type_hits == 2) { // PROF_START(seed_2); int right_match = 0; int match_end[2] = {0}; uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)}; for (i = 0; i < 2; ++i) { if (first_ref + new_x - seed_start == ref_pos_arr[i]) { right_match = seed_end - new_x; } else { right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, new_x, HYB_KMER_LEN, ref_pos_arr[i], &right_match); } match_end[i] = new_x + right_match; } seq_pos = MIN(match_end[0], match_end[1]); if (seq_pos - new_x >= min_seed_len) { __add_seed_one_pos(seed, ref_pos_arr[0], new_x, seq_pos); kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]); } // PROF_END(tprof[T_SEED_2_2_2][tid], seed_2); } else { hits = type_hits; seq_pos = new_x + HYB_KMER_LEN; // PROF_START(seed_2); get_min_hits_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, fr.end, min_hits, &seq_pos, &hits, &sa_start, tid); // PROF_END(tprof[T_SEED_2_2_0][tid], seed_2); // tdat[(seq_pos - new_x - HYB_KMER_LEN + 2) / 3][tid]++; if (seq_pos - new_x >= min_seed_len) { __add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), new_x, seq_pos); for (i = 1; i < hits; ++i) { kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i)); } } // PROF_END(tprof[T_SEED_2_2_3][tid], seed_2); } } cur_left = new_x; x = MAX(seq_pos, cur_left + min_seed_len); // x = seq_pos; } // PROF_END(tprof[T_SEED_2_2][tid], seed_2); } } if (old_n < seeds->n) { next_pivot = seeds->a[seeds->n - 1].seed_end; } // PROF_END(tprof[T_SEED_2_ALL][tid], seed_2); return next_pivot; }