209 lines
9.6 KiB
C
209 lines
9.6 KiB
C
#include "hyb_idx.h"
|
|
#include "profiling.h"
|
|
|
|
#define CALC_STAT 0
|
|
// 需要给定初始化的hits和seq_pos
|
|
static void get_min_hits_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
|
|
uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) {
|
|
uint8_t cmp_ref_val = 0;
|
|
int is_head_node = 1;
|
|
uint8_t* cmp_ref = &cmp_ref_val;
|
|
uint8_t* prev_addr = idata;
|
|
uint32_t prev_seq_pos = *seq_pos_p;
|
|
uint32_t prev_hits = *hits_p;
|
|
uint64_t prev_sa_start = *sa_start_p;
|
|
uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
|
|
#if CALC_STAT
|
|
if (next_addr != NULL) {
|
|
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
|
|
uint64_t dist = next_addr - prev_addr;
|
|
if (dist < 32)
|
|
gdat[0]++;
|
|
else if (dist < 64)
|
|
gdat[1]++;
|
|
else if (dist < 128)
|
|
gdat[2]++;
|
|
else
|
|
gdat[3]++;
|
|
}
|
|
#endif
|
|
while (next_addr != NULL && *hits_p >= min_hits && *seq_pos_p < seq_end) {
|
|
prev_addr = next_addr;
|
|
prev_seq_pos = *seq_pos_p;
|
|
prev_hits = *hits_p;
|
|
prev_sa_start = *sa_start_p;
|
|
next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
|
|
is_head_node = 0;
|
|
#if CALC_STAT
|
|
if (next_addr != NULL) {
|
|
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
|
|
uint64_t dist = next_addr - prev_addr;
|
|
if (dist < 32)
|
|
gdat[0]++;
|
|
else if (dist < 64)
|
|
gdat[1]++;
|
|
else if (dist < 128)
|
|
gdat[2]++;
|
|
else
|
|
gdat[3]++;
|
|
}
|
|
#endif
|
|
}
|
|
if (*hits_p < min_hits) {
|
|
*seq_pos_p = prev_seq_pos;
|
|
*hits_p = prev_hits;
|
|
*sa_start_p = prev_sa_start;
|
|
next_addr = prev_addr;
|
|
parse_one_hyb_node_min_hits(next_addr, seq_bits, seq_bp, seq_end, min_hits, is_head_node, seq_pos_p, sa_start_p, hits_p,
|
|
tid);
|
|
}
|
|
}
|
|
|
|
// for seeding-2 , 先反向,后正向
|
|
int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start,
|
|
int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end,
|
|
const int min_seed_len, HybSeedArr* seeds, int tid) {
|
|
uint64_t seeds_m = seeds->m;
|
|
uint64_t* seeds_cap = &seeds_m;
|
|
int pivot = (seed_start + seed_end) >> 1;
|
|
int x = MAX(MAX(pivot, read_start + min_seed_len - 1), pre_pivot);
|
|
int rx = 0;
|
|
Range fr = {read_start, read_end};
|
|
Range rr = {read_seq->len - read_end, read_seq->len - read_start};
|
|
Range* read_range = &fr;
|
|
uint8_t type_hits = 0;
|
|
uint64_t offset = 0;
|
|
int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1;
|
|
int next_pivot = x;
|
|
int cur_left = 0;
|
|
int old_n = seeds->n;
|
|
int i = 0;
|
|
// PROF_START(seed_2);
|
|
#if 1
|
|
if (pre_end > pre_start && seeds->a[pre_end - 1].seed_end > pivot) {
|
|
for (i = pre_start; i < pre_end; ++i) {
|
|
HybSeed* seed = &kv_A(*seeds, i);
|
|
if (seed->seed_end > pivot) {
|
|
__check_add_seed(new_seed);
|
|
seed = &kv_A(*seeds, i);
|
|
__copy_seed(*seed, *new_seed);
|
|
}
|
|
}
|
|
}
|
|
// PROF_END(tprof[T_SEED_2_0][tid], seed_2);
|
|
#endif
|
|
while (cur_left <= pivot && x < fr.end) {
|
|
next_pivot = x;
|
|
rx = read_seq->len - x - 1; // 反向位置, 因为正向包含x, 所以这里需要减1
|
|
// PROF_START(seed_2);
|
|
get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset);
|
|
// PROF_END(tprof[T_SEED_2_0][tid], seed_2);
|
|
if (type_hits == 0) {
|
|
cur_left = x - HYB_KMER_LEN + 2;
|
|
x += extra_tend;
|
|
} else if (type_hits == 1) { // min_hits肯定大于1
|
|
cur_left = x - HYB_KMER_LEN + 2;
|
|
x += extra_tend;
|
|
} else if (type_hits == 2) {
|
|
// PROF_START(seed_2);
|
|
if (min_hits > 2) {
|
|
cur_left = x - HYB_KMER_LEN + 2;
|
|
x += extra_tend;
|
|
} else {
|
|
uint64_t ref_pos_arr[2];
|
|
int left_match_arr[2] = {0}, right_match_arr[2] = {0};
|
|
Range mr_arr[2] = {0};
|
|
int new_x = x - HYB_KMER_LEN + 1;
|
|
get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset);
|
|
ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset);
|
|
ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, offset + 1);
|
|
for (i = 0; i < 2; ++i) {
|
|
if (first_ref + new_x - seed_start == ref_pos_arr[i]) {
|
|
left_match_arr[i] = new_x - seed_start;
|
|
right_match_arr[i] = seed_end - new_x;
|
|
} else {
|
|
both_end_match(hyb, read_seq->len, &fr, read_seq->for_bits, read_seq->back_bits, new_x, HYB_KMER_LEN,
|
|
ref_pos_arr[i], &left_match_arr[i], &right_match_arr[i]);
|
|
}
|
|
_set_range(mr_arr[i], new_x - left_match_arr[i], new_x + right_match_arr[i]);
|
|
}
|
|
Range sr = {MAX(mr_arr[0].start, mr_arr[1].start), MIN(mr_arr[0].end, mr_arr[1].end)};
|
|
if (sr.end - sr.start >= min_seed_len && sr.start <= pivot) {
|
|
__add_seed_one_pos(seed, ref_pos_arr[0] - new_x + sr.start, sr.start, sr.end);
|
|
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1] - new_x + sr.start);
|
|
}
|
|
cur_left = sr.start;
|
|
x = MAX(x + 1, MAX(sr.end, cur_left + min_seed_len));
|
|
}
|
|
// PROF_END(tprof[T_SEED_2_1][tid], seed_2);
|
|
} else {
|
|
// PROF_START(seed_2);
|
|
if (type_hits <= HYB_HIT_THRESH && type_hits < min_hits) {
|
|
cur_left = x - HYB_KMER_LEN + 2;
|
|
x += extra_tend;
|
|
} else {
|
|
uint32_t seq_pos = rx + HYB_KMER_LEN;
|
|
uint32_t hits = type_hits;
|
|
uint64_t sa_start = 0;
|
|
// PROF_START(seed_2);
|
|
get_min_hits_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, min_hits, &seq_pos,
|
|
&hits, &sa_start, tid);
|
|
// PROF_END(tprof[T_SEED_2_2_0][tid], seed_2);
|
|
// tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++;
|
|
// forward search
|
|
int new_x = x - (seq_pos - rx) + 1;
|
|
if (new_x <= pivot) {
|
|
// PROF_START(seed_2);
|
|
get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset);
|
|
// PROF_END(tprof[T_SEED_2_2_1][tid], seed_2);
|
|
if (type_hits == 2) {
|
|
// PROF_START(seed_2);
|
|
int right_match = 0;
|
|
int match_end[2] = {0};
|
|
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
|
|
for (i = 0; i < 2; ++i) {
|
|
if (first_ref + new_x - seed_start == ref_pos_arr[i]) {
|
|
right_match = seed_end - new_x;
|
|
} else {
|
|
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, new_x,
|
|
HYB_KMER_LEN, ref_pos_arr[i], &right_match);
|
|
}
|
|
match_end[i] = new_x + right_match;
|
|
}
|
|
seq_pos = MIN(match_end[0], match_end[1]);
|
|
if (seq_pos - new_x >= min_seed_len) {
|
|
__add_seed_one_pos(seed, ref_pos_arr[0], new_x, seq_pos);
|
|
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
|
|
}
|
|
// PROF_END(tprof[T_SEED_2_2_2][tid], seed_2);
|
|
} else {
|
|
hits = type_hits;
|
|
seq_pos = new_x + HYB_KMER_LEN;
|
|
// PROF_START(seed_2);
|
|
get_min_hits_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, fr.end, min_hits,
|
|
&seq_pos, &hits, &sa_start, tid);
|
|
// PROF_END(tprof[T_SEED_2_2_0][tid], seed_2);
|
|
// tdat[(seq_pos - new_x - HYB_KMER_LEN + 2) / 3][tid]++;
|
|
if (seq_pos - new_x >= min_seed_len) {
|
|
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), new_x, seq_pos);
|
|
for (i = 1; i < hits; ++i) {
|
|
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
|
|
}
|
|
}
|
|
// PROF_END(tprof[T_SEED_2_2_3][tid], seed_2);
|
|
}
|
|
}
|
|
cur_left = new_x;
|
|
x = MAX(seq_pos, cur_left + min_seed_len);
|
|
// x = seq_pos;
|
|
}
|
|
// PROF_END(tprof[T_SEED_2_2][tid], seed_2);
|
|
}
|
|
}
|
|
if (old_n < seeds->n) {
|
|
next_pivot = seeds->a[seeds->n - 1].seed_end;
|
|
}
|
|
// PROF_END(tprof[T_SEED_2_ALL][tid], seed_2);
|
|
return next_pivot;
|
|
}
|