204 lines
8.1 KiB
C
204 lines
8.1 KiB
C
#include "hyb_idx.h"
|
|
#include "profiling.h"
|
|
|
|
#define CALC_STAT 0
|
|
|
|
static void get_seed_end_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int seed_end,
|
|
uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) {
|
|
uint8_t cmp_ref_val = 0;
|
|
int is_head_node = 1;
|
|
uint8_t* cmp_ref = &cmp_ref_val;
|
|
uint8_t* prev_addr = idata;
|
|
uint32_t prev_seq_pos = *seq_pos_p;
|
|
uint32_t prev_hits = *hits_p;
|
|
uint64_t prev_sa_start = *sa_start_p;
|
|
uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
|
|
#if CALC_STAT
|
|
if (next_addr != NULL) {
|
|
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
|
|
uint64_t dist = next_addr - prev_addr;
|
|
if (dist < 32)
|
|
gdat[0]++;
|
|
else if (dist < 64)
|
|
gdat[1]++;
|
|
else if (dist < 128)
|
|
gdat[2]++;
|
|
else
|
|
gdat[3]++;
|
|
}
|
|
#endif
|
|
while (next_addr != NULL && *hits_p > 1 && (*seq_pos_p < seed_end || *hits_p >= max_hits)) {
|
|
prev_addr = next_addr;
|
|
prev_seq_pos = *seq_pos_p;
|
|
prev_hits = *hits_p;
|
|
prev_sa_start = *sa_start_p;
|
|
next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
|
|
is_head_node = 0;
|
|
#if CALC_STAT
|
|
if (next_addr != NULL) {
|
|
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
|
|
uint64_t dist = next_addr - prev_addr;
|
|
if (dist < 32)
|
|
gdat[0]++;
|
|
else if (dist < 64)
|
|
gdat[1]++;
|
|
else if (dist < 128)
|
|
gdat[2]++;
|
|
else
|
|
gdat[3]++;
|
|
}
|
|
#endif
|
|
}
|
|
uint32_t hold_seq_pos = *seq_pos_p;
|
|
uint32_t hold_hits = *hits_p;
|
|
uint64_t hold_sa_start = *sa_start_p;
|
|
|
|
if (*seq_pos_p > seed_end && *hits_p < max_hits) {
|
|
// 检查前一个节点
|
|
*seq_pos_p = prev_seq_pos;
|
|
*hits_p = prev_hits;
|
|
*sa_start_p = prev_sa_start;
|
|
next_addr = prev_addr;
|
|
parse_one_hyb_node_max_hits(next_addr, seq_bits, seq_bp, seq_end, max_hits, seed_end - prev_seq_pos, is_head_node,
|
|
seq_pos_p, sa_start_p, hits_p, tid);
|
|
if (*hits_p >= max_hits) {
|
|
*seq_pos_p = hold_seq_pos;
|
|
*hits_p = hold_hits;
|
|
*sa_start_p = hold_sa_start;
|
|
}
|
|
}
|
|
}
|
|
|
|
// assume max_hits > 2
|
|
void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range,
|
|
const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid) {
|
|
if (seeds_range->start == seeds_range->end) {
|
|
return;
|
|
}
|
|
uint64_t seeds_m = seeds->m;
|
|
uint64_t* seeds_cap = &seeds_m;
|
|
int new_seed_len = min_seed_len + 1;
|
|
|
|
int i = 0;
|
|
int right_match_arr[2] = {0};
|
|
Range ff = *read_range;
|
|
uint8_t type_hits = 0;
|
|
uint64_t offset = 0;
|
|
|
|
int seeds_i = seeds_range->start;
|
|
int x = read_range->start;
|
|
int x_end = x + new_seed_len;
|
|
int flag_found_x_end = 0;
|
|
int flag_i = 0;
|
|
// PROF_START(seed_3);
|
|
HybSeed s = kv_A(*seeds, seeds_i);
|
|
if (s.first_len > 0 && s.first_len < new_seed_len && s.seed_start == x && s.seed_end >= x_end && s.ref_pos_arr.n <= 2) {
|
|
__add_seed_one_pos(seed, s.ref_pos_arr.a[0], x, x_end);
|
|
if (s.ref_pos_arr.n == 2)
|
|
kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1]);
|
|
x = x_end;
|
|
}
|
|
while (x + min_seed_len < read_range->end) {
|
|
while (seeds_i < seeds_range->end && kv_A(*seeds, seeds_i).seed_end < x) ++seeds_i;
|
|
if (seeds_i == seeds_range->end)
|
|
break;
|
|
if (seeds->a[seeds_i].seed_start > x) {
|
|
x += new_seed_len;
|
|
continue;
|
|
}
|
|
x_end = x + new_seed_len;
|
|
flag_found_x_end = 0;
|
|
flag_i = 0;
|
|
for (i = seeds_i; i < seeds_range->end; ++i) {
|
|
HybSeed* s = &kv_A(*seeds, i);
|
|
if (s->seed_start >= x_end)
|
|
break;
|
|
if (s->seed_start <= x && s->seed_end >= x_end) {
|
|
flag_found_x_end = 1; // x_end点存在seed
|
|
flag_i = i;
|
|
break;
|
|
}
|
|
}
|
|
if (!flag_found_x_end) {
|
|
x = x_end;
|
|
continue;
|
|
}
|
|
|
|
// PROF_START(seed_3);
|
|
get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset);
|
|
// PROF_END(tprof[T_SEED_3_0][tid], seed_3);
|
|
if (type_hits == 0) {
|
|
x += new_seed_len;
|
|
} else if (type_hits == 1) {
|
|
// PROF_START(seed_3);
|
|
__add_seed_one_pos(seed, offset, x, x_end);
|
|
x = x_end;
|
|
// PROF_END(tprof[T_SEED_3_1][tid], seed_3);
|
|
} else if (type_hits == 2) {
|
|
// PROF_START(seed_3);
|
|
HybSeed s = kv_A(*seeds, flag_i);
|
|
if (s.ref_pos_arr.n == 2) {
|
|
__add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end);
|
|
kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1] + x - s.seed_start);
|
|
} else { // 只有一个ref_pos
|
|
ff.end = x_end;
|
|
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
|
|
for (i = 0; i < 2; ++i) {
|
|
if (s.ref_pos_arr.a[0] + x - s.seed_start == ref_pos_arr[i]) {
|
|
right_match_arr[i] = MIN(s.seed_end - x, new_seed_len);
|
|
} else {
|
|
right_end_match(hyb, read_seq->len, &ff, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN,
|
|
ref_pos_arr[i], &right_match_arr[i]);
|
|
}
|
|
}
|
|
if (right_match_arr[0] == right_match_arr[1]) {
|
|
if (right_match_arr[0] == new_seed_len) {
|
|
__add_seed_one_pos(seed, ref_pos_arr[0], x, x_end);
|
|
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
|
|
}
|
|
} else {
|
|
if (right_match_arr[0] == new_seed_len) {
|
|
__add_seed_one_pos(seed, ref_pos_arr[0], x, x_end);
|
|
} else if (right_match_arr[1] == new_seed_len) {
|
|
__add_seed_one_pos(seed, ref_pos_arr[1], x, x_end);
|
|
}
|
|
}
|
|
}
|
|
x = x_end;
|
|
// PROF_END(tprof[T_SEED_3_2][tid], seed_3);
|
|
} else {
|
|
uint32_t seq_pos = x + HYB_KMER_LEN;
|
|
uint32_t hits = type_hits;
|
|
uint64_t sa_start = 0;
|
|
// PROF_START(seed_3);
|
|
get_seed_end_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, max_hits,
|
|
x + new_seed_len, &seq_pos, &hits, &sa_start, tid);
|
|
// PROF_END(tprof[T_SEED_3_3_0][tid], seed_3);
|
|
// tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++;
|
|
if (seq_pos - x < new_seed_len) {
|
|
// PROF_START(seed_3);
|
|
HybSeed s = kv_A(*seeds, flag_i);
|
|
__add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end);
|
|
x = x_end;
|
|
// PROF_END(tprof[T_SEED_3_3_1][tid], seed_3);
|
|
} else {
|
|
// PROF_START(seed_3);
|
|
if (hits < max_hits) {
|
|
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, seq_pos);
|
|
int i = 0;
|
|
for (i = 1; i < hits; ++i) {
|
|
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
|
|
}
|
|
x = seq_pos;
|
|
} else {
|
|
x = seq_pos + 1;
|
|
}
|
|
// PROF_END(tprof[T_SEED_3_3_2][tid], seed_3);
|
|
}
|
|
|
|
// PROF_END(tprof[T_SEED_3_3][tid], seed_3);
|
|
}
|
|
}
|
|
// PROF_END(tprof[T_SEED_3_ALL][tid], seed_3);
|
|
}
|