hyb-align/hyb_seeding_3.c

204 lines
8.1 KiB
C

#include "hyb_idx.h"
#include "profiling.h"
#define CALC_STAT 0
static void get_seed_end_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int seed_end,
uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) {
uint8_t cmp_ref_val = 0;
int is_head_node = 1;
uint8_t* cmp_ref = &cmp_ref_val;
uint8_t* prev_addr = idata;
uint32_t prev_seq_pos = *seq_pos_p;
uint32_t prev_hits = *hits_p;
uint64_t prev_sa_start = *sa_start_p;
uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
#if CALC_STAT
if (next_addr != NULL) {
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
uint64_t dist = next_addr - prev_addr;
if (dist < 32)
gdat[0]++;
else if (dist < 64)
gdat[1]++;
else if (dist < 128)
gdat[2]++;
else
gdat[3]++;
}
#endif
while (next_addr != NULL && *hits_p > 1 && (*seq_pos_p < seed_end || *hits_p >= max_hits)) {
prev_addr = next_addr;
prev_seq_pos = *seq_pos_p;
prev_hits = *hits_p;
prev_sa_start = *sa_start_p;
next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
is_head_node = 0;
#if CALC_STAT
if (next_addr != NULL) {
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
uint64_t dist = next_addr - prev_addr;
if (dist < 32)
gdat[0]++;
else if (dist < 64)
gdat[1]++;
else if (dist < 128)
gdat[2]++;
else
gdat[3]++;
}
#endif
}
uint32_t hold_seq_pos = *seq_pos_p;
uint32_t hold_hits = *hits_p;
uint64_t hold_sa_start = *sa_start_p;
if (*seq_pos_p > seed_end && *hits_p < max_hits) {
// 检查前一个节点
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
next_addr = prev_addr;
parse_one_hyb_node_max_hits(next_addr, seq_bits, seq_bp, seq_end, max_hits, seed_end - prev_seq_pos, is_head_node,
seq_pos_p, sa_start_p, hits_p, tid);
if (*hits_p >= max_hits) {
*seq_pos_p = hold_seq_pos;
*hits_p = hold_hits;
*sa_start_p = hold_sa_start;
}
}
}
// assume max_hits > 2
void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range,
const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid) {
if (seeds_range->start == seeds_range->end) {
return;
}
uint64_t seeds_m = seeds->m;
uint64_t* seeds_cap = &seeds_m;
int new_seed_len = min_seed_len + 1;
int i = 0;
int right_match_arr[2] = {0};
Range ff = *read_range;
uint8_t type_hits = 0;
uint64_t offset = 0;
int seeds_i = seeds_range->start;
int x = read_range->start;
int x_end = x + new_seed_len;
int flag_found_x_end = 0;
int flag_i = 0;
// PROF_START(seed_3);
HybSeed s = kv_A(*seeds, seeds_i);
if (s.first_len > 0 && s.first_len < new_seed_len && s.seed_start == x && s.seed_end >= x_end && s.ref_pos_arr.n <= 2) {
__add_seed_one_pos(seed, s.ref_pos_arr.a[0], x, x_end);
if (s.ref_pos_arr.n == 2)
kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1]);
x = x_end;
}
while (x + min_seed_len < read_range->end) {
while (seeds_i < seeds_range->end && kv_A(*seeds, seeds_i).seed_end < x) ++seeds_i;
if (seeds_i == seeds_range->end)
break;
if (seeds->a[seeds_i].seed_start > x) {
x += new_seed_len;
continue;
}
x_end = x + new_seed_len;
flag_found_x_end = 0;
flag_i = 0;
for (i = seeds_i; i < seeds_range->end; ++i) {
HybSeed* s = &kv_A(*seeds, i);
if (s->seed_start >= x_end)
break;
if (s->seed_start <= x && s->seed_end >= x_end) {
flag_found_x_end = 1; // x_end点存在seed
flag_i = i;
break;
}
}
if (!flag_found_x_end) {
x = x_end;
continue;
}
// PROF_START(seed_3);
get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset);
// PROF_END(tprof[T_SEED_3_0][tid], seed_3);
if (type_hits == 0) {
x += new_seed_len;
} else if (type_hits == 1) {
// PROF_START(seed_3);
__add_seed_one_pos(seed, offset, x, x_end);
x = x_end;
// PROF_END(tprof[T_SEED_3_1][tid], seed_3);
} else if (type_hits == 2) {
// PROF_START(seed_3);
HybSeed s = kv_A(*seeds, flag_i);
if (s.ref_pos_arr.n == 2) {
__add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end);
kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1] + x - s.seed_start);
} else { // 只有一个ref_pos
ff.end = x_end;
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
for (i = 0; i < 2; ++i) {
if (s.ref_pos_arr.a[0] + x - s.seed_start == ref_pos_arr[i]) {
right_match_arr[i] = MIN(s.seed_end - x, new_seed_len);
} else {
right_end_match(hyb, read_seq->len, &ff, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN,
ref_pos_arr[i], &right_match_arr[i]);
}
}
if (right_match_arr[0] == right_match_arr[1]) {
if (right_match_arr[0] == new_seed_len) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, x_end);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
}
} else {
if (right_match_arr[0] == new_seed_len) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, x_end);
} else if (right_match_arr[1] == new_seed_len) {
__add_seed_one_pos(seed, ref_pos_arr[1], x, x_end);
}
}
}
x = x_end;
// PROF_END(tprof[T_SEED_3_2][tid], seed_3);
} else {
uint32_t seq_pos = x + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
// PROF_START(seed_3);
get_seed_end_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, max_hits,
x + new_seed_len, &seq_pos, &hits, &sa_start, tid);
// PROF_END(tprof[T_SEED_3_3_0][tid], seed_3);
// tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++;
if (seq_pos - x < new_seed_len) {
// PROF_START(seed_3);
HybSeed s = kv_A(*seeds, flag_i);
__add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end);
x = x_end;
// PROF_END(tprof[T_SEED_3_3_1][tid], seed_3);
} else {
// PROF_START(seed_3);
if (hits < max_hits) {
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, seq_pos);
int i = 0;
for (i = 1; i < hits; ++i) {
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
}
x = seq_pos;
} else {
x = seq_pos + 1;
}
// PROF_END(tprof[T_SEED_3_3_2][tid], seed_3);
}
// PROF_END(tprof[T_SEED_3_3][tid], seed_3);
}
}
// PROF_END(tprof[T_SEED_3_ALL][tid], seed_3);
}