hyb-align/hyb_seeding_2.c

209 lines
9.6 KiB
C

#include "hyb_idx.h"
#include "profiling.h"
#define CALC_STAT 0
// 需要给定初始化的hits和seq_pos
static void get_min_hits_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) {
uint8_t cmp_ref_val = 0;
int is_head_node = 1;
uint8_t* cmp_ref = &cmp_ref_val;
uint8_t* prev_addr = idata;
uint32_t prev_seq_pos = *seq_pos_p;
uint32_t prev_hits = *hits_p;
uint64_t prev_sa_start = *sa_start_p;
uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
#if CALC_STAT
if (next_addr != NULL) {
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
uint64_t dist = next_addr - prev_addr;
if (dist < 32)
gdat[0]++;
else if (dist < 64)
gdat[1]++;
else if (dist < 128)
gdat[2]++;
else
gdat[3]++;
}
#endif
while (next_addr != NULL && *hits_p >= min_hits && *seq_pos_p < seq_end) {
prev_addr = next_addr;
prev_seq_pos = *seq_pos_p;
prev_hits = *hits_p;
prev_sa_start = *sa_start_p;
next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
is_head_node = 0;
#if CALC_STAT
if (next_addr != NULL) {
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
uint64_t dist = next_addr - prev_addr;
if (dist < 32)
gdat[0]++;
else if (dist < 64)
gdat[1]++;
else if (dist < 128)
gdat[2]++;
else
gdat[3]++;
}
#endif
}
if (*hits_p < min_hits) {
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
next_addr = prev_addr;
parse_one_hyb_node_min_hits(next_addr, seq_bits, seq_bp, seq_end, min_hits, is_head_node, seq_pos_p, sa_start_p, hits_p,
tid);
}
}
// for seeding-2 , 先反向,后正向
int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start,
int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end,
const int min_seed_len, HybSeedArr* seeds, int tid) {
uint64_t seeds_m = seeds->m;
uint64_t* seeds_cap = &seeds_m;
int pivot = (seed_start + seed_end) >> 1;
int x = MAX(MAX(pivot, read_start + min_seed_len - 1), pre_pivot);
int rx = 0;
Range fr = {read_start, read_end};
Range rr = {read_seq->len - read_end, read_seq->len - read_start};
Range* read_range = &fr;
uint8_t type_hits = 0;
uint64_t offset = 0;
int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1;
int next_pivot = x;
int cur_left = 0;
int old_n = seeds->n;
int i = 0;
// PROF_START(seed_2);
#if 1
if (pre_end > pre_start && seeds->a[pre_end - 1].seed_end > pivot) {
for (i = pre_start; i < pre_end; ++i) {
HybSeed* seed = &kv_A(*seeds, i);
if (seed->seed_end > pivot) {
__check_add_seed(new_seed);
seed = &kv_A(*seeds, i);
__copy_seed(*seed, *new_seed);
}
}
}
// PROF_END(tprof[T_SEED_2_0][tid], seed_2);
#endif
while (cur_left <= pivot && x < fr.end) {
next_pivot = x;
rx = read_seq->len - x - 1; // 反向位置, 因为正向包含x, 所以这里需要减1
// PROF_START(seed_2);
get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset);
// PROF_END(tprof[T_SEED_2_0][tid], seed_2);
if (type_hits == 0) {
cur_left = x - HYB_KMER_LEN + 2;
x += extra_tend;
} else if (type_hits == 1) { // min_hits肯定大于1
cur_left = x - HYB_KMER_LEN + 2;
x += extra_tend;
} else if (type_hits == 2) {
// PROF_START(seed_2);
if (min_hits > 2) {
cur_left = x - HYB_KMER_LEN + 2;
x += extra_tend;
} else {
uint64_t ref_pos_arr[2];
int left_match_arr[2] = {0}, right_match_arr[2] = {0};
Range mr_arr[2] = {0};
int new_x = x - HYB_KMER_LEN + 1;
get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset);
ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset);
ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, offset + 1);
for (i = 0; i < 2; ++i) {
if (first_ref + new_x - seed_start == ref_pos_arr[i]) {
left_match_arr[i] = new_x - seed_start;
right_match_arr[i] = seed_end - new_x;
} else {
both_end_match(hyb, read_seq->len, &fr, read_seq->for_bits, read_seq->back_bits, new_x, HYB_KMER_LEN,
ref_pos_arr[i], &left_match_arr[i], &right_match_arr[i]);
}
_set_range(mr_arr[i], new_x - left_match_arr[i], new_x + right_match_arr[i]);
}
Range sr = {MAX(mr_arr[0].start, mr_arr[1].start), MIN(mr_arr[0].end, mr_arr[1].end)};
if (sr.end - sr.start >= min_seed_len && sr.start <= pivot) {
__add_seed_one_pos(seed, ref_pos_arr[0] - new_x + sr.start, sr.start, sr.end);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1] - new_x + sr.start);
}
cur_left = sr.start;
x = MAX(x + 1, MAX(sr.end, cur_left + min_seed_len));
}
// PROF_END(tprof[T_SEED_2_1][tid], seed_2);
} else {
// PROF_START(seed_2);
if (type_hits <= HYB_HIT_THRESH && type_hits < min_hits) {
cur_left = x - HYB_KMER_LEN + 2;
x += extra_tend;
} else {
uint32_t seq_pos = rx + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
// PROF_START(seed_2);
get_min_hits_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, min_hits, &seq_pos,
&hits, &sa_start, tid);
// PROF_END(tprof[T_SEED_2_2_0][tid], seed_2);
// tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++;
// forward search
int new_x = x - (seq_pos - rx) + 1;
if (new_x <= pivot) {
// PROF_START(seed_2);
get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset);
// PROF_END(tprof[T_SEED_2_2_1][tid], seed_2);
if (type_hits == 2) {
// PROF_START(seed_2);
int right_match = 0;
int match_end[2] = {0};
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
for (i = 0; i < 2; ++i) {
if (first_ref + new_x - seed_start == ref_pos_arr[i]) {
right_match = seed_end - new_x;
} else {
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, new_x,
HYB_KMER_LEN, ref_pos_arr[i], &right_match);
}
match_end[i] = new_x + right_match;
}
seq_pos = MIN(match_end[0], match_end[1]);
if (seq_pos - new_x >= min_seed_len) {
__add_seed_one_pos(seed, ref_pos_arr[0], new_x, seq_pos);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
}
// PROF_END(tprof[T_SEED_2_2_2][tid], seed_2);
} else {
hits = type_hits;
seq_pos = new_x + HYB_KMER_LEN;
// PROF_START(seed_2);
get_min_hits_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, fr.end, min_hits,
&seq_pos, &hits, &sa_start, tid);
// PROF_END(tprof[T_SEED_2_2_0][tid], seed_2);
// tdat[(seq_pos - new_x - HYB_KMER_LEN + 2) / 3][tid]++;
if (seq_pos - new_x >= min_seed_len) {
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), new_x, seq_pos);
for (i = 1; i < hits; ++i) {
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
}
}
// PROF_END(tprof[T_SEED_2_2_3][tid], seed_2);
}
}
cur_left = new_x;
x = MAX(seq_pos, cur_left + min_seed_len);
// x = seq_pos;
}
// PROF_END(tprof[T_SEED_2_2][tid], seed_2);
}
}
if (old_n < seeds->n) {
next_pivot = seeds->a[seeds->n - 1].seed_end;
}
// PROF_END(tprof[T_SEED_2_ALL][tid], seed_2);
return next_pivot;
}