315 lines
16 KiB
C
315 lines
16 KiB
C
#include "hyb_idx.h"
|
||
#include "profiling.h"
|
||
|
||
static int handle_hits_1(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x,
|
||
int rx, int init_match_len, uint64_t ref_pos, const int min_seed_len, HybSeedArr* seeds,
|
||
uint64_t* seeds_cap) {
|
||
int left_match = 0, right_match = 0;
|
||
both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos, &left_match,
|
||
&right_match);
|
||
if (left_match + right_match >= min_seed_len) {
|
||
ref_pos = _rev_ref(hyb, ref_pos);
|
||
__add_seed_one_pos(seed, ref_pos - right_match + 1, x - right_match + 1, x + left_match + 1);
|
||
}
|
||
return MAX(x + left_match + 1, x - right_match + 1 + min_seed_len);
|
||
}
|
||
|
||
static int handle_hits_2(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x,
|
||
int rx, int init_match_len, uint64_t sa_pos, const int min_seed_len, HybSeedArr* seeds,
|
||
uint64_t* seeds_cap, int tid) {
|
||
int left_match_arr[2] = {0}, right_match_arr[2] = {0};
|
||
Range mr_arr[2] = {0};
|
||
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, sa_pos), hyb_sa_to_ref_pos(hyb->sa, sa_pos + 1)};
|
||
int i = 0;
|
||
for (i = 0; i < 2; ++i) {
|
||
both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos_arr[i],
|
||
&left_match_arr[i], &right_match_arr[i]);
|
||
_set_range(mr_arr[i], x - right_match_arr[i] + 1, x + left_match_arr[i] + 1);
|
||
}
|
||
if (_range_equal(mr_arr[0], mr_arr[1])) { // 相等
|
||
if (mr_arr[0].end - mr_arr[0].start >= min_seed_len) { // 正向搜索确定ref_pos的先后顺序
|
||
uint8_t type_hits = 0;
|
||
uint64_t offset = 0;
|
||
get_kmer_data(hyb, read_seq->for_bits, mr_arr[0].start, &type_hits, &offset);
|
||
if (type_hits == 2) {
|
||
ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset);
|
||
ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, offset + 1);
|
||
__add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end);
|
||
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
|
||
} else { // 需要反向搜索确定ref
|
||
uint32_t seq_pos = mr_arr[0].start + HYB_KMER_LEN;
|
||
uint32_t hits = type_hits;
|
||
uint64_t sa_start = 0;
|
||
uint8_t cmp_ref = 0;
|
||
// PROF_START(seed_1);
|
||
get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits,
|
||
&sa_start, &cmp_ref, tid);
|
||
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
|
||
ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, sa_start);
|
||
ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, sa_start + 1);
|
||
__add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end);
|
||
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
|
||
}
|
||
}
|
||
} else if (_range_cross(mr_arr[0], mr_arr[1])) { // 交叉
|
||
if (mr_arr[0].start < mr_arr[1].start) {
|
||
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start,
|
||
mr_arr[0].end);
|
||
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start,
|
||
mr_arr[1].end);
|
||
} else {
|
||
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start,
|
||
mr_arr[1].end);
|
||
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start,
|
||
mr_arr[0].end);
|
||
}
|
||
} else { // 包含
|
||
if (mr_arr[0].start < mr_arr[1].start || mr_arr[0].end > mr_arr[1].end) {
|
||
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start,
|
||
mr_arr[0].end);
|
||
} else {
|
||
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start,
|
||
mr_arr[1].end);
|
||
}
|
||
}
|
||
return MAX(MAX(mr_arr[0].end, mr_arr[1].end), MIN(mr_arr[0].start, mr_arr[1].start) + min_seed_len);
|
||
}
|
||
|
||
static int handle_hits_much(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, int x,
|
||
const int min_seed_len, HybSeedArr* seeds, uint64_t* seeds_cap, int tid) {
|
||
int max_reach = x + HYB_KMER_LEN;
|
||
int right_match = 0;
|
||
uint8_t type_hits = 0;
|
||
uint64_t offset = 0;
|
||
uint64_t ref_pos = 0;
|
||
uint32_t seq_pos = x + HYB_KMER_LEN;
|
||
uint32_t hits = 0;
|
||
uint64_t sa_start = 0;
|
||
uint8_t cmp_ref = 0;
|
||
int i = 0;
|
||
|
||
get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset);
|
||
if (type_hits == 2) {
|
||
int match_end[2] = {0};
|
||
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
|
||
for (i = 0; i < 2; ++i) {
|
||
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN,
|
||
ref_pos_arr[i], &right_match);
|
||
match_end[i] = x + right_match;
|
||
}
|
||
max_reach = MAX(match_end[0], match_end[1]);
|
||
if (max_reach - x >= min_seed_len) {
|
||
if (match_end[0] == match_end[1]) {
|
||
__add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach);
|
||
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
|
||
} else if (match_end[0] > match_end[1]) {
|
||
__add_seed_one_pos(seed, ref_pos_arr[0], x, match_end[0]);
|
||
} else {
|
||
__add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]);
|
||
}
|
||
}
|
||
} else {
|
||
hits = type_hits;
|
||
// PROF_START(seed_1);
|
||
get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start,
|
||
&cmp_ref, tid);
|
||
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
|
||
// tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++;
|
||
if (seq_pos == read_range->end || !cmp_ref) {
|
||
max_reach = seq_pos;
|
||
if (max_reach - x >= min_seed_len) {
|
||
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach);
|
||
int i = 0;
|
||
for (i = 1; i < hits; ++i) {
|
||
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
|
||
}
|
||
}
|
||
} else {
|
||
ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start);
|
||
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos,
|
||
&right_match);
|
||
max_reach = x + right_match;
|
||
if (right_match >= min_seed_len) {
|
||
__add_seed_one_pos(seed, ref_pos, x, max_reach);
|
||
}
|
||
}
|
||
}
|
||
return MAX(max_reach, x + min_seed_len);
|
||
}
|
||
|
||
int seeding_from_start(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
|
||
HybSeedArr* seeds, int tid) {
|
||
// PROF_START(seed_1);
|
||
uint64_t seeds_m = seeds->m;
|
||
uint64_t* seeds_cap = &seeds_m; // 记录当前seeds的长度, 如果扩容,则需要初始化ref_pos_arr.m, n, a 为0
|
||
int max_reach = read_range->start + HYB_KMER_LEN; // 返回的结果,最远匹配的read的位置
|
||
int x = read_range->start; // 从read_range的起始位置开始匹配
|
||
uint64_t i = 0;
|
||
int right_match = 0;
|
||
uint8_t type_hits = 0;
|
||
uint64_t offset = 0;
|
||
uint64_t ref_pos = 0;
|
||
get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset);
|
||
// PROF_END(tprof[T_SEED_1_0][tid], seed_1);
|
||
if (type_hits == 0) {
|
||
// tdat[TD_SEED_1_0][tid]++;
|
||
} else if (type_hits == 1) {
|
||
// tdat[TD_SEED_1_1][tid]++;
|
||
// PROF_START(seed_1);
|
||
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, offset,
|
||
&right_match);
|
||
max_reach = x + right_match;
|
||
if (max_reach - x >= min_seed_len) {
|
||
__add_seed_one_pos(seed, offset, x, max_reach);
|
||
seed->first_len = HYB_KMER_LEN;
|
||
}
|
||
// PROF_END(tprof[T_SEED_1_1][tid], seed_1);
|
||
} else if (type_hits == 2) {
|
||
// tdat[TD_SEED_1_2][tid]++;
|
||
// PROF_START(seed_1);
|
||
int match_end[2] = {0};
|
||
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
|
||
for (i = 0; i < 2; ++i) {
|
||
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN,
|
||
ref_pos_arr[i], &right_match);
|
||
match_end[i] = x + right_match;
|
||
}
|
||
max_reach = MAX(match_end[0], match_end[1]);
|
||
if (max_reach - x >= min_seed_len) {
|
||
if (match_end[0] == match_end[1]) {
|
||
__add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach);
|
||
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
|
||
seed->first_len = match_end[0];
|
||
// seed->first_len = HYB_KMER_LEN;
|
||
} else if (match_end[0] > match_end[1]) {
|
||
__add_seed_one_pos(seed, ref_pos_arr[0], x, match_end[0]);
|
||
seed->first_len = match_end[1];
|
||
// seed->first_len = HYB_KMER_LEN;
|
||
} else {
|
||
__add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]);
|
||
seed->first_len = match_end[0];
|
||
// seed->first_len = HYB_KMER_LEN;
|
||
}
|
||
}
|
||
// PROF_END(tprof[T_SEED_1_2][tid], seed_1);
|
||
} else {
|
||
// tdat[TD_SEED_1_3][tid]++;
|
||
uint32_t seq_pos = x + HYB_KMER_LEN;
|
||
uint32_t hits = type_hits;
|
||
uint64_t sa_start = 0;
|
||
uint8_t cmp_ref = 0;
|
||
// PROF_START(seed_1);
|
||
get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start,
|
||
&cmp_ref, tid);
|
||
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
|
||
if (seq_pos == read_range->end || !cmp_ref) {
|
||
max_reach = seq_pos;
|
||
if (max_reach - x >= min_seed_len) {
|
||
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach);
|
||
if (hits == 1)
|
||
seed->first_len = seq_pos - x;
|
||
for (i = 1; i < hits; ++i) {
|
||
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
|
||
}
|
||
}
|
||
} else {
|
||
ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start);
|
||
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos,
|
||
&right_match);
|
||
max_reach = x + right_match;
|
||
if (right_match >= min_seed_len) {
|
||
__add_seed_one_pos(seed, ref_pos, x, max_reach);
|
||
seed->first_len = seq_pos - x;
|
||
}
|
||
}
|
||
// PROF_END(tprof[T_SEED_1_3][tid], seed_1);
|
||
}
|
||
// PROF_END(tprof[T_SEED_1_ALL][tid], seed_1);
|
||
// PROF_END(tprof[T_SEED_1_1][tid], seed_1);
|
||
return MAX(max_reach, x + min_seed_len);
|
||
}
|
||
|
||
//////////////
|
||
// 用hybrid-index来寻找smem(seeding-1),要求种子 hits >= min_hits_thres(>0)
|
||
void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
|
||
HybSeedArr* seeds, int tid) {
|
||
int x = seeding_from_start(hyb, read_seq, read_range, min_seed_len, seeds, tid);
|
||
int rx = 0; // 对应的反向位置
|
||
Range rr = {read_seq->len - read_range->end, read_seq->len - read_range->start};
|
||
uint64_t seeds_m = seeds->m;
|
||
uint64_t* seeds_cap = &seeds_m; // 记录当前seeds的长度, 如果扩容,则需要初始化ref_pos_arr.m, n, a 为0
|
||
uint8_t type_hits = 0;
|
||
uint64_t offset = 0;
|
||
int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1;
|
||
// PROF_START(seed_1);
|
||
while (x < read_range->end) {
|
||
// 反向搜索, 此时x距离start超过16
|
||
rx = read_seq->len - x - 1; // 反向位置, 因为正向包含x, 所以这里需要减1
|
||
// PROF_START(seed_1);
|
||
get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset);
|
||
// PROF_END(tprof[T_SEED_1_0][tid], seed_1);
|
||
if (type_hits == 0) {
|
||
x += extra_tend;
|
||
// tdat[TD_SEED_1_0][tid]++;
|
||
} else if (type_hits == 1) {
|
||
// tdat[TD_SEED_1_1][tid]++;
|
||
// PROF_START(seed_1);
|
||
x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap);
|
||
// PROF_END(tprof[T_SEED_1_1][tid], seed_1);
|
||
} else if (type_hits == 2) {
|
||
// tdat[TD_SEED_1_2][tid]++;
|
||
// PROF_START(seed_1);
|
||
x = handle_hits_2(hyb, read_seq, read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap, tid);
|
||
// PROF_END(tprof[T_SEED_1_2][tid], seed_1);
|
||
} else {
|
||
// tdat[TD_SEED_1_3][tid]++;
|
||
// PROF_START(seed_1);
|
||
uint32_t seq_pos = rx + HYB_KMER_LEN;
|
||
uint32_t hits = type_hits;
|
||
uint64_t sa_start = 0;
|
||
uint8_t cmp_ref = 0;
|
||
get_leaf_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, &seq_pos, &hits, &sa_start,
|
||
&cmp_ref, tid);
|
||
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
|
||
// tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++;
|
||
// tdat[TD_SEED_1_0][tid]++;
|
||
// if (hits == 1) {
|
||
// tdat[TD_SEED_1_1][tid]++;
|
||
// } else if (hits == 2) {
|
||
// tdat[TD_SEED_1_2][tid]++;
|
||
// } else if (hits == 3) {
|
||
// tdat[TD_SEED_1_3][tid]++;
|
||
// } else if (hits == 4) {
|
||
// tdat[TD_SEED_1_4][tid]++;
|
||
// } else {
|
||
// tdat[TD_SEED_1_5][tid]++;
|
||
// }
|
||
if (seq_pos == rr.end || !cmp_ref) {
|
||
if (hits == 1) {
|
||
// PROF_START(seed_1);
|
||
x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start),
|
||
min_seed_len, seeds, seeds_cap);
|
||
// PROF_END(tprof[T_SEED_1_3_2][tid], seed_1);
|
||
} else if (hits == 2) {
|
||
// PROF_START(seed_1);
|
||
x = handle_hits_2(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, sa_start, min_seed_len, seeds,
|
||
seeds_cap, tid);
|
||
// PROF_END(tprof[T_SEED_1_3_3][tid], seed_1);
|
||
} else {
|
||
// PROF_START(seed_1);
|
||
x = handle_hits_much(hyb, read_seq, read_range, x + rx - seq_pos + 1, min_seed_len, seeds, seeds_cap, tid);
|
||
// PROF_END(tprof[T_SEED_1_3_4][tid], seed_1);
|
||
}
|
||
} else { // hits == 1
|
||
// PROF_START(seed_1);
|
||
x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start),
|
||
min_seed_len, seeds, seeds_cap);
|
||
// PROF_END(tprof[T_SEED_1_3_5][tid], seed_1);
|
||
}
|
||
// PROF_END(tprof[T_SEED_1_3][tid], seed_1);
|
||
}
|
||
}
|
||
// PROF_END(tprof[T_SEED_1_ALL][tid], seed_1);
|
||
// PROF_END(tprof[T_SEED_1_0][tid], seed_1);
|
||
}
|