hyb-align/hyb_seeding_1.c

315 lines
16 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include "hyb_idx.h"
#include "profiling.h"
static int handle_hits_1(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x,
int rx, int init_match_len, uint64_t ref_pos, const int min_seed_len, HybSeedArr* seeds,
uint64_t* seeds_cap) {
int left_match = 0, right_match = 0;
both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos, &left_match,
&right_match);
if (left_match + right_match >= min_seed_len) {
ref_pos = _rev_ref(hyb, ref_pos);
__add_seed_one_pos(seed, ref_pos - right_match + 1, x - right_match + 1, x + left_match + 1);
}
return MAX(x + left_match + 1, x - right_match + 1 + min_seed_len);
}
static int handle_hits_2(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x,
int rx, int init_match_len, uint64_t sa_pos, const int min_seed_len, HybSeedArr* seeds,
uint64_t* seeds_cap, int tid) {
int left_match_arr[2] = {0}, right_match_arr[2] = {0};
Range mr_arr[2] = {0};
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, sa_pos), hyb_sa_to_ref_pos(hyb->sa, sa_pos + 1)};
int i = 0;
for (i = 0; i < 2; ++i) {
both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos_arr[i],
&left_match_arr[i], &right_match_arr[i]);
_set_range(mr_arr[i], x - right_match_arr[i] + 1, x + left_match_arr[i] + 1);
}
if (_range_equal(mr_arr[0], mr_arr[1])) { // 相等
if (mr_arr[0].end - mr_arr[0].start >= min_seed_len) { // 正向搜索确定ref_pos的先后顺序
uint8_t type_hits = 0;
uint64_t offset = 0;
get_kmer_data(hyb, read_seq->for_bits, mr_arr[0].start, &type_hits, &offset);
if (type_hits == 2) {
ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset);
ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, offset + 1);
__add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
} else { // 需要反向搜索确定ref
uint32_t seq_pos = mr_arr[0].start + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
uint8_t cmp_ref = 0;
// PROF_START(seed_1);
get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits,
&sa_start, &cmp_ref, tid);
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, sa_start);
ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, sa_start + 1);
__add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
}
}
} else if (_range_cross(mr_arr[0], mr_arr[1])) { // 交叉
if (mr_arr[0].start < mr_arr[1].start) {
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start,
mr_arr[0].end);
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start,
mr_arr[1].end);
} else {
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start,
mr_arr[1].end);
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start,
mr_arr[0].end);
}
} else { // 包含
if (mr_arr[0].start < mr_arr[1].start || mr_arr[0].end > mr_arr[1].end) {
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start,
mr_arr[0].end);
} else {
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start,
mr_arr[1].end);
}
}
return MAX(MAX(mr_arr[0].end, mr_arr[1].end), MIN(mr_arr[0].start, mr_arr[1].start) + min_seed_len);
}
static int handle_hits_much(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, int x,
const int min_seed_len, HybSeedArr* seeds, uint64_t* seeds_cap, int tid) {
int max_reach = x + HYB_KMER_LEN;
int right_match = 0;
uint8_t type_hits = 0;
uint64_t offset = 0;
uint64_t ref_pos = 0;
uint32_t seq_pos = x + HYB_KMER_LEN;
uint32_t hits = 0;
uint64_t sa_start = 0;
uint8_t cmp_ref = 0;
int i = 0;
get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset);
if (type_hits == 2) {
int match_end[2] = {0};
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
for (i = 0; i < 2; ++i) {
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN,
ref_pos_arr[i], &right_match);
match_end[i] = x + right_match;
}
max_reach = MAX(match_end[0], match_end[1]);
if (max_reach - x >= min_seed_len) {
if (match_end[0] == match_end[1]) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
} else if (match_end[0] > match_end[1]) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, match_end[0]);
} else {
__add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]);
}
}
} else {
hits = type_hits;
// PROF_START(seed_1);
get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start,
&cmp_ref, tid);
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
// tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++;
if (seq_pos == read_range->end || !cmp_ref) {
max_reach = seq_pos;
if (max_reach - x >= min_seed_len) {
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach);
int i = 0;
for (i = 1; i < hits; ++i) {
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
}
}
} else {
ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start);
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos,
&right_match);
max_reach = x + right_match;
if (right_match >= min_seed_len) {
__add_seed_one_pos(seed, ref_pos, x, max_reach);
}
}
}
return MAX(max_reach, x + min_seed_len);
}
int seeding_from_start(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
HybSeedArr* seeds, int tid) {
// PROF_START(seed_1);
uint64_t seeds_m = seeds->m;
uint64_t* seeds_cap = &seeds_m; // 记录当前seeds的长度, 如果扩容则需要初始化ref_pos_arr.m, n, a 为0
int max_reach = read_range->start + HYB_KMER_LEN; // 返回的结果最远匹配的read的位置
int x = read_range->start; // 从read_range的起始位置开始匹配
uint64_t i = 0;
int right_match = 0;
uint8_t type_hits = 0;
uint64_t offset = 0;
uint64_t ref_pos = 0;
get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset);
// PROF_END(tprof[T_SEED_1_0][tid], seed_1);
if (type_hits == 0) {
// tdat[TD_SEED_1_0][tid]++;
} else if (type_hits == 1) {
// tdat[TD_SEED_1_1][tid]++;
// PROF_START(seed_1);
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, offset,
&right_match);
max_reach = x + right_match;
if (max_reach - x >= min_seed_len) {
__add_seed_one_pos(seed, offset, x, max_reach);
seed->first_len = HYB_KMER_LEN;
}
// PROF_END(tprof[T_SEED_1_1][tid], seed_1);
} else if (type_hits == 2) {
// tdat[TD_SEED_1_2][tid]++;
// PROF_START(seed_1);
int match_end[2] = {0};
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
for (i = 0; i < 2; ++i) {
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN,
ref_pos_arr[i], &right_match);
match_end[i] = x + right_match;
}
max_reach = MAX(match_end[0], match_end[1]);
if (max_reach - x >= min_seed_len) {
if (match_end[0] == match_end[1]) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
seed->first_len = match_end[0];
// seed->first_len = HYB_KMER_LEN;
} else if (match_end[0] > match_end[1]) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, match_end[0]);
seed->first_len = match_end[1];
// seed->first_len = HYB_KMER_LEN;
} else {
__add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]);
seed->first_len = match_end[0];
// seed->first_len = HYB_KMER_LEN;
}
}
// PROF_END(tprof[T_SEED_1_2][tid], seed_1);
} else {
// tdat[TD_SEED_1_3][tid]++;
uint32_t seq_pos = x + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
uint8_t cmp_ref = 0;
// PROF_START(seed_1);
get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start,
&cmp_ref, tid);
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
if (seq_pos == read_range->end || !cmp_ref) {
max_reach = seq_pos;
if (max_reach - x >= min_seed_len) {
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach);
if (hits == 1)
seed->first_len = seq_pos - x;
for (i = 1; i < hits; ++i) {
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
}
}
} else {
ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start);
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos,
&right_match);
max_reach = x + right_match;
if (right_match >= min_seed_len) {
__add_seed_one_pos(seed, ref_pos, x, max_reach);
seed->first_len = seq_pos - x;
}
}
// PROF_END(tprof[T_SEED_1_3][tid], seed_1);
}
// PROF_END(tprof[T_SEED_1_ALL][tid], seed_1);
// PROF_END(tprof[T_SEED_1_1][tid], seed_1);
return MAX(max_reach, x + min_seed_len);
}
//////////////
// 用hybrid-index来寻找smem(seeding-1),要求种子 hits >= min_hits_thres(>0)
void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
HybSeedArr* seeds, int tid) {
int x = seeding_from_start(hyb, read_seq, read_range, min_seed_len, seeds, tid);
int rx = 0; // 对应的反向位置
Range rr = {read_seq->len - read_range->end, read_seq->len - read_range->start};
uint64_t seeds_m = seeds->m;
uint64_t* seeds_cap = &seeds_m; // 记录当前seeds的长度, 如果扩容则需要初始化ref_pos_arr.m, n, a 为0
uint8_t type_hits = 0;
uint64_t offset = 0;
int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1;
// PROF_START(seed_1);
while (x < read_range->end) {
// 反向搜索, 此时x距离start超过16
rx = read_seq->len - x - 1; // 反向位置, 因为正向包含x, 所以这里需要减1
// PROF_START(seed_1);
get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset);
// PROF_END(tprof[T_SEED_1_0][tid], seed_1);
if (type_hits == 0) {
x += extra_tend;
// tdat[TD_SEED_1_0][tid]++;
} else if (type_hits == 1) {
// tdat[TD_SEED_1_1][tid]++;
// PROF_START(seed_1);
x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap);
// PROF_END(tprof[T_SEED_1_1][tid], seed_1);
} else if (type_hits == 2) {
// tdat[TD_SEED_1_2][tid]++;
// PROF_START(seed_1);
x = handle_hits_2(hyb, read_seq, read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap, tid);
// PROF_END(tprof[T_SEED_1_2][tid], seed_1);
} else {
// tdat[TD_SEED_1_3][tid]++;
// PROF_START(seed_1);
uint32_t seq_pos = rx + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
uint8_t cmp_ref = 0;
get_leaf_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, &seq_pos, &hits, &sa_start,
&cmp_ref, tid);
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
// tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++;
// tdat[TD_SEED_1_0][tid]++;
// if (hits == 1) {
// tdat[TD_SEED_1_1][tid]++;
// } else if (hits == 2) {
// tdat[TD_SEED_1_2][tid]++;
// } else if (hits == 3) {
// tdat[TD_SEED_1_3][tid]++;
// } else if (hits == 4) {
// tdat[TD_SEED_1_4][tid]++;
// } else {
// tdat[TD_SEED_1_5][tid]++;
// }
if (seq_pos == rr.end || !cmp_ref) {
if (hits == 1) {
// PROF_START(seed_1);
x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start),
min_seed_len, seeds, seeds_cap);
// PROF_END(tprof[T_SEED_1_3_2][tid], seed_1);
} else if (hits == 2) {
// PROF_START(seed_1);
x = handle_hits_2(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, sa_start, min_seed_len, seeds,
seeds_cap, tid);
// PROF_END(tprof[T_SEED_1_3_3][tid], seed_1);
} else {
// PROF_START(seed_1);
x = handle_hits_much(hyb, read_seq, read_range, x + rx - seq_pos + 1, min_seed_len, seeds, seeds_cap, tid);
// PROF_END(tprof[T_SEED_1_3_4][tid], seed_1);
}
} else { // hits == 1
// PROF_START(seed_1);
x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start),
min_seed_len, seeds, seeds_cap);
// PROF_END(tprof[T_SEED_1_3_5][tid], seed_1);
}
// PROF_END(tprof[T_SEED_1_3][tid], seed_1);
}
}
// PROF_END(tprof[T_SEED_1_ALL][tid], seed_1);
// PROF_END(tprof[T_SEED_1_0][tid], seed_1);
}