hybrid-index和bwt结果一致

This commit is contained in:
zzh 2025-11-16 01:37:21 +08:00
parent caf01ce0f3
commit 70979c1b60
35 changed files with 5047 additions and 410 deletions

1
.gitignore vendored
View File

@ -8,6 +8,7 @@ bwamem-lite
test_index/
index/
orig_index/
output/
run.sh
debug.sh
hybalign

33
.vscode/launch.json vendored
View File

@ -9,7 +9,7 @@
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"program": "${workspaceRoot}/hybalign",
"args": [
"mem",
"-t",
@ -17,9 +17,9 @@
"-M",
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/data/fmt_ref/human_g1k_v37_decoy.fasta",
"./b1.fq",
"./b2.fq",
"/home/zzh/work/bioinfo/hyb-align/index/human_g1k_v37_decoy.fasta",
//"./b1.fq",
//"./b2.fq",
//"./b1.fq",
//"~/data/dataset/real/D1/n1.fq",
//"~/data/dataset/real/D1/n2.fq",
@ -29,11 +29,11 @@
//"~/data/dataset/real/D3/n2.fq",
//"~/data/dataset/real/D1/n1.fq.gz",
//"~/data/dataset/real/D1/n2.fq.gz",
//"~/data/dataset/real/D3/1w1.fq",
//"~/data/dataset/real/D3/1w2.fq",
"~/data/dataset/real/D3/1w1.fq",
"~/data/dataset/real/D3/1w2.fq",
"-o",
"/dev/null",
//"-Z",
// "-g",
],
"cwd": "${workspaceFolder}", //
},
@ -42,7 +42,7 @@
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"program": "${workspaceRoot}/hybalign",
"args": [
"index",
"~/data/reference/human_g1k_v37_decoy.fasta"
@ -54,7 +54,7 @@
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"program": "${workspaceRoot}/hybalign",
"args": [
"buildkmer",
"~/data/reference/human_g1k_v37_decoy.fasta.256.64.fmt",
@ -67,7 +67,7 @@
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"program": "${workspaceRoot}/hybalign",
"args": [
"shm",
"-Z",
@ -80,7 +80,7 @@
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"program": "${workspaceRoot}/hybalign",
"args": [
"pac2bref",
"~/data1/fmt_ref/human_g1k_v37_decoy.fasta"
@ -102,18 +102,15 @@
"cwd": "${workspaceFolder}", //
},
{
"name": "train hybrid index",
"name": "fa2pac",
"preLaunchTask": "Build",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceRoot}/hbwa",
"program": "${workspaceRoot}/hybalign",
"args": [
"trainhybrid",
"-t",
"1",
"fa2pac",
"-f",
"~/data/fmt_ref/human_g1k_v37_decoy.fasta",
"~/data/dataset/real/D1/n1.fq.gz",
"~/data/dataset/real/D1/n2.fq.gz"
],
"cwd": "${workspaceFolder}", //
},

View File

@ -70,6 +70,7 @@
"share_mem.h": "c",
"kseq.h": "c",
"ostream": "c",
"streambuf": "c"
"streambuf": "c",
"kbtree.h": "c"
}
}

View File

@ -1,10 +1,11 @@
CC= gcc
#CC= clang --analyze
CFLAGS= -g -Wall -Wno-unused-function -O3
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -O3
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
HYBOBJS= hyb_bwa.o hyb_utils.o hyb_seeding_1.o hyb_seeding_2.o hyb_seeding_3.o hyb_create_idx.o debug.o profiling.o
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -DUSE_AVX2_EXT -DSHOW_PERF -DDEBUG_FILE_OUTPUT
HYBOBJS= hyb_bwa.o hyb_utils.o hyb_seeding_1.o hyb_seeding_2.o hyb_seeding_3.o hyb_create_idx.o debug.o profiling.o share_mem.o yarn.o \
ksw_extend2_avx2.o ksw_extend2_avx2_u8.o
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \

View File

@ -346,7 +346,9 @@ int bwa_fa2pac(int argc, char *argv[])
return 1;
}
fp = xzopen(argv[optind], "r");
start_async_read(fp);
bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only);
stop_async_read(fp);
err_gzclose(fp);
return 0;
}

236
bwa.c
View File

@ -24,16 +24,19 @@
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <string.h>
#include <stdio.h>
#include <zlib.h>
#include <assert.h>
#include "bntseq.h"
#include "bwa.h"
#include "ksw.h"
#include "utils.h"
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>
#include "bntseq.h"
#include "kstring.h"
#include "ksw.h"
#include "kvec.h"
#include "utils.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
@ -57,28 +60,193 @@ static inline void trim_readno(kstring_t *s)
s->l -= 2, s->s[s->l] = 0;
}
static inline char *dupkstring(const kstring_t *str, int dupempty)
{
char *s = (str->l > 0 || dupempty)? malloc(str->l + 1) : NULL;
if (!s) return NULL;
memcpy(s, str->s, str->l);
s[str->l] = '\0';
return s;
/*
 * Copy the contents of `str` into the reusable buffer `*dstp`.
 *
 *   str       source string (length str->l, bytes str->s)
 *   dupempty  when zero, an empty source releases the buffer and leaves *dstp NULL
 *   dstp      in/out: destination buffer; grown with realloc() when too small
 *   sm        in/out: number of payload bytes *dstp can hold (excluding the NUL)
 *
 * On return *dstp is either NULL (empty source with dupempty == 0, or allocation
 * failure) or a NUL-terminated copy of `str`. *sm is only raised after the
 * buffer has really been grown, so the pointer and its recorded capacity never
 * disagree.
 */
static inline void dupkstring(const kstring_t *str, int dupempty, char **dstp, int *sm)
{
    if (!dupempty && str->l == 0) {
        free(*dstp); /* free(NULL) is a no-op */
        *dstp = NULL;
        *sm = 0;
        return;
    }
    if (*dstp == NULL || (size_t)*sm < str->l) {
        char *grown = (char *)realloc(*dstp, str->l + 1);
        if (grown == NULL) {
            /* Do not leak the old buffer or keep a stale capacity around. */
            free(*dstp);
            *dstp = NULL;
            *sm = 0;
            return;
        }
        *dstp = grown;
        *sm = (int)str->l;
    }
    if (str->l > 0) /* str->s may be NULL for an empty string */
        memcpy(*dstp, str->s, str->l);
    (*dstp)[str->l] = '\0';
}
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s, int copy_comment)
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
s->name = dupkstring(&ks->name, 1);
s->comment = dupkstring(&ks->comment, 0);
s->seq = dupkstring(&ks->seq, 1);
s->qual = dupkstring(&ks->qual, 0);
s->l_seq = ks->seq.l;
dupkstring(&ks->name, 1, &s->name, &s->m_name);
if (copy_comment) dupkstring(&ks->comment, 0, &s->comment, &s->m_comment);
dupkstring(&ks->seq, 1, &s->seq, &s->m_seq);
dupkstring(&ks->qual, 0, &s->qual, &s->m_qual);
s->l_seq = ks->seq.l;
}
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
{
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
// Job description (inputs) and result slots (ret_*) for one thread_bseq_read() call.
typedef struct {
kseq_t* ks; // input stream the reader pulls records from
bseq1_t* seq; // base of the shared read array the reader writes into
int start_pos; // index of the first slot to fill; later reads go to start_pos + 2, + 4, ...
int n_bound; // maximum number of reads the reader may store
int copy_comment; // forwarded to kseq2bseq1(): non-zero copies the record comment
int ret_n; // out: number of reads actually stored
int ret_size; // out: sum of l_seq over the stored reads
int ret_status; // out: last kseq_read() return value (stays 1 if kseq_read() was never called)
int chunk_size; // the reader stops once the summed l_seq reaches this value
} read_data_t;
/*
 * Thread entry point: pull records from job->ks and store them in every other
 * slot of job->seq, starting at job->start_pos.
 *
 * Reading stops when job->n_bound reads have been stored, when kseq_read()
 * returns a negative value, or as soon as the summed read length reaches
 * job->chunk_size. The number of reads, their total length and the last
 * kseq_read() status are reported through job->ret_n, job->ret_size and
 * job->ret_status. Always returns NULL.
 */
static void *thread_bseq_read(void *data)
{
    read_data_t *job = (read_data_t *)data;
    int n_read = 0, total_len = 0, status = 1;
    int slot;

    for (slot = job->start_pos; n_read < job->n_bound; slot += 2) {
        bseq1_t *dst;

        status = kseq_read(job->ks);
        if (status < 0)
            break;

        dst = &job->seq[slot];
        trim_readno(&job->ks->name);
        kseq2bseq1(job->ks, dst, job->copy_comment);
        dst->id = slot;
        total_len += dst->l_seq;
        ++n_read;

        if (total_len >= job->chunk_size)
            break;
    }

    job->ret_n = n_read;
    job->ret_size = total_len;
    job->ret_status = status;
    return 0;
}
// Store the record currently held by `ksin` into seqs[n]: trim the read-number
// suffix from its name, copy it, tag it with its index, add its length to `size`
// and advance `n`. Expects `seqs`, `n`, `size` and `copy_comment` to be in scope
// at the expansion site.
// Wrapped in do { ... } while (0) so the expansion is a single statement and
// stays correct under an unbraced if/else; invoke it as `READ_ONE_SEQ(ks);`.
#define READ_ONE_SEQ(ksin)                            \
    do {                                              \
        trim_readno(&(ksin)->name);                   \
        kseq2bseq1((ksin), &seqs[n], copy_comment);   \
        seqs[n].id = n;                               \
        size += seqs[n++].l_seq;                      \
    } while (0)
/*
 * Read one batch of paired-end reads, using one reader thread per input file.
 *
 *   chunk_size    target total number of bases per batch
 *   n_            out: number of reads stored (mates interleaved: even slots
 *                 come from ks1_, odd slots from ks2_)
 *   ks1_, ks2_    kseq_t streams of the 1st and 2nd file
 *   copy_comment  forwarded to kseq2bseq1()
 *   size_         out: total number of bases stored
 *   m_            in/out: capacity of *seqs_ptr in reads; 0 means "not allocated yet"
 *   seqs_ptr      in/out: read array; allocated on the first call, may be
 *                 reallocated, and its slots are reused by later calls
 *
 * On the first call (*m_ == 0) up to 20 reads are read serially to estimate the
 * average read length, from which the capacity is derived. The two threads then
 * fill the even/odd slots concurrently; if they stop after a different number of
 * reads the shorter side is topped up serially, and if the batch is still below
 * chunk_size the remainder is read serially pair by pair.
 */
void bseq_read_pe_mt(int chunk_size, int* n_, void* ks1_, void* ks2_, int copy_comment, int64_t* size_, int* m_, bseq1_t** seqs_ptr) {
    kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
    int size = 0, m = *m_, n = 0;
    bseq1_t* seqs = *seqs_ptr;
    read_data_t d[2];
    pthread_t tid[2];
    int started[2];
    int t;
    const int chunk_size_narrow = 4 * 1024 * 1024;
    const int init_n_reads = 20;

    if (m == 0) { // nothing allocated yet: initialise
        // Read 20 reads first; their length together with chunk_size decides
        // how many reads the array must hold.
        seqs = (bseq1_t*)calloc(init_n_reads, sizeof(bseq1_t));
#if 1
        int ks1_ret = 0, ks2_ret = 0;
        int i = init_n_reads >> 1;
        int avg_len;
        while (i-- > 0) {
            ks1_ret = kseq_read(ks);
            if (ks1_ret < 0)
                break;
            ks2_ret = kseq_read(ks2);
            if (ks2_ret < 0) {
                fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
                break;
            }
            READ_ONE_SEQ(ks);
            READ_ONE_SEQ(ks2);
        }
        if (ks1_ret < 0 || ks2_ret < 0) {
            if (size == 0 && kseq_read(ks2) >= 0) { // test if the 2nd file is finished
                fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
            }
            // NOTE(review): the array really holds init_n_reads slots, but the
            // capacity reported here is n, as before - confirm what callers expect.
            *n_ = n;
            *seqs_ptr = seqs;
            *size_ = size;
            *m_ = n;
            return;
        }
        // Guard the estimate: very short reads would make the average 0
        // (division by zero), and a small chunk_size would give m < n and a
        // negative memset length below.
        avg_len = size / init_n_reads;
        if (avg_len < 1)
            avg_len = 1;
        m = (chunk_size + avg_len - 1) / avg_len;
        if (m < n)
            m = n;
#else
        m = 50000;
#endif
        seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t));
        memset(seqs + n, 0, sizeof(bseq1_t) * (m - n));
    }

    // Both threads share the array; thread 0 fills slots n, n+2, ... and
    // thread 1 fills slots n+1, n+3, ...
    d[0].copy_comment = d[1].copy_comment = copy_comment;
    d[0].ks = ks;
    d[1].ks = ks2;
    d[0].seq = d[1].seq = &seqs[0];
    d[0].n_bound = d[1].n_bound = (m >> 1) - (n >> 1);
    d[0].start_pos = n;
    d[1].start_pos = n + 1;
    d[0].chunk_size = d[1].chunk_size = (chunk_size - chunk_size_narrow - size) >> 1;

    for (t = 0; t < 2; ++t)
        started[t] = (pthread_create(&tid[t], 0, thread_bseq_read, &d[t]) == 0);
    for (t = 0; t < 2; ++t)
        if (!started[t]) // thread could not be created: do its work here instead
            thread_bseq_read(&d[t]);
    for (t = 0; t < 2; ++t)
        if (started[t]) // never join a tid that pthread_create() did not fill in
            pthread_join(tid[t], 0);

    size += d[0].ret_size + d[1].ret_size;

    // The two threads may have stored a different number of reads: top up the
    // shorter side so that every read has its mate.
    if (d[0].ret_n != d[1].ret_n) {
        int lag = (d[0].ret_n < d[1].ret_n) ? 0 : 1; // side that is behind
        kseq_t *lag_ks = (lag == 0) ? ks : ks2;
        int missing = d[1 - lag].ret_n - d[lag].ret_n;
        int offset = n + lag + d[lag].ret_n * 2;
        while (missing > 0 && kseq_read(lag_ks) >= 0) {
            trim_readno(&lag_ks->name);
            kseq2bseq1(lag_ks, &seqs[offset], copy_comment);
            seqs[offset].id = offset;
            size += seqs[offset].l_seq;
            offset += 2;
            ++d[lag].ret_n;
            --missing;
        }
        if (missing > 0) {
            // The lagging file ran out: only count complete pairs instead of
            // reporting slots that were never filled.
            if (lag == 0)
                fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
            else
                fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
            d[1 - lag].ret_n = d[lag].ret_n;
        }
    }
    n += d[0].ret_n + d[1].ret_n;

    if (size == 0 && kseq_read(ks2) >= 0) { // test if the 2nd file is finished
        fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
    } else if (size < chunk_size && d[0].ret_status > 0 && d[1].ret_status > 0) {
        // The array filled up before chunk_size was reached: keep reading pairs.
        while (kseq_read(ks) >= 0) {
            if (kseq_read(ks2) < 0) { // the 2nd file has fewer reads
                fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
                break;
            }
            // Two slots are written per iteration, so grow while fewer than two
            // are free (with an odd capacity, n == m - 1 must grow as well).
            if (n + 1 >= m) {
                int old_m = m;
                m = m ? m << 1 : 256;
                seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t));
                memset(seqs + old_m, 0, (m - old_m) * sizeof(bseq1_t));
            }
            READ_ONE_SEQ(ks);
            READ_ONE_SEQ(ks2);
            if (size >= chunk_size && (n & 1) == 0)
                break;
        }
        if (size == 0) { // test if the 2nd file is finished
            if (kseq_read(ks2) >= 0)
                fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
        }
    }
    *n_ = n;
    *size_ = size;
    if (m > *m_)
        *m_ = m;
    *seqs_ptr = seqs;
}
void bseq_read(int chunk_size, int* n_, void* ks1_, void* ks2_, int copy_comment, int64_t* size_, int* m_, bseq1_t** seqs_ptr) {
// using multi-thread reading
if (ks2_) return bseq_read_pe_mt(chunk_size, n_, ks1_, ks2_, copy_comment, size_, m_, seqs_ptr);
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
int size = 0, m, n;
bseq1_t *seqs;
m = n = 0; seqs = 0;
@ -91,24 +259,20 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
m = m? m<<1 : 256;
seqs = realloc(seqs, m * sizeof(bseq1_t));
}
trim_readno(&ks->name);
kseq2bseq1(ks, &seqs[n]);
seqs[n].id = n;
size += seqs[n++].l_seq;
if (ks2) {
trim_readno(&ks2->name);
kseq2bseq1(ks2, &seqs[n]);
seqs[n].id = n;
size += seqs[n++].l_seq;
}
READ_ONE_SEQ(ks);
if (ks2) {
READ_ONE_SEQ(ks2);
}
if (size >= chunk_size && (n&1) == 0) break;
}
if (size == 0) { // test if the 2nd file is finished
if (ks2 && kseq_read(ks2) >= 0)
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
}
*n_ = n;
return seqs;
*n_ = n;
*size_ = size;
if (m > *m_) *m_ = m;
*seqs_ptr = seqs;
}
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])

23
bwa.h
View File

@ -28,8 +28,11 @@
#define BWA_H_
#include <stdint.h>
#include "bntseq.h"
#include "bwt.h"
#include "kstring.h"
#include "hyb_idx.h"
#define BWA_IDX_BWT 0x1
#define BWA_IDX_BNS 0x2
@ -49,17 +52,24 @@ typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
HybridIndex *hyb; // Hybrid index
int is_shm;
int64_t l_mem;
int is_shm;
int64_t l_mem;
uint8_t *mem;
} bwaidx_t;
typedef struct {
int l_seq, id;
char *name, *comment, *seq, *qual, *sam;
int l_seq, id;
int m_name, m_comment, m_seq, m_qual;
char *name, *comment, *seq, *qual;
kstring_t sam;
} bseq1_t;
// Wrapper around one kstring_t named `sam`.
// NOTE(review): presumably the per-read SAM output buffer (mem_reg2sam() resets sam.l and appends to it) - confirm against the callers.
typedef struct {
kstring_t sam;
} seq_sam_t;
extern int bwa_verbose, bwa_dbg;
extern char bwa_rg_id[256];
@ -67,8 +77,9 @@ extern char bwa_rg_id[256];
extern "C" {
#endif
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
// bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
void bseq_read(int chunk_size, int* n_, void* ks1_, void* ks2_, int copy_comment, int64_t* size_, int* m_, bseq1_t** seqs_ptr);
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);

683
bwamem.c
View File

@ -41,6 +41,7 @@
#include "kvec.h"
#include "ksort.h"
#include "utils.h"
#include "hyb_idx.h"
#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
@ -110,7 +111,7 @@ mem_opt_t *mem_opt_init()
o->use_bwt = 0;
o->skip_entire_match = 0;
o->batch_size = 256;
return o;
}
@ -121,17 +122,23 @@ mem_opt_t *mem_opt_init()
#define intv_lt(a, b) ((a).info < (b).info)
KSORT_INIT(mem_intv, bwtintv_t, intv_lt)
typedef struct {
bwtintv_v mem, mem1, *tmpv[2];
} smem_aux_t;
static smem_aux_t *smem_aux_init()
{
smem_aux_t *a;
a = calloc(1, sizeof(smem_aux_t));
a->tmpv[0] = calloc(1, sizeof(bwtintv_v));
a->tmpv[1] = calloc(1, sizeof(bwtintv_v));
return a;
a->sw_buf = (buf_t*)calloc(1, sizeof(buf_t));
a->seq_buf = (buf_t*)calloc(1, sizeof(buf_t));
a->byte_seq = (byte_v*)calloc(1, sizeof(byte_v));
a->reverse_seq = (byte_v*)calloc(1, sizeof(byte_v));
a->for_bits = (byte_v*)calloc(1, sizeof(byte_v));
a->back_bits = (byte_v*)calloc(1, sizeof(byte_v));
kv_resize(uint8_t, *a->byte_seq, HYB_MAX_SEQ_LEN);
kv_resize(uint8_t, *a->reverse_seq, HYB_MAX_SEQ_LEN);
kv_resize(uint8_t, *a->for_bits, HYB_MAX_SEQ_LEN);
kv_resize(uint8_t, *a->back_bits, HYB_MAX_SEQ_LEN);
return a;
}
static void smem_aux_destroy(smem_aux_t *a)
@ -142,13 +149,41 @@ static void smem_aux_destroy(smem_aux_t *a)
free(a);
}
static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a)
{
int i, k, x = 0, old_n;
// 初始化线程需要的数据
mem_worker_t* init_mem_worker(const mem_opt_t* opt, const bwt_t *bwt, const HybridIndex* hyb, const bntseq_t* bns, const uint8_t* pac) {
int i = opt->n_threads, j;
mem_worker_t *w = (mem_worker_t *)calloc(1, sizeof(mem_worker_t));
w->opt = opt; w->bwt = bwt; w->hyb = hyb; w->bns = bns; w->pac = pac;
w->calc_isize = 0; w->n = 0; w->regs = 0;
w->aux = (smem_aux_t**)malloc(i * sizeof(smem_aux_t*));
w->smem_arr = (smem_v **)malloc(i * sizeof(smem_v *));
w->chain_arr = (mem_chain_v **)malloc(i * sizeof(mem_chain_v *));
w->isize_arr = (uint64_v **)malloc(i * sizeof(uint64_v *));
w->seed_arr = (HybSeedArr **)malloc(i * sizeof(HybSeedArr*));
for (i = 0; i < opt->n_threads; ++i) {
w->aux[i] = smem_aux_init();
w->smem_arr[i] = (smem_v*)malloc(opt->batch_size * sizeof(smem_v));
w->chain_arr[i] = (mem_chain_v*)malloc(opt->batch_size * sizeof(mem_chain_v));
w->isize_arr[i] = (uint64_v *)calloc(4, sizeof(uint64_v));
w->seed_arr[i] = (HybSeedArr *)malloc(opt->batch_size * sizeof(HybSeedArr));
for (j = 0; j < opt->batch_size; ++j) {
kv_init(w->smem_arr[i][j].mem);
kv_init(w->smem_arr[i][j].pos_arr);
kv_init(w->chain_arr[i][j]);
kv_init(w->seed_arr[i][j]);
}
}
return w;
}
// seeding
static void mem_collect_intv(const mem_opt_t* opt, const bwt_t* bwt, int len, const uint8_t* seq, smem_v* smem, smem_aux_t* a, int tid) {
int i, k, x = 0, old_n;
int start_width = 1;
int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
a->mem.n = 0;
// first pass: find all SMEMs
smem->mem.n = 0;
// first pass: find all SMEMs
while (x < len) {
if (seq[x] < 4) {
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
@ -156,21 +191,21 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
bwtintv_t *p = &a->mem1.a[i];
int slen = (uint32_t)p->info - (p->info>>32); // seed length
if (slen >= opt->min_seed_len)
kv_push(bwtintv_t, a->mem, *p);
}
kv_push(bwtintv_t, smem->mem, *p);
}
} else ++x;
}
// second pass: find MEMs inside a long SMEM
old_n = a->mem.n;
for (k = 0; k < old_n; ++k) {
bwtintv_t *p = &a->mem.a[k];
old_n = smem->mem.n;
for (k = 0; k < old_n; ++k) {
bwtintv_t *p = &smem->mem.a[k];
int start = p->info>>32, end = (int32_t)p->info;
if (end - start < split_len || p->x[2] > opt->split_width) continue;
bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i)
if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
}
kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
}
// third pass: LAST-like
if (opt->max_mem_intv > 0) {
x = 0;
@ -179,39 +214,120 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
if (1) {
bwtintv_t m;
x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
if (m.x[2] > 0) kv_push(bwtintv_t, smem->mem, m);
} else { // for now, we never come to this block which is slower
x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
for (i = 0; i < a->mem1.n; ++i)
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
kv_push(bwtintv_t, smem->mem, a->mem1.a[i]);
}
} else ++x;
}
}
// sort
ks_introsort(mem_intv, a->mem.n, a->mem.a);
ks_introsort(mem_intv, smem->mem.n, smem->mem.a);
}
// Collect the seed intervals of one query into smemv->mem using the FM-index
// and reset smemv->pos_arr. Queries shorter than opt->min_seed_len cannot
// contain a seed, so smemv is left untouched for them.
void find_smem(const mem_opt_t* opt, const bwt_t* bwt, int len, const uint8_t* seq, smem_aux_t* aux, smem_v* smemv, int tid) {
    if (len >= opt->min_seed_len) {
        mem_collect_intv(opt, bwt, len, seq, smemv, aux, tid);
        smemv->pos_arr.n = 0;
    }
}
// hybrid-index-based seeding
// Strict ordering for HybSeed: ascending seed_start, ties broken by ascending seed_end.
// Each argument is evaluated more than once, so pass side-effect-free expressions only.
#define hyb_seed_lt(a, b) ((a).seed_start == (b).seed_start ? (a).seed_end < (b).seed_end : (a).seed_start < (b).seed_start)
// Instantiate the ksort routines for HybSeed under the name hyb_seed; used as ks_introsort(hyb_seed, ...).
KSORT_INIT(hyb_seed, HybSeed, hyb_seed_lt)
/*
 * Hybrid-index seeding for one read. The result replaces the contents of
 * `seeds` and is sorted by (seed_start, seed_end).
 *
 *   opt           alignment options (min_seed_len, split_factor, split_width,
 *                 max_mem_intv are used here)
 *   hyb           hybrid index handed to the three seeding passes
 *   read_seq      the read, handed to the three seeding passes
 *   read_ranges   ranges of the read to seed; ranges shorter than
 *                 opt->min_seed_len are skipped
 *   seeds_ranges  out: for every seeded range i, a[i].start / a[i].end record
 *                 which part of `seeds` the first pass produced for it
 *                 (assumes it already holds at least read_ranges->n entries -
 *                 TODO confirm against the caller)
 *   seeds         out: seeds found by all passes
 *   seq_id        id of the read; only referenced by the disabled debug dump
 *   tid           thread id, used to index the per-thread profiling counters
 */
static void hyb_seeding(const mem_opt_t* opt, const HybridIndex* hyb, ReadSeq* read_seq, RangeArr* read_ranges, RangeArr* seeds_ranges,
                        HybSeedArr* seeds, uint64_t seq_id, int tid) {
    int i = 0;
    int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
    seeds->n = 0;

    // 1. seeding-1: find all SMEMs
    PROF_START(seed_1);
    for (i = 0; i < read_ranges->n; ++i) {
        Range range = kv_A(*read_ranges, i);
        if (range.len < opt->min_seed_len)
            continue;
        seeds_ranges->a[i].start = seeds->n;
        hyb_first_seeding(hyb, read_seq, &range, opt->min_seed_len, seeds, tid);
        seeds_ranges->a[i].end = seeds->n;
    }
    tprof[T_SEED_LEN][tid] += seeds->n;
    PROF_END(tprof[T_SEED_1][tid], seed_1);

#if 1
    // 2. seeding-2: find MEMs inside a long SMEM
    PROF_START(seed_2);
    int pre_pivot = 0;
    int old_n = seeds->n; // only seeds of the first pass are re-seeded
    int pre_start = old_n, pre_end = old_n, pre_n = old_n;
    for (i = 0; i < old_n; ++i) {
        // The seed's fields are read as call arguments before the callee can
        // append to (and possibly reallocate) `seeds`; `seed` is not used afterwards.
        HybSeed* seed = &kv_A(*seeds, i);
        int start = seed->seed_start, end = seed->seed_end;
        if (end - start < split_len || seed->ref_pos_arr.n > opt->split_width)
            continue;
        pre_n = seeds->n;
        if (seed->ref_pos_arr.n == 1) {
            // Unique seed: pass on the pivot and the seed range produced by the
            // previous unique seed, and remember this call's result for the next one.
            pre_pivot = hyb_second_seeding(hyb, read_seq, start, end, seed->read_start, seed->read_end, seed->ref_pos_arr.a[0],
                                           seed->ref_pos_arr.n + 1, pre_pivot, pre_start, pre_end, opt->min_seed_len, seeds, tid);
            pre_start = pre_n;
            pre_end = seeds->n;
        } else {
            hyb_second_seeding(hyb, read_seq, start, end, seed->read_start, seed->read_end, seed->ref_pos_arr.a[0], seed->ref_pos_arr.n + 1, 0, 0, 0,
                               opt->min_seed_len, seeds, tid);
        }
    }
    PROF_END(tprof[T_SEED_2][tid], seed_2);
#endif

#if 1
    // 3. seeding-3: LAST-like
    old_n = seeds->n;
    PROF_START(seed_3);
    if (opt->max_mem_intv > 0) {
        for (i = 0; i < read_ranges->n; ++i) {
            Range range = kv_A(*read_ranges, i);
            if (range.len < opt->min_seed_len)
                continue;
            Range seeds_range = kv_A(*seeds_ranges, i);
            hyb_third_seeding(hyb, read_seq, &range, &seeds_range, opt->min_seed_len, opt->max_mem_intv, seeds, tid);
        }
    }
    PROF_END(tprof[T_SEED_3][tid], seed_3);
#endif

#if 0
    {
        FILE *fp = gf[1];
        int j;
        // fprintf(fp, "%ld ", seq_id);
        for (i = 0; i < seeds->n; ++i) {
            HybSeed *seed = &kv_A(*seeds, i);
            fprintf(fp, "s:%d e:%d n:%ld ", seed->seed_start, seed->seed_end, seed->ref_pos_arr.n);
            for (j = 0; j < seed->ref_pos_arr.n; ++j) {
                fprintf(fp, "%ld ", seed->ref_pos_arr.a[j]);
            }
            fprintf(fp, "\n");
        }
        fprintf(fp, "\n");
        // fprintf(fp, "seq_id:%ld\n", seq_id);
    }
#endif
    ks_introsort(hyb_seed, kv_size(*seeds), seeds->a);
}
/************
* Chaining *
************/
typedef struct {
int64_t rbeg;
int32_t qbeg, len;
int score;
} mem_seed_t; // unaligned memory
typedef struct {
int n, m, first, rid;
uint32_t w:29, kept:2, is_alt:1;
float frac_rep;
int64_t pos;
mem_seed_t *seeds;
} mem_chain_t;
typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
#include "kbtree.h"
#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
@ -279,30 +395,25 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
}
}
mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf)
{
int i, b, e, l_rep;
void generate_chain(const mem_opt_t* opt, const bwt_t* bwt, const bntseq_t* bns, int len, const uint8_t* seq, bwtintv_v mem, mem_chain_v* chain, int tid) {
int i, b, e, l_rep;
int64_t l_pac = bns->l_pac;
mem_chain_v chain;
kbtree_t(chn) *tree;
smem_aux_t *aux;
chain->n = 0;
kv_init(chain);
if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
tree = kb_init(chn, KB_DEFAULT_SIZE);
aux = buf? (smem_aux_t*)buf : smem_aux_init();
mem_collect_intv(opt, bwt, len, seq, aux);
for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep
bwtintv_t *p = &aux->mem.a[i];
for (i = 0, b = e = l_rep = 0; i < mem.n; ++i) { // compute frac_rep
bwtintv_t *p = &mem.a[i];
int sb = (p->info>>32), se = (uint32_t)p->info;
if (p->x[2] <= opt->max_occ) continue;
if (sb > e) l_rep += e - b, b = sb, e = se;
else e = e > se? e : se;
}
l_rep += e - b;
for (i = 0; i < aux->mem.n; ++i) {
bwtintv_t *p = &aux->mem.a[i];
for (i = 0; i < mem.n; ++i) {
bwtintv_t *p = &mem.a[i];
int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
int64_t k;
// if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
@ -330,19 +441,78 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
}
}
}
if (buf == 0) smem_aux_destroy(aux);
if (chain->m < kb_size(tree)) {
kv_resize(mem_chain_t, *chain, kb_size(tree));
}
kv_resize(mem_chain_t, chain, kb_size(tree));
#define traverse_func(p_) (chain->a[chain->n++] = *(p_))
__kb_traverse(mem_chain_t, tree, traverse_func);
#undef traverse_func
for (i = 0; i < chain->n; ++i) chain->a[i].frac_rep = (float)l_rep / len;
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
kb_destroy(chn, tree);
}
#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
__kb_traverse(mem_chain_t, tree, traverse_func);
#undef traverse_func
// Build chains from hybrid-index seeds.
// Every sampled reference position of every seed is either merged into the
// closest existing chain (test_and_merge) or starts a new chain in a B-tree
// keyed by reference position; the tree is then flattened into `chain`.
//   opt    options; min_seed_len and max_occ are read here, opt is also passed to test_and_merge
//   bns    reference metadata: l_pac, rid lookup and the is_alt annotation
//   len    query length; queries shorter than opt->min_seed_len yield no chain
//   seeds  input seeds (the frac_rep scan below merges intervals in one pass, so it
//          presumably expects them ordered by seed_start - confirm against the caller)
//   chain  out: reset to empty, grown if needed, then filled with the chains
// hyb, seq and tid are not referenced in this body.
void hyb_generate_chain(const mem_opt_t *opt, const HybridIndex *hyb, const bntseq_t *bns, int len, const uint8_t *seq,
HybSeedArr *seeds, mem_chain_v *chain, int tid) {
int i, b, e, l_rep;
int64_t l_pac = bns->l_pac;
kbtree_t(chn) * tree;
chain->n = 0;
if (len < opt->min_seed_len) return; // if the query is shorter than the seed length, no match
tree = kb_init(chn, KB_DEFAULT_SIZE);
// l_rep: number of query bases covered by seeds that have more than max_occ reference positions
for (i = 0, b = e = l_rep = 0; i < seeds->n; ++i) { // compute frac_rep
HybSeed *seed = &kv_A(*seeds, i);
int sb = seed->seed_start, se = seed->seed_end;
if (seed->ref_pos_arr.n <= opt->max_occ) continue;
if (sb > e) l_rep += e - b, b = sb, e = se;
else e = e > se ? e : se;
}
l_rep += e - b;
for (i = 0; i < seeds->n; ++i) {
HybSeed *seed = &kv_A(*seeds, i);
int step, count; // step: sampling stride; count: positions taken so far
int64_t k;
// seeds with more than max_occ positions are sub-sampled to at most max_occ of them
step = seed->ref_pos_arr.n > opt->max_occ ? seed->ref_pos_arr.n / opt->max_occ : 1;
for (k = count = 0; k < seed->ref_pos_arr.n && count < opt->max_occ; k += step, ++count) {
mem_chain_t tmp, *lower, *upper;
mem_seed_t s;
int rid, to_add = 0;
s.rbeg = tmp.pos = seed->ref_pos_arr.a[k];
s.qbeg = seed->seed_start;
s.len = seed->seed_end - seed->seed_start;
s.score = s.len;
rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len);
if (rid < 0)
continue; // bridging multiple reference sequences or the forward-reverse boundary; TODO: split the seed;
// don't discard it!!!
if (kb_size(tree)) {
kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid))
to_add = 1;
} else
to_add = 1;
if (to_add) { // add the seed as a new chain
tmp.n = 1;
tmp.m = 4;
tmp.seeds = (mem_seed_t *)calloc(tmp.m, sizeof(mem_seed_t));
tmp.seeds[0] = s;
tmp.rid = rid;
tmp.is_alt = !!bns->anns[rid].is_alt;
kb_putp(chn, tree, &tmp);
}
}
}
// make room for every chain in the tree before copying them out in tree order
if (chain->m < kb_size(tree)) {
kv_resize(mem_chain_t, *chain, kb_size(tree));
}
#define traverse_func(p_) (chain->a[chain->n++] = *(p_))
__kb_traverse(mem_chain_t, tree, traverse_func);
#undef traverse_func
for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len;
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
kb_destroy(chn, tree);
return chain;
// every chain of this read gets the same repetitive fraction
for (i = 0; i < chain->n; ++i) chain->a[i].frac_rep = (float)l_rep / len;
if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
kb_destroy(chn, tree);
}
/********************
@ -660,15 +830,16 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
#define MAX_BAND_TRY 2
void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av, void *buf, int tid)
{
int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension
int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0;
const mem_seed_t *s;
uint8_t *rseq = 0;
uint64_t *srt;
smem_aux_t* aux = (smem_aux_t*)buf;
if (c->n == 0) return;
if (c->n == 0) return;
// get the max possible span
rmax[0] = l_pac<<1; rmax[1] = 0;
for (i = 0; i < c->n; ++i) {
@ -744,23 +915,30 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name);
if (s->qbeg) { // left extension
uint8_t *rs, *qs;
int qle, tle, gtle, gscore;
qs = malloc(s->qbeg);
for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
tmp = s->rbeg - rmax[0];
rs = malloc(tmp);
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
#ifndef USE_AVX2_EXT
uint8_t *rs, *qs;
qs = malloc(s->qbeg);
for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
rs = malloc(tmp);
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
#endif
for (i = 0; i < MAX_BAND_TRY; ++i) {
int prev = a->score;
aw[0] = opt->w << i;
if (bwa_verbose >= 4) {
int j;
printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n');
printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n');
printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rseq[tmp - 1 - j]]); putchar('\n');
printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)query[s->qbeg - 1 - j]]); putchar('\n');
}
a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
#ifndef USE_AVX2_EXT
a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
#else
a->score = ksw_extend2_avx2(s->qbeg, query, tmp, rseq, 1, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b,
aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0], aux->sw_buf);
#endif
if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
}
// check whether we prefer to reach the end of the query
@ -771,7 +949,9 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
a->qb = 0, a->rb = s->rbeg - gtle;
a->truesc = gscore;
}
free(qs); free(rs);
#ifndef USE_AVX2_EXT
free(qs); free(rs);
#endif
} else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
if (s->qbeg + s->len != l_query) { // right extension
@ -787,7 +967,11 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n');
printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n');
}
a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
#ifndef USE_AVX2_EXT
a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
#else
a->score = ksw_extend2_avx2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 0, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->a, opt->b, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1], aux->sw_buf);
#endif
if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
}
@ -1035,10 +1219,9 @@ void mem_reorder_primary5(int T, mem_alnreg_v *a)
}
// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
{
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
kstring_t str;
void mem_reg2sam(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, bseq1_t* s, mem_alnreg_v* a, int extra_flag, const mem_aln_t* m, seq_sam_t* ss) {
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
// kstring_t str;
kvec_t(mem_aln_t) aa;
int k, l;
char **XA = 0;
@ -1046,8 +1229,9 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
if (!(opt->flag & MEM_F_ALL))
XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
kv_init(aa);
str.l = str.m = 0; str.s = 0;
for (k = l = 0; k < a->n; ++k) {
// str.l = str.m = 0; str.s = 0;
ss->sam.l = 0;
for (k = l = 0; k < a->n; ++k) {
mem_alnreg_t *p = &a->a[k];
mem_aln_t *q;
if (p->score < opt->T) continue;
@ -1069,58 +1253,19 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
mem_aln_t t;
t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0);
t.flag |= extra_flag;
mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m);
} else {
for (k = 0; k < aa.n; ++k)
mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m);
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
mem_aln2sam(opt, bns, &ss->sam, s, 1, &t, 0, m);
} else {
for (k = 0; k < aa.n; ++k) mem_aln2sam(opt, bns, &ss->sam, s, aa.n, aa.a, k, m);
for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
free(aa.a);
}
s->sam = str.s;
// s->sam = str.s;
if (XA) {
for (k = 0; k < a->n; ++k) free(XA[k]);
free(XA);
}
}
mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf)
{
int i;
mem_chain_v chn;
mem_alnreg_v regs;
for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq, buf);
chn.n = mem_chain_flt(opt, chn.n, chn.a);
mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chn.n, chn.a);
if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
kv_init(regs);
for (i = 0; i < chn.n; ++i) {
mem_chain_t *p = &chn.a[i];
if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i);
mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, &regs);
free(chn.a[i].seeds);
}
free(chn.a);
regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a);
if (bwa_verbose >= 4) {
err_printf("* %ld chains remain after removing duplicated chains\n", regs.n);
for (i = 0; i < regs.n; ++i) {
mem_alnreg_t *p = &regs.a[i];
printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
}
}
for (i = 0; i < regs.n; ++i) {
mem_alnreg_t *p = &regs.a[i];
if (p->rid >= 0 && bns->anns[p->rid].is_alt)
p->is_alt = 1;
}
return regs;
}
mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
{
mem_aln_t a;
@ -1193,77 +1338,267 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
return a;
}
// Shared state handed to the kt_for() workers (worker1/worker2) of the original
// per-read pipeline. One instance is filled in by mem_process_seqs() per batch.
typedef struct {
const mem_opt_t *opt; // alignment options
const bwt_t *bwt; // BWT/FM index passed to mem_align1_core()
const bntseq_t *bns; // reference sequence annotations
const uint8_t *pac; // packed reference passed to the alignment routines
const mem_pestat_t *pes; // insert-size statistics (4 entries, see mem_pestat())
smem_aux_t **aux; // one seeding scratch buffer per thread, indexed by tid
bseq1_t *seqs; // reads of the current batch
mem_alnreg_v *regs; // alignment regions, one vector per read
int64_t n_processed; // reads processed before this batch; used to derive per-read ids
} worker_t;
static void worker1(void *data, long i, int tid)
{
worker_t *w = (worker_t*)data;
if (!(w->opt->flag&MEM_F_PE)) {
if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name);
w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]);
} else {
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name);
w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]);
if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name);
w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]);
}
/*
 * Return the score of the first hit (after the top one, r->a[0]) whose query
 * interval overlaps the top hit by at least opt->mask_level of the shorter of
 * the two intervals. If no such hit exists, fall back to
 * opt->min_seed_len * opt->a.
 */
static inline int cal_sub(const mem_opt_t* opt, mem_alnreg_v* r) {
    int k = 1;
    while (k < r->n) { // choose unique alignment
        int ovl_beg = r->a[0].qb;
        int ovl_end = r->a[0].qe;
        int shorter = r->a[0].qe - r->a[0].qb;
        int len_k = r->a[k].qe - r->a[k].qb;
        if (r->a[k].qb > ovl_beg)
            ovl_beg = r->a[k].qb;
        if (r->a[k].qe < ovl_end)
            ovl_end = r->a[k].qe;
        if (ovl_end > ovl_beg) { // have overlap
            if (len_k < shorter)
                shorter = len_k;
            if (ovl_end - ovl_beg >= shorter * opt->mask_level)
                break; // significant overlap
        }
        ++k;
    }
    if (k < r->n)
        return r->a[k].score;
    return opt->min_seed_len * opt->a;
}
static void worker2(void *data, long i, int tid)
/*
 * Infer the relative orientation of two mapped positions and their distance.
 *
 * Coordinates >= l_pac are treated as lying on the reverse strand. b2 is first
 * projected onto the strand of b1; the absolute distance between b1 and that
 * projected coordinate is stored in *dist. The return value is in 0..3 and is
 * used by callers as an index into the four per-orientation arrays.
 */
static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t* dist) {
    const int same_strand = (b1 >= l_pac) == (b2 >= l_pac);
    // coordinate of read 2 on the read 1 strand
    const int64_t mate = same_strand ? b2 : (l_pac << 1) - 1 - b2;
    const int mate_after = mate > b1;

    if (mate_after)
        *dist = mate - b1;
    else
        *dist = b1 - mate;
    return (same_strand ? 0 : 1) ^ (mate_after ? 0 : 3);
}
// Main MEM processing flow for one batch of reads, executed by a single thread.
//
// For the nseq reads in seq_arr the function runs, stage by stage over the
// whole batch: (1) seeding, (2) chaining + chain filtering, (3) extension of
// chains into alignment regions, and (4) optionally the collection of
// insert-size candidates for read pairs.
//
// opt->use_bwt selects the seeding/chaining back end: the BWT path
// (find_smem / generate_chain) or the hybrid-index path (hyb_seeding /
// hyb_generate_chain). seed_arr is an untyped per-read seed buffer that is
// cast to smem_v* or HybSeedArr* accordingly.
//
// Side effects visible here: every seq_arr[i].seq is rewritten in place to the
// numeric encoding via nst_nt4_table; chain_arr[i] is filled and then freed
// again in stage 3; reg_arr[i] is (re)initialised and filled with the result;
// when calc_isize is set, accepted insert sizes are appended to isize[dir].
// tid is only used to select the per-thread profiling slot and is forwarded to
// the helpers.
void mem_core_process(const mem_opt_t* opt, const bwt_t* bwt, const HybridIndex* hyb, const bntseq_t* bns, const uint8_t* pac, bseq1_t* seq_arr,
int nseq, smem_aux_t* aux, void* seed_arr, mem_chain_v* chain_arr, mem_alnreg_v* reg_arr, int calc_isize, int64_t l_pac,
uint64_v* isize, int tid) {
int i, j, l_seq;
mem_chain_v* chnp;
mem_alnreg_v* regp;
char* seq;
if (opt->use_bwt) {
smem_v *smem_arr = (smem_v*)seed_arr;
// 1. seeding
PROF_START(seed_all);
for (i = 0; i < nseq; ++i) {
seq = seq_arr[i].seq;
l_seq = seq_arr[i].l_seq;
// convert to the numeric encoding unless the base is already < 4
for (j = 0; j < l_seq; ++j) {
seq[j] = seq[j] < 4 ? seq[j] : nst_nt4_table[(int)seq[j]];
}
find_smem(opt, bwt, l_seq, (uint8_t*)seq, aux, &smem_arr[i], tid);
}
PROF_END(tprof[T_SEED_ALL][tid], seed_all);
// 2. chain
PROF_START(chain_all);
for (i = 0; i < nseq; ++i) {
seq = seq_arr[i].seq;
l_seq = seq_arr[i].l_seq;
chnp = chain_arr + i;
PROF_START(gen_chain);
generate_chain(opt, bwt, bns, l_seq, (uint8_t*)seq, smem_arr[i].mem, chnp, tid);
PROF_END(tprof[T_GEN_CHAIN][tid], gen_chain);
PROF_START(flt_chain);
chnp->n = mem_chain_flt(opt, chnp->n, chnp->a);
PROF_END(tprof[T_FLT_CHAIN][tid], flt_chain);
PROF_START(flt_chained_seeds);
mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chnp->n, chnp->a);
PROF_END(tprof[T_FLT_CHANNED_SEEDS][tid], flt_chained_seeds);
if (bwa_verbose >= 4) mem_print_chain(bns, chnp);
}
PROF_END(tprof[T_CHAIN_ALL][tid], chain_all);
} else {
HybSeedArr* seeds = (HybSeedArr*)seed_arr;
// 1. seeding
PROF_START(seed_all);
// Scratch vectors reused for every read of the batch and destroyed after the loop.
RangeArr read_ranges = {0};
RangeArr seeds_ranges = {0};
Range init_range = {0};
for (i = 0; i < nseq; ++i) {
// NOTE(review): these per-thread buffers are written up to index l_seq - 1 below;
// nothing here grows them — confirm they are sized for the longest read elsewhere.
uint8_t* reverse_seq = aux->reverse_seq->a;
uint8_t* for_bits = aux->for_bits->a;
uint8_t* back_bits = aux->back_bits->a;
read_ranges.n = 0;
seeds_ranges.n = 0;
int last_N = -1; // index of the most recent ambiguous base (code >= 4), -1 if none yet
seq = seq_arr[i].seq;
l_seq = seq_arr[i].l_seq;
// Encode the read, build its reverse complement and split it into maximal
// runs that contain no ambiguous base.
for (j = 0; j < l_seq; ++j) {
seq[j] = (uint8_t)(seq[j] < 4 ? seq[j] : nst_nt4_table[(int)seq[j]]);
if (seq[j] >= 4) { // N
reverse_seq[l_seq - 1 - j] = seq[j];
if (last_N + 1 < j) {
// non-empty run [last_N + 1, j); third initializer is its length
const Range range = {last_N + 1, j, j - last_N - 1};
kv_push(Range, read_ranges, range);
kv_push(Range, seeds_ranges, init_range);
}
last_N = j;
} else {
reverse_seq[l_seq - 1 - j] = 3 - seq[j];
}
}
// trailing run after the last ambiguous base (j == l_seq here)
if (last_N + 1 < j) {
const Range range = {last_N + 1, j, j - last_N - 1};
kv_push(Range, read_ranges, range);
kv_push(Range, seeds_ranges, init_range);
}
create_seq_fb_bits((uint8_t*)seq, l_seq, for_bits, back_bits);
ReadSeq read_seq = {l_seq, (uint8_t*)seq, reverse_seq, for_bits, back_bits, aux->seq_id};
// NOTE(review): read_seq captures seq_id before the increment while hyb_seeding()
// receives the incremented value — confirm this off-by-one is intended.
++aux->seq_id;
hyb_seeding(opt, hyb, &read_seq, &read_ranges, &seeds_ranges, &seeds[i], aux->seq_id, tid);
}
kv_destroy(read_ranges);
kv_destroy(seeds_ranges);
PROF_END(tprof[T_SEED_ALL][tid], seed_all);
// 2. chain
PROF_START(chain_all);
for (i = 0; i < nseq; ++i) {
seq = seq_arr[i].seq;
l_seq = seq_arr[i].l_seq;
chnp = chain_arr + i;
PROF_START(gen_chain);
hyb_generate_chain(opt, hyb, bns, l_seq, (uint8_t*)seq, &seeds[i], chnp, tid);
PROF_END(tprof[T_GEN_CHAIN][tid], gen_chain);
PROF_START(flt_chain);
chnp->n = mem_chain_flt(opt, chnp->n, chnp->a);
PROF_END(tprof[T_FLT_CHAIN][tid], flt_chain);
PROF_START(flt_chained_seeds);
mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chnp->n, chnp->a);
PROF_END(tprof[T_FLT_CHANNED_SEEDS][tid], flt_chained_seeds);
if (bwa_verbose >= 4)
mem_print_chain(bns, chnp);
}
PROF_END(tprof[T_CHAIN_ALL][tid], chain_all);
}
// 3. align
PROF_START(aln_all);
for (i = 0; i < nseq; ++i) {
seq = seq_arr[i].seq;
l_seq = seq_arr[i].l_seq;
chnp = chain_arr + i;
regp = reg_arr + i;
kv_init(*regp);
for (j = 0; j < chnp->n; ++j) {
mem_chain_t* p = &chnp->a[j];
if (bwa_verbose >= 4)
err_printf("* ---> Processing chain(%d) <---\n", j);
mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, regp, aux, tid);
free(chnp->a[j].seeds);
}
// release the chain array and reset it so the slot can be reused for the next batch
free(chnp->a);
chnp->m = 0;
chnp->a = 0;
regp->n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regp->n, regp->a);
if (bwa_verbose >= 4) {
err_printf("* %ld chains remain after removing duplicated chains\n", regp->n);
for (j = 0; j < regp->n; ++j) {
mem_alnreg_t* p = &regp->a[j];
printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
}
}
// flag regions that fall on an ALT contig
for (j = 0; j < regp->n; ++j) {
mem_alnreg_t* p = &regp->a[j];
if (p->rid >= 0 && bns->anns[p->rid].is_alt)
p->is_alt = 1;
}
}
PROF_END(tprof[T_ALN_ALL][tid], aln_all);
// 4. calc insert size
// A pair contributes only if the sub-optimal score of each end (cal_sub) does not
// exceed MIN_RATIO times its best score.
#define MIN_RATIO 0.8
if (calc_isize) {
PROF_START(ins_size);
// reads 2i and 2i+1 of the batch form a pair
for (i = 0; i < nseq >> 1; ++i) {
int dir;
int64_t is;
mem_alnreg_v* r[2];
r[0] = (mem_alnreg_v*)&reg_arr[i << 1 | 0];
r[1] = (mem_alnreg_v*)&reg_arr[i << 1 | 1];
if (r[0]->n == 0 || r[1]->n == 0)
continue;
if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score)
continue;
if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score)
continue;
if (r[0]->a[0].rid != r[1]->a[0].rid)
continue; // not on the same chr
dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
// keep non-zero distances up to opt->max_ins, bucketed by orientation
if (is && is <= opt->max_ins)
kv_push(uint64_t, isize[dir], is);
}
PROF_END(tprof[T_INS_SIZE][tid], ins_size);
}
}
/*
 * kt_for() worker: run seeding, chaining and extension for batch number `i`.
 *
 * Batch i covers reads [i * batch_size, min((i + 1) * batch_size, n_reads)) of
 * w->seqs; results go to the matching slice of w->regs. Per-thread scratch
 * (aux, seed buffer, chain buffer, insert-size vectors) is selected by `tid`.
 *
 * NOTE(review): w->smem_arr[tid] is passed as the seed buffer in both modes and
 * mem_core_process() casts it to HybSeedArr* when opt->use_bwt is off — confirm
 * that init_mem_worker() sizes it for that, or whether w->seed_arr[tid] was
 * intended for the hybrid-index path.
 */
static void worker_smem_align(void *data, long i, int tid)
{
    /* The stale 7-argument `extern mem_sam_pe` declaration that used to sit here
     * was unused and no longer matches the function's current signature. */
    mem_worker_t *w = (mem_worker_t*)data;
    int start = i * w->opt->batch_size;
    int end = MIN(start + w->opt->batch_size, w->n_reads);
    mem_core_process(w->opt, w->bwt, w->hyb, w->bns, w->pac, w->seqs + start, end - start, w->aux[tid], w->smem_arr[tid], w->chain_arr[tid], w->regs + start,
                     w->calc_isize, w->bns->l_pac, w->isize_arr[tid], tid);
}
static void worker_sam(void *data, long i, int tid)
{
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
worker_t *w = (worker_t*)data;
mem_worker_t *w = (mem_worker_t*)data;
if (!(w->opt->flag&MEM_F_PE)) {
if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
if (w->opt->flag & MEM_F_PRIMARY5) mem_reorder_primary5(w->opt->T, &w->regs[i]);
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
free(w->regs[i].a);
mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0, &w->sams[i]);
free(w->regs[i].a);
} else {
if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name);
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]);
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1], &w->sams[i<<1], tid);
free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
}
}
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0)
{
extern void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
worker_t w;
void mem_process_seqs(const mem_opt_t* opt, mem_worker_t* w, int64_t n_processed, int n, bseq1_t* seqs, const mem_pestat_t* pes0, seq_sam_t* sams) {
extern void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n);
mem_pestat_t pes[4];
double ctime, rtime;
int i;
int n_batch = (n + opt->batch_size - 1) / opt->batch_size;
ctime = cputime(); rtime = realtime();
global_bns = bns;
w.regs = malloc(n * sizeof(mem_alnreg_v));
w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
w.seqs = seqs; w.n_processed = n_processed;
w.pes = &pes[0];
w.aux = malloc(opt->n_threads * sizeof(smem_aux_t*));
for (i = 0; i < opt->n_threads; ++i)
w.aux[i] = smem_aux_init();
kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
for (i = 0; i < opt->n_threads; ++i)
smem_aux_destroy(w.aux[i]);
free(w.aux);
ctime = cputime(); rtime = realtime();
global_bns = w->bns;
w->opt = opt;
if (w->n < n) {
w->n = n;
w->regs = (mem_alnreg_v*)realloc(w->regs, n * sizeof(mem_alnreg_v));
}
w->seqs = seqs;
w->n_processed = n_processed;
w->sams = sams;
w->n_reads = n;
w->pes = &pes[0];
if ((opt->flag & MEM_F_PE) && !pes0) { // infer insert sizes if not provided
int i, j;
w->calc_isize = 1;
for (i = 0; i < opt->n_threads; ++i)
for (j = 0; j < 4; ++j) w->isize_arr[i][j].n = 0;
}
PROF_START(kernel);
kt_for(opt->n_threads, worker_smem_align, w, n_batch); // find mapping positions
PROF_END(gprof[G_MEM_KERNEL], kernel);
PROF_START(pestat);
if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0
else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data
else mem_pestat(opt, w->bns->l_pac, n, w->isize_arr, pes); // otherwise, infer the insert size distribution from data
}
kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
free(w.regs);
PROF_END(gprof[G_MEM_PESTAT], pestat);
PROF_START(mem_sam);
kt_for(opt->n_threads, worker_sam, w, (opt->flag & MEM_F_PE) ? n >> 1 : n); // generate alignment
PROF_END(gprof[G_MEM_SAM], mem_sam);
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime);
}

View File

@ -27,9 +27,11 @@
#ifndef BWAMEM_H_
#define BWAMEM_H_
#include "bwt.h"
#include "bntseq.h"
#include "bwa.h"
#include "bwt.h"
#include "hyb_idx.h"
#include "utils.h"
#define MEM_MAPQ_COEF 30.0
#define MEM_MAPQ_MAX 60
@ -126,9 +128,68 @@ typedef struct { // This struct is only used for the convenience of API.
int score, sub, alt_sc;
} mem_aln_t;
// A single seed stored inside a chain (see mem_chain_t.seeds).
// NOTE(review): the per-field meanings below are inferred from the names only — confirm against the chaining code.
typedef struct {
int64_t rbeg; // presumably the seed's start coordinate on the reference
int32_t qbeg, len; // presumably the seed's start offset in the query and its length
int score;
} mem_seed_t; // unaligned memory
// A chain: a growable array of seeds plus per-chain bookkeeping.
// The seeds array is heap-allocated; mem_core_process() frees it once the chain
// has been extended.
typedef struct {
int n, m, first, rid; // n/m: presumably used/allocated seed count — TODO confirm; rid: presumably reference sequence id
uint32_t w : 29, kept : 2, is_alt : 1; // packed into one 32-bit word
float frac_rep;
int64_t pos;
mem_seed_t* seeds; // owned seed array
} mem_chain_t;
// Vector of chains for one read. mem_core_process() iterates a[0..n), frees `a`
// after extension and resets m and a to 0.
typedef struct {
size_t n, m; // n: number of chains in use; m: presumably allocated capacity
mem_chain_t* a; // heap-allocated chain array
} mem_chain_v;
typedef kvec_t(uint8_t) byte_v;
typedef kvec_t(byte_v) byte_vv;
// Per-thread scratch state for seeding and extension; one instance is passed to
// mem_core_process() for each worker thread.
typedef struct {
bwtintv_v mem, mem1, *tmpv[2]; // interval vectors used by the BWT seeding path — TODO confirm exact roles
buf_t *sw_buf, *seq_buf;
byte_v* byte_seq;
byte_v* reverse_seq; // receives the reverse complement of the current read (hybrid-index path)
byte_v* for_bits; // filled by create_seq_fb_bits() for the current read
byte_v* back_bits; // filled by create_seq_fb_bits() for the current read
uint64_t seq_id; // running counter, incremented once per read on the hybrid-index path
} smem_aux_t;
// Per-read seeding result of the BWT path: find_smem() fills it and
// generate_chain() consumes the `mem` member.
typedef struct {
bwtintv_v mem; // seed intervals handed to generate_chain()
uint64_v pos_arr; // NOTE(review): not referenced in the code shown here — presumably located positions; confirm
} smem_v;
// Long-lived worker context shared by all kt_for() workers. It is created by
// init_mem_worker() and re-pointed at the current batch by mem_process_seqs().
typedef struct {
int calc_isize; // non-zero: mem_core_process() collects insert-size candidates
const mem_opt_t* opt; // options for the current batch (reassigned on every mem_process_seqs() call)
const bwt_t* bwt; // BWT index, used when opt->use_bwt is set
const HybridIndex* hyb; // hybrid index, used otherwise
const bntseq_t* bns; // reference sequence annotations
const uint8_t* pac; // packed reference
const mem_pestat_t* pes; // insert-size statistics used when writing paired-end SAM
smem_aux_t** aux; // per-thread scratch, indexed by tid
bseq1_t* seqs; // reads of the current batch
seq_sam_t* sams; // per-read SAM output buffers of the current batch
smem_v** smem_arr; // per-thread seed buffers, indexed by tid
HybSeedArr** seed_arr; // NOTE(review): not referenced in the code shown here — presumably per-thread hybrid seed buffers; confirm
mem_chain_v** chain_arr; // per-thread chain buffers, indexed by tid
mem_alnreg_v* regs; // alignment regions, one vector per read; grown with realloc() when a batch exceeds n
uint64_v** isize_arr; // per-thread insert sizes: isize_arr[tid][0..3], one vector per orientation
int64_t n_processed; // reads processed before the current batch
int64_t n; // current capacity of regs (number of reads)
int64_t n_reads; // number of reads in the current batch
} mem_worker_t;
#ifdef __cplusplus
extern "C" {
#endif
mem_worker_t *init_mem_worker(const mem_opt_t *opt, const bwt_t *bwt, const HybridIndex *hyb, const bntseq_t *bns, const uint8_t *pac);
smem_i *smem_itr_init(const bwt_t *bwt);
void smem_itr_destroy(smem_i *itr);
@ -161,9 +222,10 @@ extern "C" {
* @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements,
* corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info.
*/
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
// void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
void mem_process_seqs(const mem_opt_t* opt, mem_worker_t* w, int64_t n_processed, int n, bseq1_t* seqs, const mem_pestat_t* pes0, seq_sam_t* sams);
/**
/**
* Find the aligned regions for one query sequence
*
* Note that this routine does not generate CIGAR. CIGAR should be
@ -207,10 +269,10 @@ extern "C" {
* @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
* @param pes inferred insert size distribution (output)
*/
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
void mem_pestat(const mem_opt_t* opt, int64_t l_pac, int n, uint64_v** isize_arr, mem_pestat_t pes[4]);
#ifdef __cplusplus
}
}
#endif
#endif

View File

@ -101,14 +101,14 @@ const bwtintv_v *smem_next(smem_i *itr)
mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_)
{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence
extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf);
// extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf);
extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
mem_alnreg_v ar;
mem_alnreg_v ar = {0,0,0};
char *seq;
seq = malloc(l_seq);
memcpy(seq, seq_, l_seq); // makes a copy of seq_
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0);
mem_mark_primary_se(opt, ar.n, ar.a, lrand48());
// ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0);
// mem_mark_primary_se(opt, ar.n, ar.a, lrand48());
free(seq);
return ar;
}

View File

@ -69,26 +69,19 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
return j < r->n? r->a[j].score : opt->min_seed_len * opt->a;
}
void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
{
int i, d, max;
void mem_pestat(const mem_opt_t* opt, int64_t l_pac, int n, uint64_v** isize_arr, mem_pestat_t pes[4]) {
int i, j, d, max;
uint64_v isize[4];
memset(pes, 0, 4 * sizeof(mem_pestat_t));
memset(isize, 0, sizeof(kvec_t(int)) * 4);
for (i = 0; i < n>>1; ++i) {
int dir;
int64_t is;
mem_alnreg_v *r[2];
r[0] = (mem_alnreg_v*)&regs[i<<1|0];
r[1] = (mem_alnreg_v*)&regs[i<<1|1];
if (r[0]->n == 0 || r[1]->n == 0) continue;
if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr
dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
}
if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);
for (i = 0; i < opt->n_threads; ++i) {
for (d = 0; d < 4; ++d) {
for (j = 0; j < isize_arr[i][d].n; ++j) {
kv_push(uint64_t, isize[d], isize_arr[i][d].a[j]);
}
}
}
if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);
for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
mem_pestat_t *r = &pes[d];
uint64_v *q = &isize[d];
@ -273,11 +266,11 @@ void mem_reorder_primary5(int T, mem_alnreg_v *a);
#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid)
{
extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m, seq_sam_t *ss);
extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
@ -288,7 +281,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
memset(h, 0, sizeof(mem_aln_t) * 2);
memset(g, 0, sizeof(mem_aln_t) * 2);
n_aa[0] = n_aa[1] = 0;
if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
PROF_START(matesw);
if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
mem_alnreg_v b[2];
kv_init(b[0]); kv_init(b[1]);
for (i = 0; i < 2; ++i)
@ -300,7 +294,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a);
}
n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
PROF_END(tprof[T_SAM_MATESW][tid], matesw);
n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
if (opt->flag & MEM_F_PRIMARY5) {
mem_reorder_primary5(opt->T, &a[0]);
@ -363,8 +358,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
} else XA[0] = XA[1] = 0;
// write SAM
for (i = 0; i < 2; ++i) {
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
h[i].mapq = q_se[i];
PROF_START(reg2aln);
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
PROF_END(tprof[T_SAM_REG2ALN][tid], reg2aln);
h[i].mapq = q_se[i];
h[i].flag |= 0x40<<i | extra_flag;
h[i].XA = XA[i]? XA[i][z[i]] : 0;
aa[i][n_aa[i]++] = h[i];
@ -377,12 +374,12 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
aa[i][n_aa[i]++] = g[i];
}
}
for (i = 0; i < n_aa[0]; ++i)
mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
s[0].sam = strdup(str.s); str.l = 0;
for (i = 0; i < n_aa[1]; ++i)
mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
s[1].sam = str.s;
ss[0].sam.l = 0;
for (i = 0; i < n_aa[0]; ++i)
mem_aln2sam(opt, bns, &ss[0].sam, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
ss[1].sam.l = 0;
for (i = 0; i < n_aa[1]; ++i)
mem_aln2sam(opt, bns, &ss[1].sam, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
// free
for (i = 0; i < 2; ++i) {
@ -411,9 +408,9 @@ no_pairing:
d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;
}
mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);
mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41 | extra_flag, &h[1], &ss[0]);
mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81 | extra_flag, &h[0], &ss[1]);
if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
free(h[0].cigar); free(h[1].cigar);
return n;
}

View File

@ -266,6 +266,7 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
{ // nucleotide indexing
gzFile fp = xzopen(fa, "r");
start_async_read(fp);
t = clock();
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 0);
@ -280,8 +281,9 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
//exit(0);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
}
stop_async_read(fp);
err_gzclose(fp);
}
if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
{
strcpy(str, prefix); strcat(str, ".pac");
@ -310,11 +312,13 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
}
{
gzFile fp = xzopen(fa, "r");
t = clock();
start_async_read(fp);
t = clock();
if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 1);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
err_gzclose(fp);
stop_async_read(fp);
err_gzclose(fp);
}
{
bwt_t *bwt;

View File

@ -732,8 +732,10 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
uint8_t *pac;
bsw2seq_t *_seq;
bseq1_t *bseq;
int64_t seq_size = 0;
int m = 0;
pac = calloc(bns->l_pac/4+1, 1);
pac = calloc(bns->l_pac/4+1, 1);
for (l = 0; l < bns->n_seqs; ++l)
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
@ -745,13 +747,14 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
ks2 = kseq_init(fp2);
is_pe = 1;
} else fp2 = 0, ks2 = 0, is_pe = 0;
while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2, 1, &seq_size, &m, &bseq);
while (n > 0) {
int size = 0;
if (n > _seq->max) {
_seq->max = n;
kroundup32(_seq->max);
_seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
}
_seq->seq = (bsw2seq1_t*)realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
}
_seq->n = n;
for (i = 0; i < n; ++i) {
bseq1_t *b = &bseq[i];
@ -761,8 +764,8 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
size += p->l;
}
fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size);
free(bseq);
process_seqs(_seq, opt, bns, pac, target, is_pe);
bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2, 1, &seq_size, &m, &bseq);
}
// free
free(pac);

View File

@ -10,7 +10,7 @@
////////////////// for debug and test //////////////////////////
#define DEBUG_FILE_OUTPUT // 打开gfp1-4文件并记录debug信息
// #define DEBUG_FILE_OUTPUT // 打开gfp1-4文件并记录debug信息
// #define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度以及最终扩展的长度
// #define GET_FULL_MATCH_READ // 获取完全匹配的reads
// #define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量

447
fastmap.c
View File

@ -24,20 +24,27 @@
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <zlib.h>
#include <ctype.h>
#include <limits.h>
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <ctype.h>
#include <math.h>
#include <unistd.h>
#include <zlib.h>
#include "bntseq.h"
#include "bwa.h"
#include "bwamem.h"
#include "kvec.h"
#include "utils.h"
#include "bntseq.h"
#include "debug.h"
#include "hyb_idx.h"
#include "kseq.h"
#include "kvec.h"
#include "profiling.h"
#include "share_mem.h"
#include "utils.h"
#include "yarn.h"
KSEQ_DECLARE(gzFile)
extern unsigned char nst_nt4_table[256];
@ -45,83 +52,242 @@ extern unsigned char nst_nt4_table[256];
void *kopen(const char *fn, int *_fd);
int kclose(void *a);
void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
HybridIndex* bwa_hyb_idx_load_from_shm(const char* idx_prefix);
HybridIndex* bwa_hyb_idx_load_from_disk(const char* idx_prefix);
typedef struct {
kseq_t *ks, *ks2;
mem_opt_t *opt;
mem_pestat_t *pes0;
int64_t n_processed;
int copy_comment, actual_chunk_size;
bwaidx_t *idx;
} ktp_aux_t;
typedef struct {
ktp_aux_t *aux;
int n_seqs;
bseq1_t *seqs;
int n_seqs;
int n_sams;
int m_seqs;
int m_sams;
bseq1_t* seqs;
seq_sam_t* sams;
} ktp_data_t;
static void *process(void *shared, int step, void *_data)
{
ktp_aux_t *aux = (ktp_aux_t*)shared;
ktp_data_t *data = (ktp_data_t*)_data;
int i;
if (step == 0) {
ktp_data_t *ret;
int64_t size = 0;
ret = calloc(1, sizeof(ktp_data_t));
ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2);
if (ret->seqs == 0) {
free(ret);
return 0;
}
if (!aux->copy_comment)
for (i = 0; i < ret->n_seqs; ++i) {
free(ret->seqs[i].comment);
ret->seqs[i].comment = 0;
}
for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq;
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size);
return ret;
} else if (step == 1) {
const mem_opt_t *opt = aux->opt;
const bwaidx_t *idx = aux->idx;
if (opt->flag & MEM_F_SMARTPE) {
bseq1_t *sep[2];
int n_sep[2];
mem_opt_t tmp_opt = *opt;
bseq_classify(data->n_seqs, data->seqs, n_sep, sep);
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
if (n_sep[0]) {
tmp_opt.flag &= ~MEM_F_PE;
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0);
for (i = 0; i < n_sep[0]; ++i)
data->seqs[sep[0][i].id].sam = sep[0][i].sam;
}
if (n_sep[1]) {
tmp_opt.flag |= MEM_F_PE;
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0);
for (i = 0; i < n_sep[1]; ++i)
data->seqs[sep[1][i].id].sam = sep[1][i].sam;
}
free(sep[0]); free(sep[1]);
} else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0);
aux->n_processed += data->n_seqs;
return data;
} else if (step == 2) {
for (i = 0; i < data->n_seqs; ++i) {
if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout);
free(data->seqs[i].name); free(data->seqs[i].comment);
free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam);
}
free(data->seqs); free(data);
return 0;
}
return 0;
// Shared state of the read -> compute -> write pipeline.
typedef struct {
kseq_t *ks, *ks2; // input streams (second one for the mate file, if any)
mem_opt_t* opt; // alignment options
mem_pestat_t* pes0; // user-supplied insert-size statistics, or NULL to infer them
int64_t n_processed; // total reads handed to mem_process_seqs() so far
int copy_comment, actual_chunk_size; // forwarded to bseq_read()
bwaidx_t* idx;
mem_worker_t* w; // worker context passed to mem_process_seqs()
int data_idx; // pingpong buffer index
ktp_data_t* data; // batch slots; read_data() alternates between data[0] and data[1]
int wbuf_size; // capacity of wbuf in bytes
char* wbuf; // staging buffer used by write_data() to coalesce SAM records
// NOTE(review): the two flags below are shared between threads and declared volatile;
// volatile alone does not provide inter-thread ordering — confirm every access is
// covered by the yarn locks.
volatile int read_complete; // set by the reader thread at end of input
volatile int calc_complete; // set by the compute thread after its last batch
long read_idx; // number of batches read so far
long calc_idx; // number of batches computed so far
long write_idx; // presumably the number of batches written so far — confirm in thread_write
} ktp_aux_t;
///////////////////// new parallel pipeline ///////////////////
// read
/*
 * Read the next chunk of sequences into one of the two batch slots.
 *
 * The slot is chosen by aux->data_idx, which is flipped on every call so that
 * consecutive calls alternate between aux->data[0] and aux->data[1]. The
 * `data` argument is not used. Returns the filled slot, or 0 when no sequence
 * could be read.
 */
static inline void* read_data(ktp_aux_t* aux, ktp_data_t* data) {
    PROF_START(read);
    ktp_data_t* slot = &aux->data[aux->data_idx];
    int64_t n_bases = 0;

    aux->data_idx = aux->data_idx ? 0 : 1;
    bseq_read(aux->actual_chunk_size, &slot->n_seqs, aux->ks, aux->ks2, aux->copy_comment, &n_bases, &slot->m_seqs, &slot->seqs);
    PROF_END(gprof[G_READ], read);

    if (slot->n_seqs != 0) {
        if (bwa_verbose >= 3)
            fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, slot->n_seqs, (long)n_bases);
        return slot;
    }
    return 0;
}
// calculate
/*
 * Align one batch of reads and leave the SAM text in data->sams.
 *
 * data->sams is kept at least as large as data->seqs (m_sams >= m_seqs); its
 * entries own their string buffers and are reused from batch to batch. With
 * MEM_F_SMARTPE the batch is first split into single-end and paired-end reads,
 * each group is processed separately and the resulting buffers are moved back
 * to the slot of the originating read. Returns `data`.
 */
static inline void* calc_data(ktp_aux_t* aux, ktp_data_t* data) {
    PROF_START(compute);
    const mem_opt_t* opt = aux->opt;
    if (data->n_sams != data->n_seqs) {
        if (data->m_sams < data->m_seqs) {
            const int old_m = data->m_sams;
            data->m_sams = data->m_seqs;
            data->sams = (seq_sam_t*)realloc(data->sams, data->m_sams * sizeof(seq_sam_t));
            /* Zero only the newly added tail: entries below the old capacity may
             * still own buffers from earlier batches and must not be wiped. */
            memset(data->sams + old_m, 0, (data->m_sams - old_m) * sizeof(seq_sam_t));
        }
        data->n_sams = data->n_seqs;
    }

    if (opt->flag & MEM_F_SMARTPE) {
        // Paired-end and single-end reads are mixed in one file and have to be
        // separated first. No memory reuse on this path: it allocates and frees
        // per batch.
        int i;
        bseq1_t* sep[2];
        seq_sam_t* ss[2];
        int n_sep[2];
        mem_opt_t tmp_opt = *opt;
        bseq_classify(data->n_seqs, data->seqs, n_sep, sep);
        /* calloc(count, size): one zero-initialised entry per read of each group */
        ss[0] = (seq_sam_t*)calloc(n_sep[0], sizeof(seq_sam_t));
        ss[1] = (seq_sam_t*)calloc(n_sep[1], sizeof(seq_sam_t));
        if (bwa_verbose >= 3)
            fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
        if (n_sep[0]) {
            tmp_opt.flag &= ~MEM_F_PE;
            mem_process_seqs(&tmp_opt, aux->w, aux->n_processed, n_sep[0], sep[0], 0, ss[0]);
            for (i = 0; i < n_sep[0]; ++i) {
                /* drop the buffer left over from a previous batch before taking
                 * ownership of the freshly produced one */
                free(data->sams[sep[0][i].id].sam.s);
                data->sams[sep[0][i].id].sam = ss[0][i].sam;
            }
        }
        if (n_sep[1]) {
            tmp_opt.flag |= MEM_F_PE;
            mem_process_seqs(&tmp_opt, aux->w, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0, ss[1]);
            for (i = 0; i < n_sep[1]; ++i) {
                free(data->sams[sep[1][i].id].sam.s);
                data->sams[sep[1][i].id].sam = ss[1][i].sam;
            }
        }
        free(sep[0]);
        free(sep[1]);
        /* only the arrays are released; the string buffers now belong to data->sams */
        free(ss[0]);
        free(ss[1]);
    } else {
        mem_process_seqs(opt, aux->w, aux->n_processed, data->n_seqs, data->seqs, aux->pes0, data->sams);
    }

    aux->n_processed += data->n_seqs;
    PROF_END(gprof[G_COMPUTE], compute);
    return data;
}
// write
/*
 * Write the SAM records of one batch to stdout, in read order.
 *
 * Records are coalesced in aux->wbuf (capacity aux->wbuf_size) to reduce the
 * number of write calls. A record that does not fit into the remaining space
 * triggers a flush; a record that can never fit (length >= wbuf_size) is
 * written directly after the flush instead of being copied into the buffer.
 * Empty records are skipped. Always returns 0.
 */
static inline void* write_data(ktp_aux_t* aux, ktp_data_t* data) {
    int i;
    PROF_START(write);
    int buf_written = 0;
    for (i = 0; i < data->n_sams; ++i) {
        const int slen = data->sams[i].sam.l;
        if (slen <= 0)
            continue;
        /* not enough room left: emit what has been staged so far */
        if (buf_written > 0 && buf_written + slen >= aux->wbuf_size) {
            err_fwrite(aux->wbuf, 1, buf_written, stdout);
            buf_written = 0;
        }
        if (slen >= aux->wbuf_size) {
            /* larger than the staging buffer itself: bypass it (the buffer is
             * empty at this point, so output order is preserved) */
            err_fwrite(data->sams[i].sam.s, 1, slen, stdout);
            continue;
        }
        memcpy(&aux->wbuf[buf_written], data->sams[i].sam.s, slen);
        buf_written += slen;
    }
    if (buf_written > 0) {
        err_fwrite(aux->wbuf, 1, buf_written, stdout);
    }
    PROF_END(gprof[G_WRITE], write);
    return 0;
}
// kt_pipeline callback for the mode in which reading and writing do not
// overlap. Dispatches on the pipeline step: 0 = read, 1 = compute, 2 = write.
// Any other step yields 0.
static void* process(void* shared, int step, void* _data) {
    ktp_aux_t* const ctx = (ktp_aux_t*)shared;
    ktp_data_t* const batch = (ktp_data_t*)_data;
    switch (step) {
        case 0:
            return read_data(ctx, batch);
        case 1:
            return calc_data(ctx, batch);
        case 2:
            return write_data(ctx, batch);
        default:
            return 0;
    }
}
////////////// Pipeline variant in which reading and writing can run at the same time
// Lock/counter used by thread_read and thread_calc; new_pipeline creates it with value 2.
static lock_t* input_have = NULL;
// Lock/counter used by thread_calc and thread_write; new_pipeline creates it with value 0.
static lock_t* output_have = NULL;
// Reader thread of the overlapped pipeline.
// `data` is the shared ktp_aux_t. Loops until read_data() returns 0, then
// sets aux->read_complete and exits. Always returns 0.
// NOTE(review): POSSESS / WAIT_FOR / RELEASE / TWIST presumably wrap a
// yarn-style lock API (acquire / wait on value / release / modify-and-release)
// -- confirm against their definitions.
static void* thread_read(void* data) {
ktp_aux_t* aux = (ktp_aux_t*)data;
while (1) {
// Block while input_have is 0.
POSSESS(input_have);
WAIT_FOR(input_have, NOT_TO_BE, 0);
RELEASE(input_have);
// The read itself happens outside the lock.
if (read_data(aux, aux->data) == 0) {
// read_data returned 0: flag completion and decrement input_have so
// a thread_calc waiting on "input_have != 2" can observe the flag.
POSSESS(input_have);
aux->read_complete = 1;
TWIST(input_have, BY, -1);
break;
}
// A batch was read: publish it and decrement input_have.
POSSESS(input_have);
aux->read_idx++;
TWIST(input_have, BY, -1);
}
return 0;
}
// Compute thread of the overlapped pipeline.
// `data` is the shared ktp_aux_t. Alternates between the two entries of
// aux->data (d_idx toggles 0/1) and runs calc_data() on each batch the reader
// has published. Exits once aux->read_complete is observed. Always returns 0.
// NOTE(review): aux->read_idx and aux->read_complete are read here without
// holding input_have -- confirm this is safe on the target platforms.
static void* thread_calc(void* data) {
ktp_aux_t* aux = (ktp_aux_t*)data;
int d_idx = 0;
int add_idx = 0;
while (1) {
// Proceed only when input_have != 2 ...
POSSESS(input_have);
WAIT_FOR(input_have, NOT_TO_BE, 2);
RELEASE(input_have);
// ... and output_have != 2.
POSSESS(output_have);
WAIT_FOR(output_have, NOT_TO_BE, 2);
RELEASE(output_have);
if (aux->calc_idx < aux->read_idx) {
calc_data(aux, aux->data + d_idx);
d_idx = !d_idx;
// NOTE(review): add_idx is never reset to 0, so after the first
// computed batch calc_idx is incremented on every later iteration,
// including ones where this branch was not taken -- confirm intended.
add_idx = 1;
}
if (aux->read_complete) {
POSSESS(output_have);
if (add_idx)
aux->calc_idx++;
aux->calc_complete = 1;
TWIST(output_have, BY, 1); // wake the writer thread one last time
break; // computation finished
}
// Publish the computed batch to the writer ...
POSSESS(output_have);
if (add_idx)
aux->calc_idx++;
TWIST(output_have, BY, 1);
// ... and increment input_have for the reader.
POSSESS(input_have);
TWIST(input_have, BY, 1);
}
return 0;
}
// Writer thread of the overlapped pipeline.
// `data` is the shared ktp_aux_t. Alternates between the two entries of
// aux->data (d_idx toggles 0/1) and runs write_data() on each batch the
// compute thread has published. Exits once aux->calc_complete is observed.
// Always returns 0.
// NOTE(review): aux->calc_idx and aux->calc_complete are read here without
// holding output_have -- confirm this is safe.
static void* thread_write(void* data) {
ktp_aux_t* aux = (ktp_aux_t*)data;
int d_idx = 0;
while (1) {
// Block while output_have is 0.
POSSESS(output_have);
WAIT_FOR(output_have, NOT_TO_BE, 0);
RELEASE(output_have);
if (aux->write_idx < aux->calc_idx) {
write_data(aux, aux->data + d_idx);
d_idx = !d_idx;
aux->write_idx++;
}
if (aux->calc_complete) {
// Compute side is done: write one more batch if still outstanding.
// write_idx is not advanced here because the thread exits right after.
if (aux->write_idx < aux->calc_idx)
write_data(aux, aux->data + d_idx);
break;
}
// Decrement output_have after handling one wake-up.
POSSESS(output_have);
TWIST(output_have, BY, -1);
}
return 0;
}
// Run the overlapped pipeline: one reader, one compute and one writer thread
// coordinated through input_have / output_have. Blocks until all three
// threads have finished.
//
// aux - shared pipeline state handed to every thread.
// Terminates the process if a thread cannot be created, because joining a
// thread id that was never initialised is undefined.
static void new_pipeline(ktp_aux_t* aux) {
    void* (*const stages[3])(void*) = {thread_read, thread_calc, thread_write};
    pthread_t tid[3];
    int i;
    input_have = NEW_LOCK(2);
    output_have = NEW_LOCK(0);
    for (i = 0; i < 3; ++i) {
        const int rc = pthread_create(&tid[i], 0, stages[i], aux);
        if (rc != 0) {
            fprintf(stderr, "[E::%s] fail to create pipeline thread %d: %s\n", __func__, i, strerror(rc));
            exit(EXIT_FAILURE);
        }
    }
    for (i = 0; i < 3; ++i) pthread_join(tid[i], 0);
}
///////////////////////////////////////////////////////////////
static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
{
if (opt0->a) { // matching score is changed
@ -150,13 +316,28 @@ int main_mem(int argc, char *argv[])
mem_pestat_t pes[4];
ktp_aux_t aux;
#ifdef DEBUG_FILE_OUTPUT
open_debug_files();
#endif
#ifdef SHOW_PERF
#if USE_RDTSC
uint64_t tmp_time = __rdtsc();
sleep(1);
proc_freq = __rdtsc() - tmp_time;
#else
proc_freq = 1000;
#endif
#endif
PROF_START(all);
memset(&aux, 0, sizeof(ktp_aux_t));
memset(pes, 0, 4 * sizeof(mem_pestat_t));
for (i = 0; i < 4; ++i) pes[i].failed = 1;
aux.opt = opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:b:we")) >= 0) {
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:F:z:b:ge")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == '1') no_mt_io = 1;
else if (c == 'x') mode = optarg;
@ -256,7 +437,7 @@ int main_mem(int argc, char *argv[])
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
} else if (c == 'b')
opt->batch_size = atoi(optarg) >> 1 << 1, opt->batch_size = opt->batch_size > 1 ? opt->batch_size : 256;
else if (c == 'w')
else if (c == 'g')
opt->use_bwt = 1;
else if (c == 'e') opt->skip_entire_match = 1;
else return 1;
@ -325,7 +506,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " FR orientation only. [inferred]\n");
fprintf(stderr, " -u output XB instead of XA; XB is XA with the alignment score and mapping quality added.\n");
fprintf(stderr, " -b INT batch size of reads to process at one time [%d].\n", opt->batch_size);
fprintf(stderr, " -w Use bwt index for seeding\n");
fprintf(stderr, " -g Use bwt index for seeding\n");
fprintf(stderr, " -e Skip the second and third seeding steps for entire matching reads.\n");
fprintf(stderr, "\n");
fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
@ -334,6 +515,9 @@ int main_mem(int argc, char *argv[])
return 1;
}
if (opt->n_threads < 1) opt->n_threads = 1;
if (opt->batch_size < 1) opt->batch_size = 256;
if (mode) {
if (strcmp(mode, "intractg") == 0) {
if (!opt0.o_del) opt->o_del = 16;
@ -366,22 +550,48 @@ int main_mem(int argc, char *argv[])
} else update_a(opt, &opt0);
bwa_fill_scmat(opt->a, opt->b, opt->mat);
aux.idx = bwa_idx_load_from_shm(argv[optind]);
if (aux.idx == 0) {
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
} else if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
if (ignore_alt)
PROF_START(load_idx);
if (opt->use_bwt) {
aux.idx = bwa_idx_load_from_shm(argv[optind]);
if (aux.idx == 0) {
if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0)
return 1; // FIXME: memory leak
} else if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
} else { // load hybrid-index
// 加载除了hyb之外其他的必要部分
char fn[MAX_PATH];
FILE* fp = NULL;
uint64_t ref_len = 0;
sprintf(fn, "%s.ref-len", argv[optind]);
fp = xopen(fn, "r");
err_check_false(fscanf(fp, "%ld", &ref_len), EOF);
err_fclose(fp);
aux.idx = bwa_idx_load(argv[optind], BWA_IDX_BNS | BWA_IDX_PAC);
//////////////////////////////
aux.idx->hyb = bwa_hyb_idx_load_from_shm(argv[optind]);
if (aux.idx->hyb == 0) {
aux.idx->hyb = bwa_hyb_idx_load_from_disk(argv[optind]);
} else {
aux.idx->is_shm = 1;
}
aux.idx->hyb->ref_len = ref_len;
}
if (ignore_alt)
for (i = 0; i < aux.idx->bns->n_seqs; ++i)
aux.idx->bns->anns[i].is_alt = 0;
PROF_END(gprof[G_LOAD_IDX], load_idx);
ko = kopen(argv[optind + 1], &fd);
ko = kopen(argv[optind + 1], &fd);
if (ko == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]);
return 1;
}
fp = gzdopen(fd, "r");
aux.ks = kseq_init(fp);
start_async_read(fp); // 采用双buffer技术将读取和解压overlap
aux.ks = kseq_init(fp);
if (optind + 2 < argc) {
if (opt->flag&MEM_F_PE) {
if (bwa_verbose >= 2)
@ -393,23 +603,52 @@ int main_mem(int argc, char *argv[])
return 1;
}
fp2 = gzdopen(fd2, "r");
aux.ks2 = kseq_init(fp2);
start_async_read(fp2);
aux.ks2 = kseq_init(fp2);
opt->flag |= MEM_F_PE;
}
}
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
aux.w = init_mem_worker(opt, aux.idx->bwt, aux.idx->hyb, aux.idx->bns, aux.idx->pac);
aux.data = (ktp_data_t*)calloc(2, sizeof(ktp_data_t));
// allocate write buffer
aux.wbuf_size = 16777216;
aux.wbuf = (char*)malloc(aux.wbuf_size);
bwa_print_sam_hdr(aux.idx->bns, hdr_line);
aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
free(hdr_line);
free(opt);
bwa_idx_destroy(aux.idx);
kseq_destroy(aux.ks);
err_gzclose(fp); kclose(ko);
PROF_START(pipeline);
if (no_mt_io) { // 不同时读写
kt_pipeline(2, process, &aux, 3);
} else {
new_pipeline(&aux);
}
PROF_END(gprof[G_PIPELINE], pipeline);
// no need to free these
// free(hdr_line);
// free(opt);
// bwa_idx_destroy(aux.idx);
// kseq_destroy(aux.ks);
stop_async_read(fp);
err_gzclose(fp); kclose(ko);
if (aux.ks2) {
kseq_destroy(aux.ks2);
err_gzclose(fp2); kclose(ko2);
// kseq_destroy(aux.ks2);
stop_async_read(fp2);
err_gzclose(fp2); kclose(ko2);
}
return 0;
PROF_END(gprof[G_ALL], all);
#ifdef SHOW_PERF
display_stats(opt->n_threads);
#endif
#ifdef DEBUG_FILE_OUTPUT
close_files();
#endif
return 0;
}
int main_fastmap(int argc, char *argv[])
@ -447,7 +686,8 @@ int main_fastmap(int argc, char *argv[])
}
fp = xzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
start_async_read(fp);
seq = kseq_init(fp);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
itr = smem_itr_init(idx->bwt);
smem_config(itr, min_intv, max_len, max_intv);
@ -485,6 +725,7 @@ int main_fastmap(int argc, char *argv[])
smem_itr_destroy(itr);
bwa_idx_destroy(idx);
kseq_destroy(seq);
err_gzclose(fp);
stop_async_read(fp);
err_gzclose(fp);
return 0;
}

145
hyb_bwa.c
View File

@ -38,6 +38,7 @@
#include "utils.h"
#include "kvec.h"
#include "hyb_idx.h"
#include "share_mem.h"
#ifdef _DIVBWT
@ -219,6 +220,61 @@ int bwa_bwt2kmer(int argc, char* argv[]) {
return 0;
}
// Re-pack a 2-bit reference so that, within each byte, base i occupies the
// lowest two bits and base i+3 the highest, then write the result to a file.
//
// old_pac    - source packed sequence, read through _gp() (base i sits at bit
//              offset ((~i & 3) << 1) of byte i >> 2).
// l_pac      - number of bases.
// new_pac_fn - output path.
//
// Output layout (unchanged): l_pac / 4 + 1 packed bytes, one extra zero byte
// when l_pac is a multiple of 4, then one byte holding l_pac % 4.
// Open, write and close failures are reported through the err_* / xopen
// helpers instead of being ignored.
void convert_to_hyb_pac(uint8_t* old_pac, uint64_t l_pac, const char* new_pac_fn) {
#define _gp(l) ((old_pac)[(l) >> 2] >> ((~(l) & 3) << 1) & 3)
    const uint64_t kPacByteNum = l_pac / 4 + 1;
    // Exactly the number of bytes that is written; also valid for l_pac == 0.
    uint8_t* pac = (uint8_t*)calloc(kPacByteNum, 1);
    FILE* pacFp = NULL;
    uint8_t byte_bases = 0;
    uint64_t i = 0;
    uint8_t* p1;
    if (pac == NULL) {
        fprintf(stderr, "[E::%s] fail to allocate %llu bytes\n", __func__, (unsigned long long)kPacByteNum);
        exit(EXIT_FAILURE);
    }
    pacFp = xopen(new_pac_fn, "wb");
    // Full groups of four bases.
    for (; i + 3 < l_pac; i += 4) {
        p1 = pac + (i >> 2);
        byte_bases = _gp(i) | (_gp(i + 1) << 2) | (_gp(i + 2) << 4) | (_gp(i + 3) << 6);
        *p1 = byte_bases;
    }
    // Remaining 0..3 bases go into the final byte, low bits first.
    byte_bases = 0;
    p1 = pac + (i >> 2);
    for (uint32_t j = 0; i < l_pac; ++i, ++j) {
        byte_bases |= _gp(i) << j * 2;
    }
    *p1 = byte_bases;
    err_fwrite(pac, 1, kPacByteNum, pacFp);
    uint8_t ct = 0;
    if (l_pac % 4 == 0) {
        ct = 0;
        err_fwrite(&ct, 1, 1, pacFp);
    }
    ct = l_pac % 4;
    err_fwrite(&ct, 1, 1, pacFp);
    err_fclose(pacFp);
    free(pac);
}
// Sub-command: convert <prefix>.pac into the byte layout used by the hybrid
// index and write it to <prefix>.hyb.pac. The reference length is read from
// <prefix>.ref-len.
//
// argv[optind] is the index prefix. Returns 0 on success and 1 when the
// prefix argument is missing.
int bwa_pac2hybpac(int argc, char* argv[]) {
    if (optind + 1 > argc) {
        fprintf(stderr, "Usage: bwa pac2hybpac <in.prefix>\n\n");
        return 1;
    }
    char fn[MAX_PATH];
    FILE* fp;
    uint8_t* old_pac = NULL;
    // Scan through an unsigned long long so the conversion specifier matches
    // the object type exactly, then narrow to the uint64_t the converter takes.
    unsigned long long ref_len = 0;
    uint64_t l_pac = 0;
    // fprintf(stderr, "here-1\n");
    // Every path is built with a bounded snprintf: the prefix is user input.
    snprintf(fn, MAX_PATH, "%s.pac", argv[optind]);
    _load_file_to_data(fn, old_pac);
    snprintf(fn, MAX_PATH, "%s.ref-len", argv[optind]);
    fp = xopen(fn, "r");
    err_check_false(fscanf(fp, "%llu", &ref_len), EOF);
    err_fclose(fp);
    l_pac = (uint64_t)ref_len;
    snprintf(fn, MAX_PATH, "%s.hyb.pac", argv[optind]);
    // fprintf(stderr, "here-2\n");
    convert_to_hyb_pac(old_pac, l_pac, fn);
    return 0;
}
// 创建hybrid index并保存到文件
int bwa_bwt2hyb(int argc, char* argv[]) {
int hyb_idx_build_and_dump(int num_threads, bwt_t* bwt, const char* idx_prefix);
@ -238,7 +294,7 @@ int bwa_bwt2hyb(int argc, char* argv[]) {
}
}
if (optind + 1 > argc || error) {
fprintf(stderr, "Usage: bwa bwt2hyb [Options] <bwt-prefix>\n\n");
fprintf(stderr, "Usage: bwa bwt2hyb [Options] <in.prefix>\n\n");
fprintf(stderr, "Options: -t INT number of threads for hybrid index building [%d]\n", num_threads);
fprintf(stderr, "\n");
return 1;
@ -254,6 +310,93 @@ int bwa_bwt2hyb(int argc, char* argv[]) {
return 0;
}
// Look up one hybrid-index component (<idx_prefix><suffix>) in shared memory.
// Returns NULL when the name does not fit in MAX_PATH or the lookup fails.
static uint8_t* hyb_shm_get_part(const char* idx_prefix, const char* suffix) {
    char fn[MAX_PATH];
    const int n = snprintf(fn, sizeof(fn), "%s%s", idx_prefix, suffix);
    if (n < 0 || (size_t)n >= sizeof(fn)) {
        return NULL;  // a truncated name could never match a stored index
    }
    return (uint8_t*)shm_get_index(fn);
}
// Try to attach to a hybrid index that is already resident in shared memory.
//
// idx_prefix - index prefix; the four components are looked up under
//              prefix + HYB_PAC_SUFFIX / HYB_SA_SUFFIX / HYB_KMER_SUFFIX /
//              HYB_DATA_SUFFIX.
// Returns a newly allocated HybridIndex whose data pointers reference the
// shared segments, or NULL if any component is missing (callers then fall
// back to loading from disk).
HybridIndex* bwa_hyb_idx_load_from_shm(const char* idx_prefix) {
    uint8_t* ref_bits = hyb_shm_get_part(idx_prefix, HYB_PAC_SUFFIX);
    uint8_t* sa = hyb_shm_get_part(idx_prefix, HYB_SA_SUFFIX);
    uint8_t* kmer_data = hyb_shm_get_part(idx_prefix, HYB_KMER_SUFFIX);
    uint8_t* index_data = hyb_shm_get_part(idx_prefix, HYB_DATA_SUFFIX);
    if (!ref_bits || !sa || !kmer_data || !index_data) {
        return NULL;
    }
    HybridIndex* hyb = (HybridIndex*)calloc(1, sizeof(HybridIndex));
    if (hyb == NULL) {
        return NULL;
    }
    hyb->ref_bits = ref_bits;
    hyb->sa = sa;
    hyb->kmer_data = kmer_data;
    hyb->index_data = index_data;
    return hyb;
}
// Load the hybrid index from disk.
//
// idx_prefix - index prefix; each component file is prefix + suffix.
// Reads the four component files completely into heap buffers and reports
// size and load time of each on stderr. Returns a newly allocated
// HybridIndex owning those buffers. Terminates the process if memory for the
// index cannot be allocated; file errors are handled by the err_* helpers.
HybridIndex* bwa_hyb_idx_load_from_disk(const char* idx_prefix) {
    char fn[MAX_PATH];
    FILE* fp = NULL;
    struct stat st;
    double sec_time;
// Loads <idx_prefix><suffix> into a freshly malloc'ed buffer stored in `data`.
// The file name is built with a bounded snprintf and the allocation is checked
// before the read so an out-of-memory condition is reported, not dereferenced.
#define __load_hybrid_idx_code(suffix, data)                                                                              \
    sec_time = realtime();                                                                                                \
    snprintf(fn, sizeof(fn), "%s%s", idx_prefix, suffix);                                                                 \
    err_check_true(stat(fn, &st), 0);                                                                                     \
    fp = xopen(fn, "r");                                                                                                  \
    data = (uint8_t*)malloc(st.st_size);                                                                                  \
    if (data == NULL && st.st_size > 0) {                                                                                 \
        fprintf(stderr, "[E::%s] fail to allocate memory for `%s'\n", __func__, fn);                                      \
        exit(EXIT_FAILURE);                                                                                               \
    }                                                                                                                     \
    err_fread_noeof(data, 1, st.st_size, fp);                                                                             \
    err_fclose(fp);                                                                                                       \
    fprintf(stderr, "%s, %0.2f GB, %0.2f s\n", fn, (double)st.st_size / 1024 / 1024 / 1024, realtime() - sec_time);
    HybridIndex* hyb = (HybridIndex*)calloc(1, sizeof(HybridIndex));
    if (hyb == NULL) {
        fprintf(stderr, "[E::%s] fail to allocate the hybrid index descriptor\n", __func__);
        exit(EXIT_FAILURE);
    }
    // load hyb packed reference
    __load_hybrid_idx_code(HYB_PAC_SUFFIX, hyb->ref_bits);
    // load hyb byte-sa
    __load_hybrid_idx_code(HYB_SA_SUFFIX, hyb->sa);
    // load hyb kmer data
    __load_hybrid_idx_code(HYB_KMER_SUFFIX, hyb->kmer_data);
    // load hyb index data
    __load_hybrid_idx_code(HYB_DATA_SUFFIX, hyb->index_data);
    return hyb;
}
// Sub-command: manage hybrid indices kept in shared memory.
//
//   -d           destroy all hybrid indices in shared memory
//   -l           list the indices in shared memory
//   idx_prefix   (no option) stage this index into shared memory
//
// When both -l and -d are given, -l takes precedence. Returns 1 on a usage
// error, otherwise the result of the selected operation.
int main_shm_hyb(int argc, char* argv[]) {
    // getopt() returns int and signals the end of options with -1. Storing it
    // in a plain char never compares < 0 where char is unsigned, which would
    // make the loop endless.
    int c;
    int clear_shm = 0;
    int list_shm = 0;
    int error = 0;
    while ((c = getopt(argc, argv, "dl")) >= 0) {
        switch (c) {
            case 'd':
                clear_shm = 1;
                break;
            case 'l':
                list_shm = 1;
                break;
            default:
                error = 1;
                break;
        }
    }
    // fprintf(stderr, "%d %d\n", optind, argc);
    if ((optind == argc && !clear_shm && !list_shm) || error) {
        fprintf(stderr, "Usage: bwa hybshm [-d|-l] [idx_prefix]\n\n");
        fprintf(stderr, "Options: -d destroy all hyb indices in shared memory\n");
        fprintf(stderr, " -l list names of indices in shared memory\n");
        fprintf(stderr, "\n");
        return 1;
    }
    if (list_shm) {
        return list_shm_hyb_indices();
    } else if (clear_shm) {
        return shm_clear_hyb();
    }
    return shm_keep_hyb(argv[optind]);
}
////////////////////////////////////////////// for test /////////////////////////////////////
// 创建正向的kmer
uint64_t build_forward_kmer(const uint8_t* q, int qlen, int kmer_len, int* base_consumed) {
uint64_t qbit = 0, i;

View File

@ -59,7 +59,7 @@ typedef struct {
uint8_t* for_bits; // 正向序列 2-bit编码
uint8_t* back_bits; // 反向互补序列 2-bit编码
int id; // for test;
char* seqstr;
// char* seqstr;
} ReadSeq;
typedef kvec_t(ReadSeq) ReadSeqArr;

View File

@ -0,0 +1,314 @@
#include "hyb_idx.h"
#include "profiling.h"
// Handle a backward k-mer lookup that resolved to a single reference position.
//
// x / rx         - current read position and its reverse-strand counterpart.
// rr             - read range expressed in reverse-strand coordinates.
// init_match_len - number of bases already known to match at ref_pos.
// ref_pos        - reference position of the hit (reverse-strand coordinate,
//                  converted with _rev_ref before the seed is stored).
// seeds/seeds_cap - not referenced directly; presumably used by the
//                  __add_seed_one_pos macro -- TODO confirm.
// read_range is not used in this function.
// Returns the read position at which the caller should continue.
static int handle_hits_1(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x,
int rx, int init_match_len, uint64_t ref_pos, const int min_seed_len, HybSeedArr* seeds,
uint64_t* seeds_cap) {
int left_match = 0, right_match = 0;
// Extend the match in both directions around rx.
both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos, &left_match,
&right_match);
if (left_match + right_match >= min_seed_len) {
ref_pos = _rev_ref(hyb, ref_pos);
// Seed covers read interval [x - right_match + 1, x + left_match + 1).
__add_seed_one_pos(seed, ref_pos - right_match + 1, x - right_match + 1, x + left_match + 1);
}
// Advance past the match, but at least far enough that the next seed
// starting at the same left end could reach min_seed_len.
return MAX(x + left_match + 1, x - right_match + 1 + min_seed_len);
}
// Handle a backward lookup that resolved to exactly two reference positions
// (suffix-array entries sa_pos and sa_pos + 1).
//
// Both candidates are extended in both directions. The resulting read
// intervals are then compared: equal, crossing, or one containing the other.
// Seeds are added accordingly.
// seeds/seeds_cap are presumably consumed by the __add_seed_one_pos /
// __check_add_seed_one_pos macros -- TODO confirm.
// Returns the read position at which the caller should continue.
static int handle_hits_2(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* rr, int x,
int rx, int init_match_len, uint64_t sa_pos, const int min_seed_len, HybSeedArr* seeds,
uint64_t* seeds_cap, int tid) {
int left_match_arr[2] = {0}, right_match_arr[2] = {0};
Range mr_arr[2] = {0};
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, sa_pos), hyb_sa_to_ref_pos(hyb->sa, sa_pos + 1)};
int i = 0;
// Extend each candidate and record the read interval it covers.
for (i = 0; i < 2; ++i) {
both_end_match(hyb, read_seq->len, rr, read_seq->back_bits, read_seq->for_bits, rx, init_match_len, ref_pos_arr[i],
&left_match_arr[i], &right_match_arr[i]);
_set_range(mr_arr[i], x - right_match_arr[i] + 1, x + left_match_arr[i] + 1);
}
if (_range_equal(mr_arr[0], mr_arr[1])) { // identical intervals
if (mr_arr[0].end - mr_arr[0].start >= min_seed_len) { // forward search to determine the order of the ref positions
uint8_t type_hits = 0;
uint64_t offset = 0;
get_kmer_data(hyb, read_seq->for_bits, mr_arr[0].start, &type_hits, &offset);
if (type_hits == 2) {
// The forward k-mer already identifies both positions.
ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset);
ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, offset + 1);
__add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
} else { // the index must be walked to determine the ref positions
uint32_t seq_pos = mr_arr[0].start + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
uint8_t cmp_ref = 0;
// PROF_START(seed_1);
get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits,
&sa_start, &cmp_ref, tid);
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, sa_start);
ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, sa_start + 1);
__add_seed_one_pos(seed, ref_pos_arr[0], mr_arr[0].start, mr_arr[0].end);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
}
}
} else if (_range_cross(mr_arr[0], mr_arr[1])) { // crossing intervals
// Add both, the one with the smaller start first.
if (mr_arr[0].start < mr_arr[1].start) {
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start,
mr_arr[0].end);
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start,
mr_arr[1].end);
} else {
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start,
mr_arr[1].end);
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start,
mr_arr[0].end);
}
} else { // one interval contains the other
// Keep only the enclosing interval.
if (mr_arr[0].start < mr_arr[1].start || mr_arr[0].end > mr_arr[1].end) {
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[0]) - right_match_arr[0] + 1, mr_arr[0].start,
mr_arr[0].end);
} else {
__check_add_seed_one_pos(seed, _rev_ref(hyb, ref_pos_arr[1]) - right_match_arr[1] + 1, mr_arr[1].start,
mr_arr[1].end);
}
}
// Continue after the farther end, but at least min_seed_len past the
// smaller start.
return MAX(MAX(mr_arr[0].end, mr_arr[1].end), MIN(mr_arr[0].start, mr_arr[1].start) + min_seed_len);
}
// Handle a backward lookup that still has more than two hits: restart with a
// forward search from read position x.
//
// x - read position where the forward search starts.
// seeds/seeds_cap are presumably consumed by the __add_seed_one_pos macro --
// TODO confirm.
// Returns the read position at which the caller should continue
// (at least x + min_seed_len).
static int handle_hits_much(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, int x,
const int min_seed_len, HybSeedArr* seeds, uint64_t* seeds_cap, int tid) {
int max_reach = x + HYB_KMER_LEN;
int right_match = 0;
uint8_t type_hits = 0;
uint64_t offset = 0;
uint64_t ref_pos = 0;
uint32_t seq_pos = x + HYB_KMER_LEN;
uint32_t hits = 0;
uint64_t sa_start = 0;
uint8_t cmp_ref = 0;
int i = 0;
get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset);
if (type_hits == 2) {
// Two candidates: extend each to the right and keep the longer one
// (both when they reach equally far).
int match_end[2] = {0};
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
for (i = 0; i < 2; ++i) {
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN,
ref_pos_arr[i], &right_match);
match_end[i] = x + right_match;
}
max_reach = MAX(match_end[0], match_end[1]);
if (max_reach - x >= min_seed_len) {
if (match_end[0] == match_end[1]) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
} else if (match_end[0] > match_end[1]) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, match_end[0]);
} else {
__add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]);
}
}
} else {
// Walk the index from the k-mer entry; get_leaf_node updates seq_pos,
// hits, sa_start and cmp_ref.
hits = type_hits;
// PROF_START(seed_1);
get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start,
&cmp_ref, tid);
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
// tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++;
if (seq_pos == read_range->end || !cmp_ref) {
// Walk ended at the read end or without needing a reference
// comparison: record every remaining hit.
max_reach = seq_pos;
if (max_reach - x >= min_seed_len) {
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach);
// NOTE: this inner `i` shadows the function-scope `i`.
int i = 0;
for (i = 1; i < hits; ++i) {
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
}
}
} else {
// Extend against the reference at the first hit.
ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start);
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos,
&right_match);
max_reach = x + right_match;
if (right_match >= min_seed_len) {
__add_seed_one_pos(seed, ref_pos, x, max_reach);
}
}
}
return MAX(max_reach, x + min_seed_len);
}
// Forward seeding from the first position of read_range.
//
// Looks up the k-mer at read_range->start and, depending on the hit type
// (0, 1, 2 or more), extends to the right and adds at most one seed. For the
// seed it adds, seed->first_len is also set in most branches.
// seeds is presumably appended to through the __add_seed_one_pos macro, which
// also uses the local seeds_cap -- TODO confirm.
// Returns the read position from which the caller should continue
// (at least read_range->start + min_seed_len).
int seeding_from_start(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
HybSeedArr* seeds, int tid) {
// PROF_START(seed_1);
uint64_t seeds_m = seeds->m;
uint64_t* seeds_cap = &seeds_m; // tracks the current capacity of seeds; if it grows, ref_pos_arr.m, n, a of the new entries must be initialised to 0
int max_reach = read_range->start + HYB_KMER_LEN; // result: farthest read position matched
int x = read_range->start; // start matching at the beginning of read_range
uint64_t i = 0;
int right_match = 0;
uint8_t type_hits = 0;
uint64_t offset = 0;
uint64_t ref_pos = 0;
get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset);
// PROF_END(tprof[T_SEED_1_0][tid], seed_1);
if (type_hits == 0) {
// No hit: nothing to add; max_reach keeps its initial value.
// tdat[TD_SEED_1_0][tid]++;
} else if (type_hits == 1) {
// Single hit: `offset` is passed directly as the reference position.
// tdat[TD_SEED_1_1][tid]++;
// PROF_START(seed_1);
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN, offset,
&right_match);
max_reach = x + right_match;
if (max_reach - x >= min_seed_len) {
__add_seed_one_pos(seed, offset, x, max_reach);
seed->first_len = HYB_KMER_LEN;
}
// PROF_END(tprof[T_SEED_1_1][tid], seed_1);
} else if (type_hits == 2) {
// Two hits: `offset` indexes the suffix array; extend both candidates.
// tdat[TD_SEED_1_2][tid]++;
// PROF_START(seed_1);
int match_end[2] = {0};
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
for (i = 0; i < 2; ++i) {
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN,
ref_pos_arr[i], &right_match);
match_end[i] = x + right_match;
}
max_reach = MAX(match_end[0], match_end[1]);
if (max_reach - x >= min_seed_len) {
// Keep the candidate reaching farther (both when tied); first_len
// records the end of the shorter match.
if (match_end[0] == match_end[1]) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, max_reach);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
seed->first_len = match_end[0];
// seed->first_len = HYB_KMER_LEN;
} else if (match_end[0] > match_end[1]) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, match_end[0]);
seed->first_len = match_end[1];
// seed->first_len = HYB_KMER_LEN;
} else {
__add_seed_one_pos(seed, ref_pos_arr[1], x, match_end[1]);
seed->first_len = match_end[0];
// seed->first_len = HYB_KMER_LEN;
}
}
// PROF_END(tprof[T_SEED_1_2][tid], seed_1);
} else {
// More than two hits: walk the index starting at index_data + offset.
// tdat[TD_SEED_1_3][tid]++;
uint32_t seq_pos = x + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
uint8_t cmp_ref = 0;
// PROF_START(seed_1);
get_leaf_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, &seq_pos, &hits, &sa_start,
&cmp_ref, tid);
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
if (seq_pos == read_range->end || !cmp_ref) {
// Walk consumed the read range or needs no reference comparison:
// record all remaining hits.
max_reach = seq_pos;
if (max_reach - x >= min_seed_len) {
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, max_reach);
if (hits == 1)
seed->first_len = seq_pos - x;
for (i = 1; i < hits; ++i) {
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
}
}
} else {
// Extend against the reference at the first hit.
ref_pos = hyb_sa_to_ref_pos(hyb->sa, sa_start);
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, x, seq_pos - x, ref_pos,
&right_match);
max_reach = x + right_match;
if (right_match >= min_seed_len) {
__add_seed_one_pos(seed, ref_pos, x, max_reach);
seed->first_len = seq_pos - x;
}
}
// PROF_END(tprof[T_SEED_1_3][tid], seed_1);
}
// PROF_END(tprof[T_SEED_1_ALL][tid], seed_1);
// PROF_END(tprof[T_SEED_1_1][tid], seed_1);
return MAX(max_reach, x + min_seed_len);
}
//////////////
// Find SMEMs with the hybrid index (seeding pass 1); seeds are required to have hits >= min_hits_thres (> 0).
//
// First seeds forward from the start of read_range (seeding_from_start), then
// repeatedly performs a backward lookup at read position x on the
// reverse-strand bits. Each lookup is dispatched on the number of hits to
// handle_hits_1 / handle_hits_2 / handle_hits_much, whose return value is the
// next x. Stops once x reaches read_range->end. Results are appended to seeds.
void hyb_first_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const int min_seed_len,
HybSeedArr* seeds, int tid) {
int x = seeding_from_start(hyb, read_seq, read_range, min_seed_len, seeds, tid);
int rx = 0; // position of x on the reverse strand
// read_range mirrored into reverse-strand coordinates.
Range rr = {read_seq->len - read_range->end, read_seq->len - read_range->start};
uint64_t seeds_m = seeds->m;
uint64_t* seeds_cap = &seeds_m; // tracks the current capacity of seeds; if it grows, ref_pos_arr.m, n, a of the new entries must be initialised to 0
uint8_t type_hits = 0;
uint64_t offset = 0;
// Step applied to x when the k-mer has no hit.
int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1;
// PROF_START(seed_1);
while (x < read_range->end) {
// Backward search; at this point x is more than 16 past the start.
rx = read_seq->len - x - 1; // reverse position; the forward side includes x, hence the -1
// PROF_START(seed_1);
get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset);
// PROF_END(tprof[T_SEED_1_0][tid], seed_1);
if (type_hits == 0) {
x += extra_tend;
// tdat[TD_SEED_1_0][tid]++;
} else if (type_hits == 1) {
// tdat[TD_SEED_1_1][tid]++;
// PROF_START(seed_1);
x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap);
// PROF_END(tprof[T_SEED_1_1][tid], seed_1);
} else if (type_hits == 2) {
// tdat[TD_SEED_1_2][tid]++;
// PROF_START(seed_1);
x = handle_hits_2(hyb, read_seq, read_range, &rr, x, rx, HYB_KMER_LEN, offset, min_seed_len, seeds, seeds_cap, tid);
// PROF_END(tprof[T_SEED_1_2][tid], seed_1);
} else {
// More than two hits: walk the index on the reverse-strand sequence
// and dispatch on the hit count the walk ends with.
// tdat[TD_SEED_1_3][tid]++;
// PROF_START(seed_1);
uint32_t seq_pos = rx + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
uint8_t cmp_ref = 0;
get_leaf_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, &seq_pos, &hits, &sa_start,
&cmp_ref, tid);
// PROF_END(tprof[T_SEED_1_3_1][tid], seed_1);
// tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++;
// tdat[TD_SEED_1_0][tid]++;
// if (hits == 1) {
// tdat[TD_SEED_1_1][tid]++;
// } else if (hits == 2) {
// tdat[TD_SEED_1_2][tid]++;
// } else if (hits == 3) {
// tdat[TD_SEED_1_3][tid]++;
// } else if (hits == 4) {
// tdat[TD_SEED_1_4][tid]++;
// } else {
// tdat[TD_SEED_1_5][tid]++;
// }
if (seq_pos == rr.end || !cmp_ref) {
if (hits == 1) {
// PROF_START(seed_1);
x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start),
min_seed_len, seeds, seeds_cap);
// PROF_END(tprof[T_SEED_1_3_2][tid], seed_1);
} else if (hits == 2) {
// PROF_START(seed_1);
x = handle_hits_2(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, sa_start, min_seed_len, seeds,
seeds_cap, tid);
// PROF_END(tprof[T_SEED_1_3_3][tid], seed_1);
} else {
// Still many hits: restart forward from the left end of
// the backward match (x - (seq_pos - rx) + 1).
// PROF_START(seed_1);
x = handle_hits_much(hyb, read_seq, read_range, x + rx - seq_pos + 1, min_seed_len, seeds, seeds_cap, tid);
// PROF_END(tprof[T_SEED_1_3_4][tid], seed_1);
}
} else { // hits == 1
// PROF_START(seed_1);
x = handle_hits_1(hyb, read_seq, read_range, &rr, x, rx, seq_pos - rx, hyb_sa_to_ref_pos(hyb->sa, sa_start),
min_seed_len, seeds, seeds_cap);
// PROF_END(tprof[T_SEED_1_3_5][tid], seed_1);
}
// PROF_END(tprof[T_SEED_1_3][tid], seed_1);
}
}
// PROF_END(tprof[T_SEED_1_ALL][tid], seed_1);
// PROF_END(tprof[T_SEED_1_0][tid], seed_1);
}

View File

@ -0,0 +1,208 @@
#include "hyb_idx.h"
#include "profiling.h"
#define CALC_STAT 0
// Walk the hybrid-index nodes starting at idata for as long as the hit count
// stays >= min_hits, the read is not exhausted and a next node exists.
//
// The caller must supply initialised *hits_p and *seq_pos_p.
// On return *seq_pos_p, *hits_p and *sa_start_p describe where the walk ended.
// If the last step dropped below min_hits, the state is rolled back to the
// previous node and that node is re-parsed with
// parse_one_hyb_node_min_hits().
static void get_min_hits_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) {
uint8_t cmp_ref_val = 0;
int is_head_node = 1;
uint8_t* cmp_ref = &cmp_ref_val;
// Snapshot of the state before the most recent parse step, for roll-back.
uint8_t* prev_addr = idata;
uint32_t prev_seq_pos = *seq_pos_p;
uint32_t prev_hits = *hits_p;
uint64_t prev_sa_start = *sa_start_p;
uint8_t* next_addr = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
#if CALC_STAT
// Optional statistics: histogram of the address distance between nodes.
if (next_addr != NULL) {
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
uint64_t dist = next_addr - prev_addr;
if (dist < 32)
gdat[0]++;
else if (dist < 64)
gdat[1]++;
else if (dist < 128)
gdat[2]++;
else
gdat[3]++;
}
#endif
while (next_addr != NULL && *hits_p >= min_hits && *seq_pos_p < seq_end) {
prev_addr = next_addr;
prev_seq_pos = *seq_pos_p;
prev_hits = *hits_p;
prev_sa_start = *sa_start_p;
next_addr = parse_one_hyb_node(next_addr, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
is_head_node = 0;
#if CALC_STAT
if (next_addr != NULL) {
// fprintf(stderr, "addr dist: %ld\n", next_addr - prev_addr);
uint64_t dist = next_addr - prev_addr;
if (dist < 32)
gdat[0]++;
else if (dist < 64)
gdat[1]++;
else if (dist < 128)
gdat[2]++;
else
gdat[3]++;
}
#endif
}
if (*hits_p < min_hits) {
// Overshot: restore the previous state and re-parse that node with the
// min_hits-aware parser.
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
next_addr = prev_addr;
parse_one_hyb_node_min_hits(next_addr, seq_bits, seq_bp, seq_end, min_hits, is_head_node, seq_pos_p, sa_start_p, hits_p,
tid);
}
}
// Seeding pass 2: backward search first, then forward.
//
// Re-seeds around the middle (pivot) of an existing seed
// [seed_start, seed_end), looking for matches with at least min_hits
// occurrences inside the read window [read_start, read_end).
//
// first_ref  - reference position of the existing seed; used to skip
//              re-extension when a candidate is that same alignment.
// pre_pivot, pre_start, pre_end - pivot and seed index range [pre_start,
//              pre_end) of an earlier call; seeds from that range ending
//              beyond the current pivot are copied instead of recomputed.
// seeds      - output array; new seeds are appended (presumably through the
//              __add_seed_one_pos / __check_add_seed macros, which also use
//              the local seeds_cap -- TODO confirm).
// Returns the pivot to pass to the next call: the seed_end of the last seed
// in the array if any were added, otherwise the last x examined.
int hyb_second_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, int seed_start, int seed_end, int read_start,
int read_end, uint64_t first_ref, int min_hits, int pre_pivot, int pre_start, int pre_end,
const int min_seed_len, HybSeedArr* seeds, int tid) {
uint64_t seeds_m = seeds->m;
uint64_t* seeds_cap = &seeds_m;
int pivot = (seed_start + seed_end) >> 1;
// Start at the pivot, but no earlier than the first position that can end
// a min_seed_len seed and no earlier than the previous pivot.
int x = MAX(MAX(pivot, read_start + min_seed_len - 1), pre_pivot);
int rx = 0;
Range fr = {read_start, read_end};
// Read window mirrored into reverse-strand coordinates.
Range rr = {read_seq->len - read_end, read_seq->len - read_start};
Range* read_range = &fr;
uint8_t type_hits = 0;
uint64_t offset = 0;
// Step applied to x when a lookup is rejected.
int extra_tend = MAX(0, min_seed_len - HYB_KMER_LEN) + 1;
int next_pivot = x;
int cur_left = 0;
int old_n = seeds->n;
int i = 0;
// PROF_START(seed_2);
#if 1
// Reuse seeds of the previous range that extend beyond the current pivot.
if (pre_end > pre_start && seeds->a[pre_end - 1].seed_end > pivot) {
for (i = pre_start; i < pre_end; ++i) {
HybSeed* seed = &kv_A(*seeds, i);
if (seed->seed_end > pivot) {
__check_add_seed(new_seed);
// Re-fetch the source pointer, presumably because __check_add_seed may
// reallocate the array -- TODO confirm.
seed = &kv_A(*seeds, i);
__copy_seed(*seed, *new_seed);
}
}
}
// PROF_END(tprof[T_SEED_2_0][tid], seed_2);
#endif
// Continue while the left end of the last candidate has not passed the pivot.
while (cur_left <= pivot && x < fr.end) {
next_pivot = x;
rx = read_seq->len - x - 1; // reverse position; the forward side includes x, hence the -1
// PROF_START(seed_2);
get_kmer_data(hyb, read_seq->back_bits, rx, &type_hits, &offset);
// PROF_END(tprof[T_SEED_2_0][tid], seed_2);
if (type_hits == 0) {
cur_left = x - HYB_KMER_LEN + 2;
x += extra_tend;
} else if (type_hits == 1) { // min_hits is certainly greater than 1
cur_left = x - HYB_KMER_LEN + 2;
x += extra_tend;
} else if (type_hits == 2) {
// PROF_START(seed_2);
if (min_hits > 2) {
// Two hits cannot satisfy min_hits: skip.
cur_left = x - HYB_KMER_LEN + 2;
x += extra_tend;
} else {
// Exactly two hits are acceptable: look the k-mer up in forward
// orientation and extend both candidates.
uint64_t ref_pos_arr[2];
int left_match_arr[2] = {0}, right_match_arr[2] = {0};
Range mr_arr[2] = {0};
int new_x = x - HYB_KMER_LEN + 1;
get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset);
ref_pos_arr[0] = hyb_sa_to_ref_pos(hyb->sa, offset);
ref_pos_arr[1] = hyb_sa_to_ref_pos(hyb->sa, offset + 1);
for (i = 0; i < 2; ++i) {
if (first_ref + new_x - seed_start == ref_pos_arr[i]) {
// Candidate is the existing seed: its extent is already known.
left_match_arr[i] = new_x - seed_start;
right_match_arr[i] = seed_end - new_x;
} else {
both_end_match(hyb, read_seq->len, &fr, read_seq->for_bits, read_seq->back_bits, new_x, HYB_KMER_LEN,
ref_pos_arr[i], &left_match_arr[i], &right_match_arr[i]);
}
_set_range(mr_arr[i], new_x - left_match_arr[i], new_x + right_match_arr[i]);
}
// The interval shared by both candidates is the two-hit seed.
Range sr = {MAX(mr_arr[0].start, mr_arr[1].start), MIN(mr_arr[0].end, mr_arr[1].end)};
if (sr.end - sr.start >= min_seed_len && sr.start <= pivot) {
__add_seed_one_pos(seed, ref_pos_arr[0] - new_x + sr.start, sr.start, sr.end);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1] - new_x + sr.start);
}
cur_left = sr.start;
x = MAX(x + 1, MAX(sr.end, cur_left + min_seed_len));
}
// PROF_END(tprof[T_SEED_2_1][tid], seed_2);
} else {
// PROF_START(seed_2);
if (type_hits <= HYB_HIT_THRESH && type_hits < min_hits) {
// Hit count is below min_hits: skip.
cur_left = x - HYB_KMER_LEN + 2;
x += extra_tend;
} else {
uint32_t seq_pos = rx + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
// Backward walk keeping at least min_hits hits.
// PROF_START(seed_2);
get_min_hits_node(hyb->index_data + offset, read_seq->back_bits, read_seq->rseq, rr.end, min_hits, &seq_pos,
&hits, &sa_start, tid);
// PROF_END(tprof[T_SEED_2_2_0][tid], seed_2);
// tdat[(seq_pos - rx - HYB_KMER_LEN + 2) / 3][tid]++;
// forward search
// Left end of the backward match in forward coordinates.
int new_x = x - (seq_pos - rx) + 1;
if (new_x <= pivot) {
// PROF_START(seed_2);
get_kmer_data(hyb, read_seq->for_bits, new_x, &type_hits, &offset);
// PROF_END(tprof[T_SEED_2_2_1][tid], seed_2);
if (type_hits == 2) {
// PROF_START(seed_2);
int right_match = 0;
int match_end[2] = {0};
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
for (i = 0; i < 2; ++i) {
if (first_ref + new_x - seed_start == ref_pos_arr[i]) {
right_match = seed_end - new_x;
} else {
right_end_match(hyb, read_seq->len, read_range, read_seq->for_bits, read_seq->back_bits, new_x,
HYB_KMER_LEN, ref_pos_arr[i], &right_match);
}
match_end[i] = new_x + right_match;
}
// Both hits must cover the seed, so it ends at the shorter match.
seq_pos = MIN(match_end[0], match_end[1]);
if (seq_pos - new_x >= min_seed_len) {
__add_seed_one_pos(seed, ref_pos_arr[0], new_x, seq_pos);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
}
// PROF_END(tprof[T_SEED_2_2_2][tid], seed_2);
} else {
// Forward walk keeping at least min_hits hits.
hits = type_hits;
seq_pos = new_x + HYB_KMER_LEN;
// PROF_START(seed_2);
get_min_hits_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, fr.end, min_hits,
&seq_pos, &hits, &sa_start, tid);
// PROF_END(tprof[T_SEED_2_2_0][tid], seed_2);
// tdat[(seq_pos - new_x - HYB_KMER_LEN + 2) / 3][tid]++;
if (seq_pos - new_x >= min_seed_len) {
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), new_x, seq_pos);
for (i = 1; i < hits; ++i) {
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
}
}
// PROF_END(tprof[T_SEED_2_2_3][tid], seed_2);
}
}
cur_left = new_x;
x = MAX(seq_pos, cur_left + min_seed_len);
// x = seq_pos;
}
// PROF_END(tprof[T_SEED_2_2][tid], seed_2);
}
}
if (old_n < seeds->n) {
// At least one seed was added: the next pivot is the end of the last one.
next_pivot = seeds->a[seeds->n - 1].seed_end;
}
// PROF_END(tprof[T_SEED_2_ALL][tid], seed_2);
return next_pivot;
}

View File

@ -0,0 +1,203 @@
#include "hyb_idx.h"
#include "profiling.h"
#define CALC_STAT 0
/*
 * Descend the hybrid-index node chain that starts at `idata`, following the read,
 * and stop at the node where the match has reached `seed_end` with fewer than
 * `max_hits` hits (or where the chain ends / at most one hit remains).
 *
 * On entry *seq_pos_p and *hits_p hold the state after the k-mer lookup; on
 * return *seq_pos_p, *hits_p and *sa_start_p describe the selected match.
 *
 * If the last parsed node overshoots `seed_end` while already below `max_hits`,
 * the node is re-parsed with parse_one_hyb_node_max_hits(), asking for only
 * `seed_end - <position before that node>` bases. That shorter result is kept
 * unless it has `max_hits` or more hits, in which case the full-node result is
 * restored.
 */
static void get_seed_end_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int seed_end,
                              uint32_t* seq_pos_p, uint32_t* hits_p, uint64_t* sa_start_p, int tid) {
    uint8_t cmp_flag = 0;
    uint8_t* cmp_ref = &cmp_flag;
    int on_head_node = 1;

    /* Snapshot of the state as it was before the most recently parsed node. */
    uint8_t* before_addr = idata;
    uint32_t before_seq_pos = *seq_pos_p;
    uint32_t before_hits = *hits_p;
    uint64_t before_sa_start = *sa_start_p;

    uint8_t* node = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
#if CALC_STAT
    if (node != NULL) {
        /* Histogram of the byte distance between consecutive nodes. */
        uint64_t dist = node - before_addr;
        gdat[dist < 32 ? 0 : dist < 64 ? 1 : dist < 128 ? 2 : 3]++;
    }
#endif

    for (;;) {
        if (node == NULL || *hits_p <= 1)
            break;
        /* Keep descending while the seed end is not reached or there are still too many hits. */
        if (!(*seq_pos_p < seed_end || *hits_p >= max_hits))
            break;

        before_addr = node;
        before_seq_pos = *seq_pos_p;
        before_hits = *hits_p;
        before_sa_start = *sa_start_p;

        node = parse_one_hyb_node(node, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
        on_head_node = 0;
#if CALC_STAT
        if (node != NULL) {
            uint64_t dist = node - before_addr;
            gdat[dist < 32 ? 0 : dist < 64 ? 1 : dist < 128 ? 2 : 3]++;
        }
#endif
    }

    /* Nothing to refine unless the match ran past seed_end with an acceptable hit count. */
    if (!(*seq_pos_p > seed_end && *hits_p < max_hits))
        return;

    /* Remember the full-node result so it can be restored below. */
    const uint32_t full_seq_pos = *seq_pos_p;
    const uint32_t full_hits = *hits_p;
    const uint64_t full_sa_start = *sa_start_p;

    /* Re-examine the previous node, this time matching only part of it. */
    *seq_pos_p = before_seq_pos;
    *hits_p = before_hits;
    *sa_start_p = before_sa_start;
    parse_one_hyb_node_max_hits(before_addr, seq_bits, seq_bp, seq_end, max_hits, seed_end - before_seq_pos, on_head_node,
                                seq_pos_p, sa_start_p, hits_p, tid);

    if (*hits_p >= max_hits) {
        /* The shorter match is too repetitive: fall back to the full-node result. */
        *seq_pos_p = full_seq_pos;
        *hits_p = full_hits;
        *sa_start_p = full_sa_start;
    }
}
// Third seeding pass: re-seed inside regions already covered by existing seeds,
// using windows of (min_seed_len + 1) bases, and append the new seeds to `seeds`.
//
// hyb          hybrid index (k-mer table, node data, SA, packed reference)
// read_seq     the read in its packed/unpacked forms (for_bits, back_bits, seq, len)
// read_range   portion of the read that is scanned, [start, end)
// seeds_range  index range [start, end) of the existing seeds in `seeds` that guide this pass
// min_seed_len minimum seed length; windows here are min_seed_len + 1 long
// max_hits     matches with max_hits or more occurrences are not emitted
// seeds        in/out seed array
// tid          thread id, forwarded to the node parsers
//
// Assumes max_hits > 2.
// NOTE(review): __add_seed_one_pos is a macro defined outside this view; it presumably
// appends a new seed (bound to the name given as its first argument) using `seeds`
// and `seeds_cap` - confirm against its definition.
void hyb_third_seeding(const HybridIndex* hyb, const ReadSeq* read_seq, const Range* read_range, const Range* seeds_range,
const int min_seed_len, const int max_hits, HybSeedArr* seeds, int tid) {
// Nothing to do when there are no guiding seeds.
if (seeds_range->start == seeds_range->end) {
return;
}
uint64_t seeds_m = seeds->m;
uint64_t* seeds_cap = &seeds_m;
int new_seed_len = min_seed_len + 1;
int i = 0;
int right_match_arr[2] = {0};
Range ff = *read_range;
uint8_t type_hits = 0;
uint64_t offset = 0;
int seeds_i = seeds_range->start;
int x = read_range->start;
int x_end = x + new_seed_len;
int flag_found_x_end = 0;
int flag_i = 0;
// PROF_START(seed_3);
// Special case for the first guiding seed: if it starts at x, reaches x_end, has a
// non-zero first_len shorter than the window and at most two reference positions,
// emit the window [x, x_end) directly from its reference positions.
HybSeed s = kv_A(*seeds, seeds_i);
if (s.first_len > 0 && s.first_len < new_seed_len && s.seed_start == x && s.seed_end >= x_end && s.ref_pos_arr.n <= 2) {
__add_seed_one_pos(seed, s.ref_pos_arr.a[0], x, x_end);
if (s.ref_pos_arr.n == 2)
kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1]);
x = x_end;
}
while (x + min_seed_len < read_range->end) {
// Skip guiding seeds that end before x.
while (seeds_i < seeds_range->end && kv_A(*seeds, seeds_i).seed_end < x) ++seeds_i;
if (seeds_i == seeds_range->end)
break;
// x is not covered by the next guiding seed: jump one window ahead.
if (seeds->a[seeds_i].seed_start > x) {
x += new_seed_len;
continue;
}
x_end = x + new_seed_len;
flag_found_x_end = 0;
flag_i = 0;
// Look for a guiding seed that covers the whole window [x, x_end].
for (i = seeds_i; i < seeds_range->end; ++i) {
HybSeed* s = &kv_A(*seeds, i);
if (s->seed_start >= x_end)
break;
if (s->seed_start <= x && s->seed_end >= x_end) {
flag_found_x_end = 1; // a guiding seed spans x_end
flag_i = i;
break;
}
}
if (!flag_found_x_end) {
x = x_end;
continue;
}
// PROF_START(seed_3);
// Look up the k-mer starting at x; type_hits selects the handling below.
get_kmer_data(hyb, read_seq->for_bits, x, &type_hits, &offset);
// PROF_END(tprof[T_SEED_3_0][tid], seed_3);
if (type_hits == 0) {
x += new_seed_len;
} else if (type_hits == 1) {
// PROF_START(seed_3);
// Single hit: `offset` is passed as the reference position itself.
__add_seed_one_pos(seed, offset, x, x_end);
x = x_end;
// PROF_END(tprof[T_SEED_3_1][tid], seed_3);
} else if (type_hits == 2) {
// PROF_START(seed_3);
HybSeed s = kv_A(*seeds, flag_i);
if (s.ref_pos_arr.n == 2) {
// The covering seed already has both positions: shift them to x.
__add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end);
kv_push(uint64_t, seed->ref_pos_arr, s.ref_pos_arr.a[1] + x - s.seed_start);
} else { // the covering seed has a single ref_pos
ff.end = x_end;
// The two candidate positions are consecutive SA rows starting at `offset`.
uint64_t ref_pos_arr[2] = {hyb_sa_to_ref_pos(hyb->sa, offset), hyb_sa_to_ref_pos(hyb->sa, offset + 1)};
for (i = 0; i < 2; ++i) {
if (s.ref_pos_arr.a[0] + x - s.seed_start == ref_pos_arr[i]) {
// Same locus as the covering seed: its match length is already known.
right_match_arr[i] = MIN(s.seed_end - x, new_seed_len);
} else {
right_end_match(hyb, read_seq->len, &ff, read_seq->for_bits, read_seq->back_bits, x, HYB_KMER_LEN,
ref_pos_arr[i], &right_match_arr[i]);
}
}
// Emit only the candidates that match the full window.
if (right_match_arr[0] == right_match_arr[1]) {
if (right_match_arr[0] == new_seed_len) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, x_end);
kv_push(uint64_t, seed->ref_pos_arr, ref_pos_arr[1]);
}
} else {
if (right_match_arr[0] == new_seed_len) {
__add_seed_one_pos(seed, ref_pos_arr[0], x, x_end);
} else if (right_match_arr[1] == new_seed_len) {
__add_seed_one_pos(seed, ref_pos_arr[1], x, x_end);
}
}
}
x = x_end;
// PROF_END(tprof[T_SEED_3_2][tid], seed_3);
} else {
// More than two hits: walk the index nodes to extend/narrow the match.
uint32_t seq_pos = x + HYB_KMER_LEN;
uint32_t hits = type_hits;
uint64_t sa_start = 0;
// PROF_START(seed_3);
get_seed_end_node(hyb->index_data + offset, read_seq->for_bits, read_seq->seq, read_range->end, max_hits,
x + new_seed_len, &seq_pos, &hits, &sa_start, tid);
// PROF_END(tprof[T_SEED_3_3_0][tid], seed_3);
// tdat[(seq_pos - x - HYB_KMER_LEN + 2) / 3][tid]++;
if (seq_pos - x < new_seed_len) {
// PROF_START(seed_3);
// The index walk stopped short of the window: reuse the covering seed's first position.
HybSeed s = kv_A(*seeds, flag_i);
__add_seed_one_pos(seed, s.ref_pos_arr.a[0] + x - s.seed_start, x, x_end);
x = x_end;
// PROF_END(tprof[T_SEED_3_3_1][tid], seed_3);
} else {
// PROF_START(seed_3);
if (hits < max_hits) {
// Emit [x, seq_pos) with one reference position per SA row in [sa_start, sa_start + hits).
__add_seed_one_pos(seed, hyb_sa_to_ref_pos(hyb->sa, sa_start), x, seq_pos);
int i = 0; // shadows the function-level i
for (i = 1; i < hits; ++i) {
kv_push(uint64_t, seed->ref_pos_arr, hyb_sa_to_ref_pos(hyb->sa, sa_start + i));
}
x = seq_pos;
} else {
// Too repetitive: emit nothing and restart one base past the match end.
x = seq_pos + 1;
}
// PROF_END(tprof[T_SEED_3_3_2][tid], seed_3);
}
// PROF_END(tprof[T_SEED_3_3][tid], seed_3);
}
}
// PROF_END(tprof[T_SEED_3_ALL][tid], seed_3);
}

View File

@ -0,0 +1,817 @@
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
#include <sys/stat.h>
#include "hyb_idx.h"
#include "share_mem.h"
#include "utils.h"
/////////////////////////////////////////////////////
// Utility functions for working with the hybrid index
// Load the hybrid index whose files share the path prefix `prefix`.
//
// Reads the reference length from "<prefix>.ref-len" and obtains the packed
// reference (.hybrid.pac), the byte-packed suffix array (.hybrid.sa), the k-mer
// table (.hybrid.kmer) and the node data (.hybrid.data) through shared memory.
//
// Returns a newly allocated HybridIndex, or NULL if the bookkeeping allocations
// fail. File-level errors are handled by the xopen/err_* helpers.
HybridIndex* load_hybrid_idx(const char* prefix) {
    HybridIndex* hyb = (HybridIndex*)calloc(1, sizeof(HybridIndex));
    // Room for the prefix, the longest suffix used below and the terminating NUL.
    const size_t fn_cap = strlen(prefix) + 30;
    char* fn = (char*)malloc(fn_cap);
    FILE* fp = NULL;
    struct stat st;
    (void)st;  // only used by the file-based loader (__load_hybrid_idx_code)
    if (hyb == NULL || fn == NULL) {
        free(hyb);
        free(fn);
        return NULL;
    }
    // File-based loader: read the whole file "<prefix><suffix>" into a malloc'ed buffer.
#define __load_hybrid_idx_code(suffix, data) \
    sprintf(fn, "%s%s", prefix, suffix);     \
    err_check_true(stat(fn, &st), 0);        \
    fp = xopen(fn, "r");                     \
    data = (uint8_t*)malloc(st.st_size);     \
    err_fread_noeof(data, 1, st.st_size, fp); \
    err_fclose(fp);
    // load ref-len
    snprintf(fn, fn_cap, "%s.ref-len", prefix);
    // fprintf(stderr, "fn: %s\n", fn);
    fp = xopen(fn, "r");
    err_check_false(fscanf(fp, "%ld", &hyb->ref_len), EOF);
    err_fclose(fp);
    // fprintf(stderr, "ref-len: %ld\n", hyb->ref_len);
    const char* kmer_suffix = ".hybrid.kmer";
    const char* data_suffix = ".hybrid.data";
    // const char *kmer_suffix = ".hybrid.learned.kmer";
    // const char *data_suffix = ".hybrid.learned.data";
#if 0
    // shm_clear_hyb();
    // load 2-bit ref
    __load_hybrid_idx_code(".hybrid.pac", hyb->ref_bits);
    // load hyb byte-sa
    __load_hybrid_idx_code(".hybrid.sa", hyb->sa);
    // load hyb kmer data
    __load_hybrid_idx_code(kmer_suffix, hyb->kmer_data);
    // load hyb index data
    __load_hybrid_idx_code(data_suffix, hyb->index_data);
#else
    shm_keep_hyb(prefix);
    // load 2-bit ref
    snprintf(fn, fn_cap, "%s%s", prefix, ".hybrid.pac");
    hyb->ref_bits = (uint8_t*)shm_get_index(fn);
    // load hyb byte-sa
    snprintf(fn, fn_cap, "%s%s", prefix, ".hybrid.sa");
    hyb->sa = (uint8_t*)shm_get_index(fn);
    // load hyb kmer data
    snprintf(fn, fn_cap, "%s%s", prefix, kmer_suffix);
    hyb->kmer_data = (uint8_t*)shm_get_index(fn);
    // load hyb index data
    snprintf(fn, fn_cap, "%s%s", prefix, data_suffix);
    hyb->index_data = (uint8_t*)shm_get_index(fn);
#endif
    // The file-name buffer is scratch space only; release it (it used to leak).
    free(fn);
    return hyb;
}
// Build the 2-bit packed forward strand (fs) and reverse-complement strand (rs)
// of a read.
//
// bs  : one base code per byte; only the low two bits of each byte are used
// len : number of bases; nothing is written when len <= 0
// fs  : receives base i in bits 2*(i%4) of byte i/4
// rs  : receives the complement (3 - code) of base len-1-i at the same bit position
//
// Exactly len/4 + 1 bytes are written to each output: the full bytes followed by
// one byte holding the remaining len%4 bases (a zero byte when len is a multiple
// of four).
void create_seq_fb_bits(uint8_t* bs, int len, uint8_t* fs, uint8_t* rs) {
    if (len <= 0)
        return;

    const int full_bytes = len / 4;

    // Bytes that hold four bases each.
    for (int byte_idx = 0; byte_idx < full_bytes; ++byte_idx) {
        uint8_t fwd = 0;
        uint8_t rev = 0;
        for (int slot = 0; slot < 4; ++slot) {
            const int pos = byte_idx * 4 + slot;
            fwd |= (uint8_t)((bs[pos] & 3) << (slot * 2));
            rev |= (uint8_t)((3 - (bs[len - 1 - pos] & 3)) << (slot * 2));
        }
        fs[byte_idx] = fwd;
        rs[byte_idx] = rev;
    }

    // Trailing byte with the 0..3 leftover bases; always written.
    uint8_t fwd_tail = 0;
    uint8_t rev_tail = 0;
    int slot = 0;
    for (int pos = full_bytes * 4; pos < len; ++pos, ++slot) {
        fwd_tail |= (uint8_t)((bs[pos] & 3) << (slot * 2));
        rev_tail |= (uint8_t)((3 - (bs[len - 1 - pos] & 3)) << (slot * 2));
    }
    fs[full_bytes] = fwd_tail;
    rs[full_bytes] = rev_tail;
}
// Compare seq against ref in the forward direction and return how many consecutive
// bases match. Both seq and ref are 2-bit packed (four bases per byte).
//
// seq, seq_pos, seq_end : packed read, first base to compare, end of the range (exclusive)
// ref, ref_pos, ref_len : packed reference, first base to compare, reference length in bases
//
// The result never exceeds MIN(ref_len - ref_pos, seq_end - seq_pos); 0 is returned
// when seq_pos >= seq_end.
//
// Bases are compared 32 at a time by XOR-ing two 64-bit words; __builtin_ctzll on the
// XOR locates the first differing base.
// NOTE(review): the 64-bit loads are unaligned and can touch bytes past the compared
// range, and the ctz-based ordering presumably relies on a little-endian host -
// confirm that the buffers are padded accordingly.
inline int forward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_end, uint8_t* ref, int64_t ref_pos, int64_t ref_len) {
if (seq_pos >= seq_end)
return 0;
int64_t max_match_len = MIN(ref_len - ref_pos, seq_end - seq_pos);
// Position of the first base inside its byte (0..3) for ref and seq.
int ref_odd = ref_pos & 3;
int seq_odd = seq_pos & 3;
int64_t i = seq_pos;
int64_t j = ref_pos;
int match_len = 0;
/////////////
// Shared comparison body.
//   first_len            number of bases covered by the first (partial) comparison
//   first_ref/first_seq  expressions yielding the first pair of 64-bit words
//   ref_bits/seq_bits    expressions yielding the following 32-base words at i / j
// Returns from the enclosing function as soon as a mismatch is found.
#define __forward_match_code(first_len, first_ref, first_seq, ref_bits, seq_bits) \
uint64_t bp32ref = first_ref; \
uint64_t bp32seq = first_seq; \
uint64_t cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(__builtin_ctzll(cmp) >> 1, max_match_len); \
int first_cmp_len = first_len; \
match_len = MIN(first_cmp_len, max_match_len); \
i += first_cmp_len; \
j += first_cmp_len; \
seq_odd = i & 3; \
ref_odd = j & 3; \
for (; i + 31 < seq_end; i += 32, j += 32, match_len += 32) { \
bp32ref = ref_bits; \
bp32seq = seq_bits; \
cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(match_len + (__builtin_ctzll(cmp) >> 1), max_match_len); \
} \
if (i < seq_end) { \
bp32ref = ref_bits; \
bp32seq = seq_bits; \
cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(match_len + MIN(__builtin_ctzll(cmp) >> 1, seq_end - i), max_match_len); \
match_len = max_match_len; /*match_len += seq_end - i;*/ \
}
/////////
if (seq_odd < ref_odd) { // first comparison brings ref to a byte boundary
__forward_match_code(32 - ref_odd, (*(uint64_t*)&ref[j >> 2]) >> (ref_odd << 1),
(*(uint64_t*)&seq[i >> 2]) << ((ref_odd - seq_odd) << 1) >> (ref_odd << 1),
(*(uint64_t*)&ref[j >> 2]),
seq[i >> 2] >> (seq_odd << 1) | (*(uint64_t*)&seq[(i >> 2) + 1]) << ((4 - seq_odd) << 1));
} else if (seq_odd > ref_odd) { // first comparison brings seq to a byte boundary
__forward_match_code(32 - seq_odd, (*(uint64_t*)&ref[j >> 2]) << ((seq_odd - ref_odd) << 1) >> (seq_odd << 1),
(*(uint64_t*)&seq[i >> 2]) >> (seq_odd << 1),
ref[j >> 2] >> (ref_odd << 1) | (*(uint64_t*)&ref[(j >> 2) + 1]) << ((4 - ref_odd) << 1),
(*(uint64_t*)&seq[i >> 2]));
} else { // same in-byte offset: both reach a byte boundary together and can be compared word to word
__forward_match_code(32 - seq_odd, (*(uint64_t*)&ref[j >> 2]) >> (seq_odd << 1),
(*(uint64_t*)&seq[i >> 2]) >> (seq_odd << 1), (*(uint64_t*)&ref[j >> 2]),
(*(uint64_t*)&seq[i >> 2]));
}
return MIN(match_len, max_match_len);
}
// Compare seq against ref in the backward direction (towards lower positions) and
// return how many consecutive bases match. Both are 2-bit packed.
//
// seq, seq_pos, seq_start : packed read, first (highest) base to compare, lowest base allowed
// ref, ref_pos            : packed reference and the reference base aligned with seq_pos
//
// The result never exceeds MIN(ref_pos + 1, seq_pos - seq_start + 1); 0 is returned
// when seq_pos < seq_start.
//
// Bases are compared 32 at a time; __builtin_clzll on the XOR of two 64-bit words
// locates the first differing base when walking downwards.
// NOTE(review): the 64-bit loads are unaligned and some start before the byte that
// holds the current base (e.g. `ref + (j >> 2) - 8`); presumably the buffers are
// laid out so these reads stay in bounds - confirm.
inline int backward_match_len(uint8_t* seq, int64_t seq_pos, int64_t seq_start, uint8_t* ref, int64_t ref_pos) {
if (seq_pos < seq_start)
return 0;
int64_t max_match_len = MIN(ref_pos + 1, seq_pos - seq_start + 1);
int64_t i = seq_pos;
int64_t j = ref_pos;
// Number of bases above the current one inside its byte (0..3).
int seq_odd = 3 - (i & 3);
int ref_odd = 3 - (j & 3);
int match_len = 0;
/////////////
// Final comparison for the bases seq[0..i] (fewer than 32 of them): the read word is
// taken from the start of seq and shifted up, the reference word is assembled
// from the bytes ending at j. `last_code` runs when no mismatch is found.
#define __backward_tail_code(last_code) \
int ext_bp = (7 - (i >> 2)) << 2; \
uint64_t bp32ref = *(uint64_t*)(ref + (j >> 2) - 8) >> ((4 - ref_odd) << 1) | (uint64_t)ref[j >> 2] \
<< ((ref_odd + 28) << 1); \
uint64_t bp32seq = (*(uint64_t*)seq) << ((seq_odd + ext_bp) << 1); \
uint64_t cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(match_len + MIN(__builtin_clzll(cmp) >> 1, (int)i + 1 - seq_start), max_match_len); \
last_code
// Shared comparison body: a first partial comparison of `first_len` bases, then
// whole 32-base words while at least 32 bases remain in seq, then the tail code.
// Returns from the enclosing function as soon as a mismatch is found.
#define __backward_match_code(first_len, first_ref, first_seq, ref_bits, seq_bits) \
uint64_t bp32ref = first_ref; \
uint64_t bp32seq = first_seq; \
uint64_t cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(MIN(__builtin_clzll(cmp) >> 1, (int)i + 1 - seq_start), max_match_len); \
int first_cmp_len = first_len; \
match_len = MIN(first_cmp_len, max_match_len); \
i -= first_cmp_len; \
j -= first_cmp_len; \
seq_odd = 3 - (i & 3); \
ref_odd = 3 - (j & 3); \
for (; i - 31 >= 0; i -= 32, j -= 32, match_len += 32) { \
bp32ref = ref_bits; \
bp32seq = seq_bits; \
cmp = bp32ref ^ bp32seq; \
if (cmp > 0) \
return MIN(match_len + (__builtin_clzll(cmp) >> 1), max_match_len); \
} \
if (i >= seq_start) { \
__backward_tail_code(match_len = max_match_len); \
}
////////////
if (i < 32) { // a single comparison is enough
__backward_tail_code(return max_match_len);
}
if (seq_odd < ref_odd) { // first comparison brings ref to a byte boundary
__backward_match_code(
32 - ref_odd, (*(uint64_t*)&ref[(j >> 2) - 7]) << (ref_odd << 1),
(*(uint64_t*)&seq[(i >> 2) - 7]) >> ((ref_odd - seq_odd) << 1) << (ref_odd << 1), (*(uint64_t*)&ref[(j >> 2) - 7]),
(*(uint64_t*)&seq[(i >> 2) - 8] >> ((4 - seq_odd) << 1)) | ((uint64_t)seq[(i >> 2)] << ((seq_odd + 28) << 1)));
} else if (seq_odd > ref_odd) { // first comparison brings seq to a byte boundary
__backward_match_code(
32 - seq_odd, (*(uint64_t*)&ref[(j >> 2) - 7]) >> ((seq_odd - ref_odd) << 1) << (seq_odd << 1),
(*(uint64_t*)&seq[(i >> 2) - 7]) << (seq_odd << 1),
(*(uint64_t*)&ref[(j >> 2) - 8] >> ((4 - ref_odd) << 1)) | ((uint64_t)ref[(j >> 2)] << ((ref_odd + 28) << 1)),
(*(uint64_t*)&seq[(i >> 2) - 7]));
} else { // same in-byte offset: both can be compared word to word
__backward_match_code(32 - seq_odd, (*(uint64_t*)&ref[(j >> 2) - 7]) << (seq_odd << 1),
(*(uint64_t*)&seq[(i >> 2) - 7]) << (seq_odd << 1), (*(uint64_t*)&ref[(j >> 2) - 7]),
(*(uint64_t*)&seq[(i >> 2) - 7]));
}
return MIN(match_len, max_match_len);
}
// Return the reference position stored in suffix-array row `row`.
//
// sa_arr is a bit-packed array of 33-bit records: row r occupies bits
// [33*r, 33*r + 33) of the byte stream, least-significant bit first
// (little-endian byte order).
//
// A record starts at bit (row & 7) of its first byte (33 == 1 mod 8), so it spans
// at most 7 + 33 = 40 bits, i.e. five bytes. The bytes are combined explicitly,
// which avoids a misaligned 64-bit load through a cast pointer, does not read past
// the fifth byte of the last record, and does not depend on host endianness.
uint64_t hyb_sa_to_ref_pos(uint8_t* sa_arr, uint64_t row) {
    const uint64_t start_byte = ((row << 5) + row) >> 3;  // (row * 33) / 8
    const uint8_t* rec = sa_arr + start_byte;
    const uint64_t packed = (uint64_t)rec[0] | ((uint64_t)rec[1] << 8) | ((uint64_t)rec[2] << 16) |
                            ((uint64_t)rec[3] << 24) | ((uint64_t)rec[4] << 32);
    return (packed >> (row & 7)) & ((1ULL << 33) - 1);  // keep the 33-bit record
}
// Common prologue for the node parsers: sets *cmp_ref to 1, caches *seq_pos_p in
// `seq_pos`, reads the one-byte node header at idata (advancing idata past it) and
// decodes its fields:
//   bits 7..6  node_type
//   bit  5     hits_neq
//   bits 4..3  hits_bytes - 1
//   bits 2..0  off_bytes
// child_ptr_bytes is hits_bytes + off_bytes.
// Expects `cmp_ref` and `seq_pos_p` to be in scope at the expansion site.
#define __parse_node_start_no_addr(idata) \
*cmp_ref = 1; \
uint32_t seq_pos = *seq_pos_p; \
uint8_t header = *idata; \
idata += 1; \
uint8_t node_type = (header >> 6) & 3; \
uint8_t hits_neq = header >> 5 & 1; \
uint32_t hits_bytes = ((header >> 3) & 3) + 1; \
uint32_t off_bytes = header & 7; \
uint32_t child_ptr_bytes = hits_bytes + off_bytes;
// Node-parsing prologue that also declares `addr` (address of the next node,
// initially NULL) before decoding the node header.
#define __parse_node_start_code(idata) \
uint8_t* addr = NULL; \
__parse_node_start_no_addr(idata)
// Parse a path node (a node holding a single run of bases).
// The path length is 9 bits: bit 0 of the header is the high bit and the next byte
// holds the low eight bits. The packed path bases follow and are compared with the
// read via forward_match_len(); *seq_pos_p advances by the matched length.
// If the whole path matches, `addr` is set just past the packed path (and, when
// hits_neq is set, *sa_start_p is incremented and *hits_p decremented);
// otherwise *cmp_ref is cleared and `addr` keeps its previous value.
#define __parse_path_node_code(path_len) \
uint32_t path_len = (header & 1); \
path_len = path_len << 8 | *idata; \
idata += 1; \
int match_len = forward_match_len(seq_bits, seq_pos, seq_end, idata, 0, path_len); \
*seq_pos_p = seq_pos + match_len; \
if (match_len == (int)path_len) { \
addr = idata + (((path_len << 1) + 7) >> 3); \
if (hits_neq) { \
*sa_start_p += 1; \
*hits_p -= 1; \
} \
} else \
*cmp_ref = 0;
// Parse a regular (branching) node when the full k-mer of the node is available.
//   kmer_len    number of bases consumed by the node
//   mark_bytes  size in bytes of the child bitmap
//   int_type    integer type used to read the bitmap
//   kmer_code   expression yielding the k-mer value (bit index into the bitmap)
//   bits_count  popcount function matching int_type
//   one         constant 1 of a type wide enough for the shifts
// `mark` is the child bitmap; `child_num` is non-zero when the k-mer's bit is set.
// If the child exists: *seq_pos_p advances by kmer_len, nth_child is the number of
// children before it and has_next_child the number of children after it.
//   - leaf node (off_bytes == HYB_LEAF_NODE): *hits_p and *sa_start_p are adjusted
//     by those counts and hits_neq; `addr` is left unchanged.
//   - inner node: the hit boundary (and, for nth_child > 0, the child offset) are
//     read from the table after the bitmap to update *hits_p, *sa_start_p and `addr`.
// *cmp_ref is cleared when the child is missing or *seq_pos_p >= HYB_MAX_SEQ_LEN.
#define __parse_child_node_code(kmer_len, mark_bytes, int_type, kmer_code, bits_count, one) \
uint8_t kmer = kmer_code; \
int_type mark = *(int_type*)idata; \
int_type child_num = mark & (one << kmer); \
if (child_num) { \
*seq_pos_p += kmer_len; \
uint32_t nth_child = bits_count(mark & ((one << kmer) - 1)); \
uint8_t has_next_child = bits_count(mark >> kmer >> 1); \
if (*seq_pos_p >= HYB_MAX_SEQ_LEN) { \
*cmp_ref = 0; \
} \
if (off_bytes == HYB_LEAF_NODE) { \
*hits_p -= nth_child + hits_neq + has_next_child; \
*sa_start_p += nth_child + hits_neq; \
} else { \
if (nth_child == 0) { \
idata += mark_bytes; \
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; \
addr = idata + has_next_child * child_ptr_bytes; \
*hits_p = hits_start - hits_neq; \
*sa_start_p += hits_neq; \
} else { \
idata += mark_bytes + (nth_child - 1) * child_ptr_bytes; \
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; \
uint32_t child_offset = *(uint32_t*)(idata + hits_bytes) & ga_hybOffMask[off_bytes]; \
addr = idata + child_offset + (has_next_child + 1) * child_ptr_bytes; \
if (has_next_child) { \
*hits_p = (*(uint32_t*)(idata + child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start; \
} else { \
*hits_p -= hits_start + hits_neq; \
} \
*sa_start_p += hits_start; \
} \
} \
} else { \
*cmp_ref = 0; \
}
// When a node cannot be matched in full, check whether a prefix of its k-mer matches.
//   kmer_len        number of bases of the prefix being tested
//   mark_bytes      size in bytes of the child bitmap
//   int_type        integer type used to read the bitmap
//   kmer_base_code  expression yielding the first bitmap index sharing the prefix
//   bits_range      number of consecutive bitmap indices sharing the prefix
//   bits_count      popcount function matching int_type
//   one             constant 1 of a type wide enough for the shifts
// `child_num` is non-zero when at least one child shares the prefix (inside the
// inner-node branches it is overwritten with the count of such children). In that
// case *seq_pos_p advances by kmer_len and *hits_p / *sa_start_p are narrowed to
// the hits of all children sharing the prefix.
// `addr` is never set and *cmp_ref is always cleared, so the walk ends here.
#define __parse_part_node_code(kmer_len, mark_bytes, int_type, kmer_base_code, bits_range, bits_count, one) \
uint8_t kmer_base = kmer_base_code; \
int_type mark = *(int_type*)idata; \
int_type kmer_mask = ((one << bits_range) - 1) << kmer_base; \
int_type child_num = mark & kmer_mask; \
if (child_num) { \
*seq_pos_p += kmer_len; \
int_type kmer_pre_mask = (one << kmer_base) - 1; \
uint32_t nth_child = bits_count(mark & kmer_pre_mask); \
uint8_t has_next_child = bits_count(mark >> kmer_base >> bits_range); \
if (off_bytes == HYB_LEAF_NODE) { \
*hits_p -= nth_child + hits_neq + has_next_child; \
*sa_start_p += nth_child + hits_neq; \
} else { \
if (nth_child == 0) { \
idata += mark_bytes; \
uint32_t hits_start = hits_neq; \
if (has_next_child) { \
child_num = bits_count(child_num); \
*hits_p = \
(*(uint32_t*)(idata + (child_num - 1) * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start; \
} else { \
*hits_p -= hits_start; \
} \
*sa_start_p += hits_start; \
} else { \
idata += mark_bytes + (nth_child - 1) * child_ptr_bytes; \
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes]; \
*sa_start_p += hits_start; \
if (has_next_child) { \
child_num = bits_count(child_num); \
*hits_p = (*(uint32_t*)(idata + child_num * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start; \
} else { \
*hits_p -= hits_start + hits_neq; \
} \
} \
} \
} \
*cmp_ref = 0;
// Main node-parsing dispatch, selected by node_type:
//   HYB_BP_PATH  path node
//   HYB_BP_1     one base per step
//   HYB_BP_2     two bases per step, falling back to a one-base prefix
//   otherwise    three bases per step, falling back to two- and one-base prefixes
// The fallbacks are also used when fewer bases than the node width remain before
// seq_end. `return_code` is emitted at the end (e.g. `return addr`).
// The functions below carry hand-expanded copies of this logic; their calls to
// this macro are commented out.
#define __parse_hyb_node_code(return_code) \
if (node_type == HYB_BP_PATH) { \
__parse_path_node_code(path_len); \
} else if (node_type == HYB_BP_1) { \
__parse_child_node_code(1, 1, uint8_t, seq_bp[seq_pos], __builtin_popcount, 1); \
} else if (node_type == HYB_BP_2) { \
if (seq_pos + 1 < seq_end) { \
__parse_child_node_code(2, 2, uint16_t, seq_bp[seq_pos] << 2 | seq_bp[seq_pos + 1], __builtin_popcount, 1); \
if (!child_num) { \
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); \
} \
} else { \
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1); \
} \
} else { \
if (seq_pos + 2 < seq_end) { \
__parse_child_node_code(3, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2], \
__builtin_popcountll, 1ULL); \
if (!child_num) { \
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, \
__builtin_popcountll, 1ULL); \
if (!child_num) { \
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); \
} \
} \
} else if (seq_pos + 1 < seq_end) { \
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll, \
1ULL); \
if (!child_num) { \
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); \
} \
} else { \
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL); \
} \
} \
/*__builtin_prefetch(addr, 0, 3); */ \
return_code
/////////
// Parse the first (head) node of a k-mer's node chain and return the address of
// the node that follows, or NULL when the walk cannot continue (read exhausted,
// leaf reached, or only a partial / no match).
//
// idata      start of the head node
// seq_bits   2-bit packed read (used for path nodes)
// seq_bp     read with one base code per byte (used to form k-mers)
// seq_end    end of the usable read range (exclusive)
// seq_pos_p  in/out: current read position; advanced by the matched bases
// sa_start_p out: first suffix-array row of the current match
// hits_p     in/out: number of hits of the current match
// cmp_ref    out: set to 1 at entry, cleared when the node is not matched in full
// tid        thread id (unused in this function)
//
// Unlike later nodes, the head node carries the SA start row in the five bytes
// after the header and, when the incoming *hits_p exceeds HYB_HIT_THRESH, a hit
// count of hits_bytes bytes after that.
uint8_t* parse_first_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid) {
if (*seq_pos_p == seq_end)
return NULL;
__parse_node_start_code(idata);
*sa_start_p = (*(uint64_t*)idata) & HYB_NODE_SA_MASK;
idata += 5;
if (*hits_p > HYB_HIT_THRESH) { // refresh hits from the node
*hits_p = *((uint32_t*)idata) & ga_hybHitsMask[hits_bytes]; // hit count
idata += hits_bytes;
}
// __parse_hyb_node_code(return addr);
if (node_type == HYB_BP_PATH) {
__parse_path_node_code(path_len);
} else if (node_type == HYB_BP_1) {
__parse_child_node_code(1, 1, uint8_t, seq_bp[seq_pos], __builtin_popcount, 1);
} else if (node_type == HYB_BP_2) {
if (seq_pos + 1 < seq_end) {
__parse_child_node_code(2, 2, uint16_t, seq_bp[seq_pos] << 2 | seq_bp[seq_pos + 1], __builtin_popcount, 1);
if (!child_num) {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
}
} else {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
}
} else {
// Three-base node. The block below is the hand-expanded form of the macro
// invocation kept in the comment that follows.
if (seq_pos + 2 < seq_end) {
//__parse_child_node_code(3, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2],
// __builtin_popcountll, 1ULL);
uint8_t kmer = seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2];
uint64_t mark = *(uint64_t*)idata;
uint64_t child_num = mark & (1ULL << kmer);
if (child_num) {
*seq_pos_p += 3;
uint32_t nth_child = __builtin_popcountll(mark & ((1ULL << kmer) - 1));
uint8_t has_next_child = __builtin_popcountll(mark >> kmer >> 1);
if (*seq_pos_p >= HYB_MAX_SEQ_LEN) {
*cmp_ref = 0;
}
if (off_bytes == HYB_LEAF_NODE) {
*hits_p -= nth_child + hits_neq + has_next_child;
*sa_start_p += nth_child + hits_neq;
} else {
if (nth_child == 0) {
idata += 8;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
addr = idata + has_next_child * child_ptr_bytes;
*hits_p = hits_start - hits_neq;
*sa_start_p += hits_neq;
} else {
idata += 8 + (nth_child - 1) * child_ptr_bytes;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
uint32_t child_offset = *(uint32_t*)(idata + hits_bytes) & ga_hybOffMask[off_bytes];
addr = idata + child_offset + (has_next_child + 1) * child_ptr_bytes;
if (has_next_child) {
*hits_p = (*(uint32_t*)(idata + child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start;
} else {
*hits_p -= hits_start + hits_neq;
}
*sa_start_p += hits_start;
}
}
} else {
*cmp_ref = 0;
}
// No child for the full three bases: try the two-base prefix.
if (!child_num) {
//__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4,
//__builtin_popcountll,
// 1ULL);
uint8_t kmer_base = seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2;
uint64_t mark = *(uint64_t*)idata;
uint64_t kmer_mask = ((1ULL << 4) - 1) << kmer_base;
uint64_t child_num = mark & kmer_mask;
if (child_num) {
*seq_pos_p += 2;
uint64_t kmer_pre_mask = (1ULL << kmer_base) - 1;
uint32_t nth_child = __builtin_popcountll(mark & kmer_pre_mask);
uint8_t has_next_child = __builtin_popcountll(mark >> kmer_base >> 4);
if (off_bytes == HYB_LEAF_NODE) {
*hits_p -= nth_child + hits_neq + has_next_child;
*sa_start_p += nth_child + hits_neq;
} else {
if (nth_child == 0) {
idata += 8;
uint32_t hits_start = hits_neq;
if (has_next_child) {
child_num = __builtin_popcountll(child_num);
*hits_p =
(*(uint32_t*)(idata + (child_num - 1) * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) -
hits_start;
} else {
*hits_p -= hits_start;
}
*sa_start_p += hits_start;
} else {
idata += 8 + (nth_child - 1) * child_ptr_bytes;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
*sa_start_p += hits_start;
if (has_next_child) {
child_num = __builtin_popcountll(child_num);
*hits_p = (*(uint32_t*)(idata + child_num * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) -
hits_start;
} else {
*hits_p -= hits_start + hits_neq;
}
}
}
}
*cmp_ref = 0;
// No child for the two-base prefix either: try the one-base prefix.
if (!child_num) {
// __parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
uint8_t kmer_base = seq_bp[seq_pos] << 4;
uint64_t mark = *(uint64_t*)idata;
uint64_t kmer_mask = ((1ULL << 16) - 1) << kmer_base;
uint64_t child_num = mark & kmer_mask;
if (child_num) {
*seq_pos_p += 1;
uint64_t kmer_pre_mask = (1ULL << kmer_base) - 1;
uint32_t nth_child = __builtin_popcountll(mark & kmer_pre_mask);
uint8_t has_next_child = __builtin_popcountll(mark >> kmer_base >> 16);
if (off_bytes == HYB_LEAF_NODE) {
*hits_p -= nth_child + hits_neq + has_next_child;
*sa_start_p += nth_child + hits_neq;
} else {
if (nth_child == 0) {
idata += 8;
uint32_t hits_start = hits_neq;
if (has_next_child) {
child_num = __builtin_popcountll(child_num);
*hits_p =
(*(uint32_t*)(idata + (child_num - 1) * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) -
hits_start;
} else {
*hits_p -= hits_start;
}
*sa_start_p += hits_start;
} else {
idata += 8 + (nth_child - 1) * child_ptr_bytes;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
*sa_start_p += hits_start;
if (has_next_child) {
child_num = __builtin_popcountll(child_num);
*hits_p = (*(uint32_t*)(idata + child_num * child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) -
hits_start;
} else {
*hits_p -= hits_start + hits_neq;
}
}
}
}
*cmp_ref = 0;
}
}
} else if (seq_pos + 1 < seq_end) {
// Only two bases left in the read: prefixes only.
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
if (!child_num) {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
} else {
// A single base left in the read.
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
}
return addr;
}
// Parse one node after the head node and return the address of the node that
// follows, or NULL when the walk cannot continue (read exhausted, leaf reached,
// or only a partial / no match).
//
// Parameters are the same as for parse_first_hyb_node(); here *sa_start_p and
// *hits_p are pure in/out values because non-head nodes carry no SA/hit prefix.
// tid is unused in this function.
uint8_t* parse_one_hyb_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p,
uint64_t* sa_start_p, uint32_t* hits_p, uint8_t* cmp_ref, int tid) {
if (*seq_pos_p == seq_end)
return NULL;
__parse_node_start_code(idata);
// __parse_hyb_node_code(return addr);
if (node_type == HYB_BP_PATH) {
__parse_path_node_code(path_len);
} else if (node_type == HYB_BP_1) {
__parse_child_node_code(1, 1, uint8_t, seq_bp[seq_pos], __builtin_popcount, 1);
} else if (node_type == HYB_BP_2) {
if (seq_pos + 1 < seq_end) {
__parse_child_node_code(2, 2, uint16_t, seq_bp[seq_pos] << 2 | seq_bp[seq_pos + 1], __builtin_popcount, 1);
if (!child_num) {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
}
} else {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
}
} else {
// Three-base node. The block below is the hand-expanded form of the macro
// invocation kept in the comment that follows.
if (seq_pos + 2 < seq_end) {
//__parse_child_node_code(3, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2],
// __builtin_popcountll, 1ULL);
uint8_t kmer = seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2 | seq_bp[seq_pos + 2];
uint64_t mark = *(uint64_t*)idata;
uint64_t child_num = mark & (1ULL << kmer);
if (child_num) {
*seq_pos_p += 3;
uint32_t nth_child = __builtin_popcountll(mark & ((1ULL << kmer) - 1));
uint8_t has_next_child = __builtin_popcountll(mark >> kmer >> 1);
if (*seq_pos_p >= HYB_MAX_SEQ_LEN) {
*cmp_ref = 0;
}
if (off_bytes == HYB_LEAF_NODE) {
*hits_p -= nth_child + hits_neq + has_next_child;
*sa_start_p += nth_child + hits_neq;
} else {
if (nth_child == 0) {
idata += 8;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
addr = idata + has_next_child * child_ptr_bytes;
*hits_p = hits_start - hits_neq;
*sa_start_p += hits_neq;
} else {
idata += 8 + (nth_child - 1) * child_ptr_bytes;
uint32_t hits_start = *(uint32_t*)idata & ga_hybHitsMask[hits_bytes];
uint32_t child_offset = *(uint32_t*)(idata + hits_bytes) & ga_hybOffMask[off_bytes];
addr = idata + child_offset + (has_next_child + 1) * child_ptr_bytes;
if (has_next_child) {
*hits_p = (*(uint32_t*)(idata + child_ptr_bytes) & ga_hybHitsMask[hits_bytes]) - hits_start;
} else {
*hits_p -= hits_start + hits_neq;
}
*sa_start_p += hits_start;
}
}
} else {
*cmp_ref = 0;
}
// No child for the full three bases: try the two-base, then the one-base prefix.
if (!child_num) {
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
if (!child_num) {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
}
} else if (seq_pos + 1 < seq_end) {
// Only two bases left in the read: prefixes only.
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
if (!child_num) {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
} else {
// A single base left in the read.
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
}
}
return addr;
}
// Extend the match by a partial prefix of one node, keeping the longest partial
// extension whose hit count stays at or above min_hits.
//
// idata      start of the node
// seq_bits   2-bit packed read (not used here)
// seq_bp     read with one base code per byte
// seq_end    end of the usable read range (exclusive)
// min_hits   minimum number of hits an extension must keep
// is_head    non-zero when idata is a head node (carries the SA start and,
//            when *hits_p > HYB_HIT_THRESH, a hit count after the header)
// seq_pos_p, sa_start_p, hits_p  in/out match state
// tid        thread id (unused in this function)
//
// Two-base nodes try a one-base prefix; three-base nodes try a one-base prefix and,
// if that keeps enough hits and another base is available, a two-base prefix.
// Whenever a step drops below min_hits the previous state is restored.
// Other node types leave the state unchanged (apart from the head-node prefix).
void parse_one_hyb_node_min_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int min_hits,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid) {
if (*seq_pos_p == seq_end)
return;
uint8_t cmp_ref_val = 0;
uint8_t* cmp_ref = &cmp_ref_val;
__parse_node_start_no_addr(idata);
if (is_head) {
*sa_start_p = (*(uint64_t*)idata) & HYB_NODE_SA_MASK;
idata += 5;
if (*hits_p > HYB_HIT_THRESH) { // refresh hits from the node
*hits_p = *((uint32_t*)idata) & ga_hybHitsMask[hits_bytes]; // hit count
idata += hits_bytes;
}
}
// State before any partial extension, used for rollback.
uint8_t* prev_idata = idata;
uint32_t prev_seq_pos = *seq_pos_p;
uint32_t prev_hits = *hits_p;
uint64_t prev_sa_start = *sa_start_p;
if (node_type == HYB_BP_2) {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
if (*hits_p < min_hits) {
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
}
} else if (node_type == HYB_BP_3) {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
if (*hits_p < min_hits) {
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
} else if (seq_pos + 1 < seq_end) {
// Keep the one-base result as the fallback, then retry from the original
// state with a two-base prefix.
uint32_t pp_seq_pos = prev_seq_pos;
uint32_t pp_hits = prev_hits;
uint64_t pp_sa_start = prev_sa_start;
prev_seq_pos = *seq_pos_p;
prev_hits = *hits_p;
prev_sa_start = *sa_start_p;
*seq_pos_p = pp_seq_pos;
*hits_p = pp_hits;
*sa_start_p = pp_sa_start;
idata = prev_idata; // rewind to the start of this node's data (the macro advanced idata)
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
if (*hits_p < min_hits) {
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
}
}
}
}
// Extend the match by a partial prefix of one node, aiming for a result with
// fewer than max_hits hits.
//
// idata      start of the node
// seq_bits   2-bit packed read (not used here)
// seq_bp     read with one base code per byte
// seq_end    end of the usable read range (exclusive)
// max_hits   hit-count threshold
// min_bp     number of bases the caller still needs from this node
// is_head    non-zero when idata is a head node (carries the SA start and,
//            when *hits_p > HYB_HIT_THRESH, a hit count after the header)
// seq_pos_p, sa_start_p, hits_p  in/out match state
// tid        thread id (unused in this function)
//
// Two-base nodes try a one-base prefix. Three-base nodes try a two-base prefix
// when min_bp == 2; otherwise a one-base prefix, retried with two bases if the
// one-base result still has max_hits or more hits. For all other node types the
// read position is simply advanced by min_bp (when positive) and hits are unchanged.
// NOTE(review): the two-base prefix reads seq_bp[seq_pos + 1] without checking it
// against seq_end - confirm the callers guarantee that base exists.
void parse_one_hyb_node_max_hits(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, int max_hits, int min_bp,
int is_head, uint32_t* seq_pos_p, uint64_t* sa_start_p, uint32_t* hits_p, int tid) {
if (*seq_pos_p == seq_end)
return;
uint8_t cmp_ref_val = 0;
uint8_t* cmp_ref = &cmp_ref_val;
__parse_node_start_no_addr(idata);
if (is_head) {
*sa_start_p = (*(uint64_t*)idata) & HYB_NODE_SA_MASK;
idata += 5;
if (*hits_p > HYB_HIT_THRESH) { // refresh hits from the node
*hits_p = *((uint32_t*)idata) & ga_hybHitsMask[hits_bytes]; // hit count
idata += hits_bytes;
}
}
// State before any partial extension, used for rollback.
uint8_t* prev_idata = idata;
uint32_t prev_seq_pos = *seq_pos_p;
uint32_t prev_hits = *hits_p;
uint64_t prev_sa_start = *sa_start_p;
if (node_type == HYB_BP_2) {
__parse_part_node_code(1, 2, uint16_t, seq_bp[seq_pos] << 2, 4, __builtin_popcount, 1);
} else if (node_type == HYB_BP_3) {
if (min_bp == 2) {
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
} else {
__parse_part_node_code(1, 8, uint64_t, seq_bp[seq_pos] << 4, 16, __builtin_popcountll, 1ULL);
if (*hits_p >= max_hits) {
*seq_pos_p = prev_seq_pos;
*hits_p = prev_hits;
*sa_start_p = prev_sa_start;
idata = prev_idata; // rewind to the start of this node's data (the macro advanced idata)
__parse_part_node_code(2, 8, uint64_t, seq_bp[seq_pos] << 4 | seq_bp[seq_pos + 1] << 2, 4, __builtin_popcountll,
1ULL);
}
}
} else { // path node (this branch is also taken for any other remaining node type)
if (min_bp > 0)
*seq_pos_p += min_bp;
}
}
// Follow the node chain starting at `idata` for as long as there is a next node
// and more than one hit remains. The caller must initialise *hits_p and
// *seq_pos_p before the call; on return *seq_pos_p, *hits_p and *sa_start_p
// describe the match reached and *cmp_ref is whatever the last parser call left.
#define CALC_STAT 0
void get_leaf_node(uint8_t* idata, uint8_t* seq_bits, uint8_t* seq_bp, uint32_t seq_end, uint32_t* seq_pos_p, uint32_t* hits_p,
                   uint64_t* sa_start_p, uint8_t* cmp_ref, int tid) {
    uint8_t* node = parse_first_hyb_node(idata, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
#if CALC_STAT
    uint8_t* came_from = idata;
    if (node != NULL) {
        // Histogram of the byte distance between consecutive nodes.
        uint64_t dist = node - came_from;
        gdat[dist < 32 ? 0 : dist < 64 ? 1 : dist < 128 ? 2 : 3]++;
    }
#endif
    for (;;) {
        if (node == NULL || *hits_p <= 1)
            return;
#if CALC_STAT
        came_from = node;
#endif
        node = parse_one_hyb_node(node, seq_bits, seq_bp, seq_end, seq_pos_p, sa_start_p, hits_p, cmp_ref, tid);
#if CALC_STAT
        if (node != NULL) {
            uint64_t dist = node - came_from;
            gdat[dist < 32 ? 0 : dist < 64 ? 1 : dist < 128 ? 2 : 3]++;
        }
#endif
    }
}
// Look up the k-mer table entry for the k-mer that starts at `kmer_pos` in `seq_bits`.
//
//   type_hits  receives the first entry byte masked with HYB_KMER_DATA_TYPE_MASK
//   offset     receives the entry's 64-bit word masked with HYB_KMER_DATA_MASK and
//              shifted right by HYB_KMER_DATA_TYPE_BITS
//
// Entries are HYB_KMER_DATA_BYTES apart in a byte array, so the 64-bit word is not
// guaranteed to be 8-byte aligned. It is loaded with a byte copy instead of through a
// cast `uint64_t*`, which would be a strict-aliasing / alignment violation. The value
// read is the same native-endian 8 bytes as before.
// NOTE(review): 8 bytes are read from every entry; if HYB_KMER_DATA_BYTES < 8 the
// table presumably needs tail padding for the last entry — confirm where kmer_data is
// allocated.
void get_kmer_data(const HybridIndex* hyb, uint8_t* seq_bits, int kmer_pos, uint8_t* type_hits, uint64_t* offset) {
    const uint64_t kmer = _kmer_from_pos(seq_bits, kmer_pos);
    const uint8_t* entry = hyb->kmer_data + kmer * HYB_KMER_DATA_BYTES;
    uint64_t packed;

    __builtin_memcpy(&packed, entry, sizeof packed);
    *type_hits = *entry & HYB_KMER_DATA_TYPE_MASK;
    *offset = (packed & HYB_KMER_DATA_MASK) >> HYB_KMER_DATA_TYPE_BITS;
}
// Extend a seed to the right and report the total right-hand match length
// (extension plus the `init_match_len` bases of the seed itself) in *right_match.
//
// ref_pos < hyb->ref_len selects the forward strand: the read (for_bits) is compared
// forwards against the reference. Otherwise ref_pos is mirrored back into the forward
// reference (2*ref_len - 1 - ref_pos) and the reverse read (back_bits) is compared
// backwards.
void right_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
                     int kmer_start, int init_match_len, uint64_t ref_pos, int* right_match) {
    int extension;

    if (ref_pos >= hyb->ref_len) {
        const uint64_t mirrored = (hyb->ref_len << 1) - 1 - ref_pos;
        extension = backward_match_len(back_bits, seq_len - kmer_start - init_match_len - 1, seq_len - read_range->end,
                                       hyb->ref_bits, mirrored - init_match_len);
    } else {
        extension = forward_match_len(for_bits, kmer_start + init_match_len, read_range->end, hyb->ref_bits,
                                      ref_pos + init_match_len, hyb->ref_len);
    }
    *right_match = extension + init_match_len;  // include the seed (k-mer) length
}
// Extend a seed to the left and store the number of matched bases in *left_match.
// Unlike right_end_match(), the seed length is not added (init_match_len is unused).
//
// ref_pos < hyb->ref_len selects the forward strand: for_bits is compared backwards
// from the base just before the seed. Otherwise ref_pos is mirrored into the forward
// reference and back_bits is compared forwards.
void left_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
                    int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match) {
    if (ref_pos >= hyb->ref_len) {
        ref_pos = (hyb->ref_len << 1) - 1 - ref_pos;
        *left_match = forward_match_len(back_bits, seq_len - kmer_start, seq_len - read_range->start, hyb->ref_bits,
                                        ref_pos + 1, hyb->ref_len);
        return;
    }
    *left_match = backward_match_len(for_bits, kmer_start - 1, read_range->start, hyb->ref_bits, ref_pos - 1);
}
// Extend a seed in both directions in one call.
//   *right_match  right extension plus the seed length (init_match_len)
//   *left_match   left extension only
// The strand is chosen by comparing ref_pos with hyb->ref_len; positions at or past
// ref_len are mirrored into the forward reference and matched with back_bits.
void both_end_match(const HybridIndex* hyb, const int seq_len, const Range* read_range, uint8_t* for_bits, uint8_t* back_bits,
                    int kmer_start, int init_match_len, uint64_t ref_pos, int* left_match, int* right_match) {
    const int on_reverse = !(ref_pos < hyb->ref_len);

    if (on_reverse) {
        ref_pos = (hyb->ref_len << 1) - 1 - ref_pos;
        *right_match = backward_match_len(back_bits, seq_len - kmer_start - init_match_len - 1, seq_len - read_range->end,
                                          hyb->ref_bits, ref_pos - init_match_len);
        *left_match = forward_match_len(back_bits, seq_len - kmer_start, seq_len - read_range->start, hyb->ref_bits,
                                        ref_pos + 1, hyb->ref_len);
    } else {
        *right_match = forward_match_len(for_bits, kmer_start + init_match_len, read_range->end, hyb->ref_bits,
                                         ref_pos + init_match_len, hyb->ref_len);
        *left_match = backward_match_len(for_bits, kmer_start - 1, read_range->start, hyb->ref_bits, ref_pos - 1);
    }
    *right_match = *right_match + init_match_len;  // include the seed (k-mer) length
}

10
kseq.h
View File

@ -221,11 +221,11 @@ typedef struct __kstring_t {
kstream_t *f; \
} kseq_t;
#define KSEQ_INIT2(SCOPE, type_t, __read) \
KSTREAM_INIT(type_t, __read, 16384) \
__KSEQ_TYPE(type_t) \
__KSEQ_BASIC(SCOPE, type_t) \
__KSEQ_READ(SCOPE)
/* Instantiates the kstream/kseq types and functions for `type_t` / `__read`.
 * The third KSTREAM_INIT argument was raised from 16384 to 16777216 (16 MiB);
 * presumably it is the stream buffer size in bytes — confirm in KSTREAM_INIT. */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
KSTREAM_INIT(type_t, __read, 16777216) /* previously 16384 */ \
__KSEQ_TYPE(type_t) \
__KSEQ_BASIC(SCOPE, type_t) \
__KSEQ_READ(SCOPE)
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)

7
ksw.h
View File

@ -3,6 +3,8 @@
#include <stdint.h>
#include "utils.h"
#define KSW_XBYTE 0x10000
#define KSW_XSTOP 0x20000
#define KSW_XSUBO 0x40000
@ -106,9 +108,12 @@ extern "C" {
*/
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
/* AVX2 variant of ksw_extend2(). Compared with ksw_extend2() it additionally takes
 * `is_left` (extension direction), the match score `a`, the mismatch penalty `b`, and
 * a caller-owned scratch buffer `buf` that the kernel may grow. */
int ksw_extend2_avx2(int qlen, const uint8_t* query, int tlen, const uint8_t* target, int is_left, int m, const int8_t* mat, int o_del, int e_del,
int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int* _qle, int* _tle, int* _gtle, int* _gscore,
int* _max_off, buf_t* buf);
#ifdef __cplusplus
}
}
#endif
#endif

816
ksw_extend2_avx2.c 100644
View File

@ -0,0 +1,816 @@
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "utils.h"
#include "debug.h"
// Enables the left-boundary special case in ksw_extend2_avx2() that removes one known
// result difference against the scalar banded kernel.
#define ELIMINATE_DIFF_1
// When defined, enables the `prun_end` experiment in ksw_extend2_origin().
// #define ELIMINATE_DIFF_3
// Marker stored in the DEBUG_SW_EXTEND dump matrices for cells that were never computed.
#define NO_VAL -1
// Number of 16-bit lanes processed per 256-bit vector in this file.
#define SIMD_WIDTH 16
// 8-bit-lane variant, defined in another translation unit; ksw_extend2_avx2()
// dispatches to it when qlen * a + h0 < 255.
extern int ksw_extend2_avx2_u8(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del,
int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf);
// Scalar reference implementation, defined at the end of this file.
int ksw_extend2_origin(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del,
int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off);
// Lane-keep masks for a partially filled vector: row r has its first r + 1 16-bit
// lanes set to 0xffff and the remaining lanes zero. SIMD_INIT loads each row into
// h_vec_mask[r]; SIMD_REMOVE_EXTRA ANDs results with row (end - j) so that only the
// lanes for columns j..end survive.
static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0},
{0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}
};
// Shuffle immediate; not referenced by the macros or the kernel in this file.
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
// Declare and initialise the SIMD constants. Expands to declarations, so it must be
// used at block scope; it expects o_del, e_del, o_ins, e_ins, a, b and an int `i` to
// be in scope. Gap penalties and the mismatch penalty are broadcast negated so they
// can simply be added.
#define SIMD_INIT \
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
__m256i zero_vec; \
__m256i max_vec; \
__m256i oe_del_vec; \
__m256i oe_ins_vec; \
__m256i e_del_vec; \
__m256i e_ins_vec; \
__m256i h_vec_mask[SIMD_WIDTH]; \
zero_vec = _mm256_setzero_si256(); \
oe_del_vec = _mm256_set1_epi16(-oe_del); \
oe_ins_vec = _mm256_set1_epi16(-oe_ins); \
e_del_vec = _mm256_set1_epi16(-e_del); \
e_ins_vec = _mm256_set1_epi16(-e_ins); \
__m256i match_sc_vec = _mm256_set1_epi16(a); \
__m256i mis_sc_vec = _mm256_set1_epi16(-b); \
__m256i amb_sc_vec = _mm256_set1_epi16(-1); \
__m256i amb_vec = _mm256_set1_epi16(4); \
for (i=0; i<SIMD_WIDTH; ++i) h_vec_mask[i] = _mm256_loadu_si256((__m256i*) (&h_vec_int_mask[i]));
/*
* Score arrays used by the macros below:
* e  gap score built with the deletion penalties (o_del / e_del)
* f  gap score built with the insertion penalties (o_ins / e_ins)
* m  diagonal (match / mismatch) score
* h  best of e, f and m
*/
// Load the vector operands for columns j..j+15 of the current anti-diagonal:
// m/e of the previous diagonal at the same column, m/f of the previous diagonal one
// column to the left, h from two diagonals back one column to the left, and the
// query / reference bases for these cells.
#define SIMD_LOAD \
__m256i m1 = _mm256_loadu_si256((__m256i*) (&mA1[j])); \
__m256i e1 = _mm256_loadu_si256((__m256i*) (&eA1[j])); \
__m256i m1j1 = _mm256_loadu_si256((__m256i*) (&mA1[j-1])); \
__m256i f1j1 = _mm256_loadu_si256((__m256i*) (&fA1[j-1])); \
__m256i h0j1 = _mm256_loadu_si256((__m256i*) (&hA0[j-1])); \
__m256i qs_vec = _mm256_loadu_si256((__m256i*) (&seq[j-1])); \
__m256i ts_vec = _mm256_loadu_si256((__m256i*) (&ref[tlen - i]));
// Compare reference and query bases to get the per-lane substitution score:
// +a on equality, -b otherwise, and -1 whenever either base equals 4 (ambiguous).
#define SIMD_CMP_SEQ \
__m256i match_mask_vec = _mm256_cmpeq_epi16(qs_vec, ts_vec); \
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
__m256i score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \
score_vec = _mm256_or_si256(score_vec, mis_score_vec); \
__m256i q_amb_mask_vec = _mm256_cmpeq_epi16(qs_vec, amb_vec); \
__m256i t_amb_mask_vec = _mm256_cmpeq_epi16(ts_vec, amb_vec); \
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
score_vec = _mm256_andnot_si256(amb_mask_vec, score_vec); \
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
score_vec = _mm256_or_si256(score_vec, amb_score_vec);
// Vectorised recurrence for h, e, f and m. The diagonal term is forced to zero in
// lanes whose predecessor h is not positive; all four results are clamped at zero.
#define SIMD_COMPUTE \
__m256i en_vec0 = _mm256_add_epi16(m1, oe_del_vec); \
__m256i en_vec1 = _mm256_add_epi16(e1, e_del_vec); \
__m256i en_vec = _mm256_max_epi16(en_vec0, en_vec1); \
__m256i fn_vec0 = _mm256_add_epi16(m1j1, oe_ins_vec); \
__m256i fn_vec1 = _mm256_add_epi16(f1j1, e_ins_vec); \
__m256i fn_vec = _mm256_max_epi16(fn_vec0, fn_vec1); \
__m256i mn_vec0 = _mm256_add_epi16(h0j1, score_vec); \
__m256i mn_mask = _mm256_cmpgt_epi16(h0j1, zero_vec); \
__m256i mn_vec = _mm256_and_si256(mn_vec0, mn_mask); \
__m256i hn_vec0 = _mm256_max_epi16(en_vec, fn_vec); \
__m256i hn_vec = _mm256_max_epi16(hn_vec0, mn_vec); \
en_vec = _mm256_max_epi16(en_vec, zero_vec); \
fn_vec = _mm256_max_epi16(fn_vec, zero_vec); \
mn_vec = _mm256_max_epi16(mn_vec, zero_vec); \
hn_vec = _mm256_max_epi16(hn_vec, zero_vec);
// Store the vector results for columns j..j+15 and fold h into the running maximum.
#define SIMD_STORE \
max_vec = _mm256_max_epi16(max_vec, hn_vec); \
_mm256_storeu_si256((__m256i*)&eA2[j], en_vec); \
_mm256_storeu_si256((__m256i*)&fA2[j], fn_vec); \
_mm256_storeu_si256((__m256i*)&mA2[j], mn_vec); \
_mm256_storeu_si256((__m256i*)&hA2[j], hn_vec);
// Zero the lanes that lie beyond column `end` in the last, partially filled vector.
#define SIMD_REMOVE_EXTRA \
en_vec = _mm256_and_si256(en_vec, h_vec_mask[end-j]); \
fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end-j]); \
mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end-j]); \
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end-j]);
// Reduce max_vec to the anti-diagonal maximum and, if it can affect the global
// maximum, rescan hA2 to find where it occurs. When several lanes hold the maximum,
// the last match found (largest column) wins. Overwrites i and j; sets m, mi, mj.
#define SIMD_FIND_MAX \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu16(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
int16_t *maxVal = (int16_t*)&max_vec; \
m = MAX(m, maxVal[0]); /* resolves the second known difference against BSW (upper boundary) */ \
if (maxVal[0] > 0 && m >= max) { \
for(j=beg, i=iend; j<=end; j+=SIMD_WIDTH, i-=SIMD_WIDTH) { \
__m256i h2_vec = _mm256_loadu_si256((__m256i*) (&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi16(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) { \
int pos = SIMD_WIDTH - 1 - (( __builtin_clz(mask)) >> 1); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
/*if (m >= max) fprintf(stderr, "%d %d %d %d %d %d %d\n", iend, beg, mi, mj, mask, pos, m);*/ \
} \
} \
}
// After each anti-diagonal, rotate the h buffers (three deep) and swap the e / f / m
// buffer pairs so the just-written "2" arrays become the next round's "1" arrays.
#define SWAP_DATA_POINTER \
int16_t * tmp=hA0; \
hA0 = hA1; hA1 = hA2; hA2 = tmp; \
tmp = eA1; eA1 = eA2; eA2 = tmp; \
tmp = fA1; fA1 = fA2; fA2 = tmp; \
tmp = mA1; mA1 = mA2; mA2 = tmp;
// Debug helper (active only with DEBUG_FILE_OUTPUT): dumps one extension problem to
// the fnum-th set of debug files. The query and the target are each written as one
// line of base letters, and "qlen tlen h0" goes to the info file in 8-wide columns.
static void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
{
#ifdef DEBUG_FILE_OUTPUT
    const char base_letter[5] = {'A', 'C', 'G', 'T', 'N'};
    FILE *out_query = gfq[fnum];
    FILE *out_target = gft[fnum];
    FILE *out_info = gfi[fnum];
    int pos;

    // query line
    pos = 0;
    while (pos < qlen) {
        fprintf(out_query, "%c", base_letter[query[pos]]);
        ++pos;
    }
    fprintf(out_query, "\n");

    // target line
    pos = 0;
    while (pos < tlen) {
        fprintf(out_target, "%c", base_letter[target[pos]]);
        ++pos;
    }
    fprintf(out_target, "\n");

    // lengths and seed score
    fprintf(out_info, "%-8d%-8d%-8d\n", qlen, tlen, h0);
#endif
}
/*
 * Banded seed extension with affine gaps, vectorised over anti-diagonals with AVX2
 * (16 int16 lanes per vector). Problems whose scores fit in 8 bits
 * (qlen * a + h0 < 255) are forwarded to ksw_extend2_avx2_u8().
 *
 * Returns the best local score found (at least h0). The out-parameters are optional
 * (may be NULL).
 *
 * Scratch memory comes from `buf`, which is grown with realloc() when too small.
 * A failed reallocation aborts the process with a message; the buffer's pointer and
 * capacity are only updated after the reallocation succeeded.
 */
int ksw_extend2_avx2(int qlen, // length of the query segment to extend
                     const uint8_t *query, // read base sequence
                     int tlen, // target (reference) length
                     const uint8_t *target, // reference sequence
                     int is_left, // non-zero when extending to the left
                     int m, // alphabet size (5)
                     const int8_t *mat, // m*m substitution score matrix
                     int o_del, // deletion gap-open penalty
                     int e_del, // deletion gap-extension penalty
                     int o_ins, // insertion gap-open penalty
                     int e_ins, // insertion gap-extension penalty
                     int a, // score for a matching base
                     int b, // penalty for a mismatching base (positive)
                     int w, // band width used for pruning (e.g. 100): max distance between the matched position and beg
                     int end_bonus,
                     int zdrop,
                     int h0, // initial score of the seed (number of exactly matching query bases)
                     int *_qle, // query position of the global best-scoring cell
                     int *_tle, // reference position of the global best-scoring cell
                     int *_gtle, // target length consumed when the whole query is aligned
                     int *_gscore, // end-to-end score of the query
                     int *_max_off, // largest |query pos - reference pos| seen at a best score
                     buf_t *buf) // previously allocated scratch buffer, grown on demand
{
    // return ksw_extend2_origin(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off);
#ifdef DEBUG_FILE_OUTPUT
    //fprintf(gf[0], "%d\n", qlen);
#ifdef GET_DIFFERENT_EXTENSION_LENGTH
    if (qlen <= 30) {
        write_query_target_sequence(qlen, query, tlen, target, h0, 0);
    } else if (qlen < 60) {
        write_query_target_sequence(qlen, query, tlen, target, h0, 1);
    } else if (qlen < 90) {
        write_query_target_sequence(qlen, query, tlen, target, h0, 2);
    } else {
        write_query_target_sequence(qlen, query, tlen, target, h0, 3);
    }
#endif
#endif
    // Scores that fit in 8 bits are handled by the wider (32-lane) u8 kernel.
    if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf);
    int16_t *mA,*hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0 holds H from two anti-diagonals back; the others hold the previous H, E, F, M
    int16_t *seq, *ref;
    uint8_t *mem;
    int16_t *qtmem, *vmem;
    int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
    int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
    int Dloop = tlen + qlen; // loop exit bound
    int span, beg1, end1; // band boundary computation
    int col_size = qlen + 2 + SIMD_WIDTH;
    int val_mem_size = (col_size * 9 * 2 + 31) >> 5 << 5; // rounded up to a multiple of 32 bytes
    int mem_size = (seq_size + ref_size) * 2 + val_mem_size;
    SIMD_INIT; // set up the SIMD constants
    assert(h0 > 0);
    // Make sure the scratch buffer is large enough. realloc() leaves the old block
    // intact on failure, so go through a temporary and only commit on success.
    if (buf->m < mem_size) {
        uint8_t *grown = (uint8_t *)realloc(buf->addr, mem_size);
        if (grown == NULL) {
            fprintf(stderr, "[%s] failed to allocate %d bytes of scratch memory\n", __func__, mem_size);
            abort();
        }
        buf->addr = grown;
        buf->m = mem_size;
    }
    mem = buf->addr;
    qtmem = (int16_t *)&mem[0];
    seq=&qtmem[0]; ref=&qtmem[seq_size];
    // Widen the bases to int16. `ref` is later indexed as ref[tlen - i], so it holds
    // the DP target in reverse order.
    if (is_left) {
        for (i=0; i<qlen; ++i) seq[i] = query[qlen - 1 - i];
        for (i=0; i<tlen; ++i) ref[i] = target[i];
    } else {
        for (i=0; i<qlen; ++i) seq[i] = query[i];
        for (i=0; i<tlen; ++i) ref[i] = target[tlen - 1 - i];
    }
    // Zero all score arrays.
    vmem = &ref[ref_size];
    for (i=0; i<(val_mem_size>>1); i+=SIMD_WIDTH) {
        _mm256_storeu_si256((__m256i*)&vmem[i], zero_vec);
    }
    // Layout inside vmem: 3 columns of H, then 2 each of M, E and F.
    hA = &vmem[0];
    mA = &vmem[col_size * 3];
    eA = &vmem[col_size * 5];
    fA = &vmem[col_size * 7];
    hA0 = &hA[0]; hA1 = &hA[col_size]; hA2 = &hA1[col_size];
    mA1 = &mA[0]; mA2 = &mA[col_size];
    eA1 = &eA[0]; eA2 = &eA[col_size];
    fA1 = &fA[0]; fA2 = &fA[col_size];
    // adjust $w if it is too large
    k = m * m;
    // get the max score
    for (i = 0, max = 0; i < k; ++i) max = max > mat[i]? max : mat[i];
    max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
    max_ins = max_ins > 1? max_ins : 1;
    w = w < max_ins? w : max_ins;
    max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
    max_del = max_del > 1? max_del : 1;
    w = w < max_del? w : max_del; // TODO: is this necessary?
    if (tlen < qlen) w = MIN(tlen - 1, w);
    // DP loop
    max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
    max_off = 0;
    beg = 1; end = qlen;
    // init h0
    hA0[0] = h0; // top-left corner
    if (qlen == 0 || tlen == 0) Dloop = 0; // guard against degenerate input
    if (w >= qlen) { max_ie = 0; gscore = 0; }
    int m_last=0;
    int iend;
#ifdef ELIMINATE_DIFF_1
    int midx = 1, icheck = 0, checkspecial = 1;
    int m3 = 0, m2 = 0, m1 = 0;
    //int marr[10] = {0};
    //int marr[b]; memset(marr, 0, 4 * b);
#endif
    //int print_flag = 0; //(qlen == 64 && tlen == 123);
#ifdef DEBUG_SW_EXTEND
    int dii, djj;
    int16_t ins[tlen + 1][qlen + 2];
    int16_t del[tlen + 1][qlen + 2];
    int16_t score[tlen + 1][qlen + 2];
    for (dii = 0; dii <= tlen; ++dii)
    {
        for (djj = 0; djj <= qlen; ++djj)
        {
            ins[dii][djj] = del[dii][djj] = score[dii][djj] = NO_VAL;
        }
    }
    for (dii = 1; dii <= tlen; ++dii)
    {
        del[dii][0] = MAX(0, h0 - o_del - e_del * dii);
        score[dii][0] = del[dii][0];
    }
    for (djj = 1; djj <= qlen; ++djj)
    {
        ins[0][djj] = MAX(0, h0 - o_ins - e_ins * djj);
        score[0][djj] = ins[0][djj];
    }
    ins[0][0] = del[0][0] = score[0][0] = h0;
#endif
    for (D = 1; LIKELY(D < Dloop); ++D) {
        // Mind the boundary conditions: tlen greater than, equal to, or less than qlen.
        if (D > tlen) {
            span = MIN(Dloop-D, w);
            beg1 = MAX(D-tlen+1, ((D-w) / 2) + 1);
        } else {
            span = MIN(D-1, w);
            beg1 = MAX(1, ((D-w) / 2) + 1);
        }
        end1 = MIN(qlen, beg1+span);
        if (beg < beg1) beg = beg1;
        if (end > end1) end = end1;
        if (beg > end) break; // nothing left to compute; bail out, otherwise hA2 is never written and still holds the previous round's hA0 values
        iend = D - (beg - 1); // ref position where this anti-diagonal starts (reverse order)
        span = end - beg;
        ibeg = iend - span - 1; // 0-based ref index
        // Per-round bookkeeping. Note: this `m` shadows the alphabet-size parameter.
        int m = 0, mj = -1, mi = -1;
        max_vec = zero_vec;
        //if (print_flag)
        //{
        //fprintf(stderr, "D: %d, iend: %d, jbeg: %d\n", D, iend, beg);
        //}
        // Boundary cells.
        // Left boundary: handle f (insertion)
        if (ibeg == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); m = hA1[end];}
        // Upper boundary
        if (beg == 1) { hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); }
        else if (D & 1) {
            hA1[beg - 1] = 0;
            hA2[beg - 1] = 0;
        }
        for (j=beg, i=iend; j<=end+1-SIMD_WIDTH; j+=SIMD_WIDTH, i-=SIMD_WIDTH) {
            // load operands
            SIMD_LOAD;
            // compare bases and compute substitution scores
            SIMD_CMP_SEQ;
            // recurrence
            SIMD_COMPUTE;
            // store results
            SIMD_STORE;
        }
        // Remaining cells (partial vector)
        if (j <= end) {
            // load operands
            SIMD_LOAD;
            // compare bases and compute substitution scores
            SIMD_CMP_SEQ;
            // recurrence
            SIMD_COMPUTE;
            // mask off the lanes past `end`
            SIMD_REMOVE_EXTRA;
            // store results
            SIMD_STORE;
        }
        SIMD_FIND_MAX;
#ifdef ELIMINATE_DIFF_1
        // Resolves the first known difference against BSW (left boundary).
#if 0
        if (hA1[0] < b && checkspecial) {
            int mi;
            if (hA1[0] == b - 1) {
                icheck = iend + 1;
            }
            for (mi = 0; mi < b - 1; ++mi) {
                if (midx - mi > 0)
                    marr[mi] = MAX(marr[mi], hA2[midx - mi]);
            }
            midx += 1;
            if (ibeg > icheck)
            {
                int stopCalc = 0;
                for (mi = 0; mi < b - 1; ++mi)
                {
                    stopCalc |= !marr[mi];
                }
                if (stopCalc)
                    break;
                else
                    checkspecial = 0;
            }
        }
#else
        // NOTE(review): the constants 4 and 3 hard-code b == 4 (see the disabled
        // general version above); other mismatch penalties are not covered here.
        if (hA1[0] < 4 && checkspecial) { // b == 4
            if (hA1[0] == 3) {
                icheck = iend + 1;
            } else if (midx == 2) {
                m2 = MAX(m2, hA2[midx - 1]);
            } else {
                m2 = MAX(m2, hA2[midx - 1]);
                m1 = MAX(m1, hA2[midx - 2]);
            }
            m3 = MAX(m3, hA2[midx]);
            midx += 1;
            if (ibeg > icheck)
            {
                if (!m1 || !m2 || !m3)
                    break;
                else
                    checkspecial = 0;
            }
            //if (print_flag) {
            //fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA2[midx + 1], hA2[midx + 2], hA2[midx + 3], midx);
            //if (midx > 2) fprintf(stderr, "%d, %d, %d\n", hA2[midx-1], hA2[midx-2], hA2[midx-3]);
            //fprintf(stderr, "jbeg: %d, ibeg: %d, iend: %d, icheck: %d, hA1: %d, score: %d %d %d, j: %d\n", beg, ibeg, iend, icheck, hA1[0], m1, m2, m3, midx);
            //}
        }
#endif
#endif
#ifdef DEBUG_SW_EXTEND
        for (djj = beg; djj <= end; ++djj)
        {
            dii = D - djj + 1;
            ins[dii][djj] = fA2[djj];
            del[dii][djj] = eA2[djj];
            score[dii][djj] = hA2[djj];
        }
        //if (print_flag)
        //{
        //fprintf(stderr, "score: %d %d %d\n", hA2[beg], hA2[beg+1], hA2[beg+2]);
        //}
#endif
        // Mind the value of j once the loops above are done.
        j = end + 1;
        if (j == qlen + 1) {
            max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
            gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
        }
        if (m == 0 && m_last==0) break; // note the difference between anti-diagonal and column-wise traversal
        //if (m == 0 && m_last < 2) break;
        if (m > max) {
            max = m, max_i = mi, max_j = mj;
            max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
        } else if (m == max && max_i >= mi && mj > max_j) {
            max_i = mi, max_j = mj;
            max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
        }
        else if (zdrop > 0 && mi > -1) {
            if (mi - max_i > mj - max_j) {
                if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break;
            } else {
                if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) break;
            }
        }
        // Shrink the band to the cells that can still carry a non-zero score.
        for (j = beg; LIKELY(j <= end); ++j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; }
        beg = j;
        for (j = end+1; LIKELY(j >= beg); --j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; else hA0[j-1]=0; }
        end = j + 1 <= qlen? j + 1 : qlen;
        m_last = m;
        // swap m, h, e, f
        SWAP_DATA_POINTER;
    }
#ifdef DEBUG_FILE_OUTPUT
#ifdef DEBUG_SW_EXTEND
    fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
    fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
    fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
    fprintf(gf[0], "%-4d", -1);
    fprintf(gf[1], "%-4d", -1);
    fprintf(gf[2], "%-4d", -1);
    fprintf(gf[0], "%-4d", -1);
    fprintf(gf[1], "%-4d", -1);
    fprintf(gf[2], "%-4d", -1);
    for (djj = 0; djj < qlen; ++djj) {
        fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]);
        fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]);
        fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]);
    }
    fprintf(gf[0], "\n");
    fprintf(gf[1], "\n");
    fprintf(gf[2], "\n");
    for (dii = 0; dii <= tlen; ++dii)
    {
        if (dii > 0) {
            fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]);
            fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]);
            fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]);
        } else {
            fprintf(gf[0], "%-4d", -1);
            fprintf(gf[1], "%-4d", -1);
            fprintf(gf[2], "%-4d", -1);
        }
        for (djj = 0; djj <= qlen; ++djj)
        {
            fprintf(gf[0], "%-4d", score[dii][djj]);
            fprintf(gf[1], "%-4d", ins[dii][djj]);
            fprintf(gf[2], "%-4d", del[dii][djj]);
        }
        fprintf(gf[0], "\n");
        fprintf(gf[1], "\n");
        fprintf(gf[2], "\n");
    }
#endif
#endif
    // The scratch memory belongs to `buf` and is intentionally not freed here.
    if (_qle) *_qle = max_j + 1;
    if (_tle) *_tle = max_i + 1;
    if (_gtle) *_gtle = max_ie + 1;
    if (_gscore) *_gscore = gscore;
    if (_max_off) *_max_off = max_off;
    return max;
}
/* One cell of the scalar DP row: `h` is the H score, `e` the E (gap) score. */
typedef struct {
    int32_t h;
    int32_t e;
} eh_t;
// Scalar, row-by-row banded extension with affine gaps. It serves as the reference
// against which the anti-diagonal AVX2 kernel is compared (see the commented-out
// call at the top of ksw_extend2_avx2()).
// Returns the best score found (at least h0); the out-parameters are optional.
// NOTE(review): the three allocations below are used without a NULL check, and with
// qlen == 0 the write to eh[1] lands outside the (qlen + 1)-element array — confirm
// callers never pass an empty query.
int ksw_extend2_origin(int qlen, // length of the query segment to extend
const uint8_t *query, // read base sequence
int tlen, // target (reference) length
const uint8_t *target, // reference sequence
int is_left, // non-zero when extending to the left
int m, // alphabet size (5)
const int8_t *mat, // m*m substitution score matrix
int o_del, // deletion gap-open penalty
int e_del, // deletion gap-extension penalty
int o_ins, // insertion gap-open penalty
int e_ins, // insertion gap-extension penalty
int w, // band width used for pruning (e.g. 100): max distance between the matched position and beg
int end_bonus,
int zdrop,
int h0, // initial score of the seed (number of exactly matching query bases)
int *_qle, // query position of the global best-scoring cell
int *_tle, // reference position of the global best-scoring cell
int *_gtle, // target length consumed when the whole query is aligned
int *_gscore, // end-to-end score of the query
int *_max_off) // largest |query pos - reference pos| seen at a best score
{
eh_t *eh; // score array
int8_t *qp; // query profile
int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
uint8_t *qmem, *ref, *seq;
assert(h0 > 0);
// allocate memory
qp = (int8_t *)malloc(qlen * m);
eh = (eh_t *)calloc(qlen + 1, 8);
qmem = (uint8_t *)malloc(qlen + tlen);
seq=(uint8_t*)&qmem[0]; ref=(uint8_t*)&qmem[qlen];
// For a left extension both sequences are reversed so the DP always runs left to right.
if (is_left) {
for (i=0; i<qlen; ++i) seq[i] = query[qlen - 1 - i];
for (i=0; i<tlen; ++i) ref[i] = target[tlen - 1 - i];
} else {
for (i=0; i<qlen; ++i) seq[i] = query[i];
for (i=0; i<tlen; ++i) ref[i] = target[i];
}
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[seq[j]];
}
// fill the first row
eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
eh[j].h = eh[j-1].h - e_ins;
// adjust $w if it is too large
k = m * m;
for (i = 0, max = 0; i < k; ++i) // get the max score
max = max > mat[i]? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
max_ins = max_ins > 1? max_ins : 1;
w = w < max_ins? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1? max_del : 1;
w = w < max_del? w : max_del; // TODO: is this necessary?
//fprintf(stderr, "%d\n", w);
// DP loop
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
max_off = 0;
beg = 0, end = qlen;
//int print_flag = 0; //(qlen == 116 && tlen == 241);
//fprintf(stderr, "%d %d %d\n", print_flag, qlen, tlen);
#ifdef DEBUG_SW_EXTEND
// Full-size debug matrices; cells never computed keep NO_VAL.
int dii, djj;
int16_t ins[tlen + 1][qlen + 2];
int16_t del[tlen + 1][qlen + 2];
int16_t score[tlen + 1][qlen + 2];
for (dii = 0; dii <= tlen; ++dii)
{
for (djj = 0; djj <= qlen; ++djj)
{
ins[dii][djj] = del[dii][djj] = score[dii][djj] = NO_VAL;
}
}
for (dii = 1; dii <= tlen; ++dii)
{
del[dii][0] = MAX(0, h0 - o_del - e_del * dii);
score[dii][0] = del[dii][0];
}
for (djj = 1; djj <= qlen; ++djj)
{
ins[0][djj] = MAX(0, h0 - o_ins - e_ins * djj);
score[0][djj] = ins[0][djj];
}
ins[0][0] = del[0][0] = score[0][0] = h0;
#endif
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_CALC_NUM
// Count the cells a plain fixed band would visit, for comparison with the number
// actually computed below.
int bsw_cal_num = 0;
int real_cal_num = 0;
for (i = 0; i < tlen; ++i)
{
int beg = MAX(0, i - w);
int end = MIN(qlen, i + w + 1);
if (beg >= end) break;
bsw_cal_num += end - beg;
}
fprintf(gf[0], "start\n%d\n", bsw_cal_num);
#endif
#endif
#ifdef ELIMINATE_DIFF_3
int prun_end = qlen; // for test diff_3
#endif
for (i = 0; LIKELY(i < tlen); ++i) {
// Note: this `m` (row maximum) shadows the alphabet-size parameter.
int t, f = 0, h1, m = 0, mj = -1;
int8_t *q = &qp[ref[i] * qlen];
// apply the band and the constraint (if provided)
if (beg < i - w) beg = i - w;
if (end > i + w + 1) end = i + w + 1;
if (end > qlen) end = qlen; // has no effect
// compute the first column
if (beg == 0) {
h1 = h0 - (o_del + e_del * (i + 1));
if (h1 < 0) h1 = 0;
} else h1 = 0;
//m = h1; // would resolve the first known difference against VP-BSW (left boundary)
for (j = beg; LIKELY(j < end); ++j) {
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_CALC_NUM
real_cal_num++;
#endif
#endif
#ifdef DEBUG_SW_EXTEND
ins[i+1][j+1] = f;
#endif
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Similar to SSE2-SW, cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
eh_t *p = &eh[j];
int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
p->h = h1; // set H(i,j-1) for the next row
M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M"; also keeps the score from going below 0 (the SW vs NW difference)
h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0
h = h > f? h : f;
#ifdef ELIMINATE_DIFF_3
if (j >= prun_end && h==0) break; // for test diff_3
#endif
h1 = h; // save H(i,j) to h1 for the next column
#ifdef DEBUG_SW_EXTEND
score[i+1][j+1] = h;
#endif
mj = m > h? mj : j; // record the position where max score is achieved
m = m > h? m : h; // m is stored at eh[mj+1]
t = M - oe_del;
t = t > 0? t : 0;
e -= e_del;
#ifdef DEBUG_SW_EXTEND
del[i + 1][j + 1] = e;
#endif
e = e > t? e : t; // computed E(i+1,j)
#ifdef DEBUG_SW_EXTEND
// del[i+1][j+1] = e;
#endif
p->e = e; // save E(i+1,j) for the next row
t = M - oe_ins;
t = t > 0? t : 0;
f -= e_ins;
f = f > t? f : t; // computed F(i,j+1)
}
eh[end].h = h1; eh[end].e = 0;
// The row reached the last query column: track the best end-to-end score.
if (j == qlen) {
max_ie = gscore > h1? max_ie : i;
gscore = gscore > h1? gscore : h1;
}
if (m == 0) break;
if (m > max) {
max = m, max_i = i, max_j = mj;
max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
//fprintf(stderr, "%d %d %d %d\n", i, mj, max_off, m);
} else if (zdrop > 0) {
if (i - max_i > mj - max_j) {
if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
} else {
if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
}
}
// update beg and end for the next round
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); // open question: why is f (the insertion score) not considered here?
beg = j;
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
#ifdef ELIMINATE_DIFF_3
prun_end = j + 2 < qlen ? j + 2 : qlen; end = qlen; // for test diff_3
#else
end = j + 2 < qlen? j + 2 : qlen;
#endif
// beg = 0; end = qlen; // uncomment this line for debugging
// if (print_flag) {
// fprintf(stderr, "beg: %d; end: %d\n", beg, end);
// }
}
#ifdef DEBUG_FILE_OUTPUT
#ifdef DEBUG_SW_EXTEND
// Dump the summary line and the H / F / E matrices to gf[0] / gf[1] / gf[2].
fprintf(gf[0], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[1], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[2], "qlen: %d, tlen: %d, h0: %d, w: %d, mi: %d, mj: %d, mie: %d, max_off: %d, score: %d, max: %d\n", qlen, tlen, h0, w, max_i + 1, max_j + 1, max_ie + 1, max_off, gscore, max);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
for (djj = 0; djj < qlen; ++djj)
{
fprintf(gf[0], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[1], "%-4c", "ACGTN"[query[djj]]);
fprintf(gf[2], "%-4c", "ACGTN"[query[djj]]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
for (dii = 0; dii <= tlen; ++dii)
{
if (dii > 0)
{
fprintf(gf[0], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[1], "%-4c", "ACGTN"[target[dii - 1]]);
fprintf(gf[2], "%-4c", "ACGTN"[target[dii - 1]]);
}
else
{
fprintf(gf[0], "%-4d", -1);
fprintf(gf[1], "%-4d", -1);
fprintf(gf[2], "%-4d", -1);
}
for (djj = 0; djj <= qlen; ++djj)
{
fprintf(gf[0], "%-4d", score[dii][djj]);
fprintf(gf[1], "%-4d", ins[dii][djj]);
fprintf(gf[2], "%-4d", del[dii][djj]);
}
fprintf(gf[0], "\n");
fprintf(gf[1], "\n");
fprintf(gf[2], "\n");
}
#endif
#endif
#ifdef DEBUG_FILE_OUTPUT
#ifdef COUNT_CALC_NUM
fprintf(gf[0], "%d\nend\n", real_cal_num);
#endif
#endif
free(eh); free(qp); free(qmem);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;
if (_gscore) *_gscore = gscore;
if (_max_off) *_max_off = max_off;
return max;
}

View File

@ -0,0 +1,454 @@
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <stdio.h>
#include <immintrin.h>
#include <emmintrin.h>
#include "utils.h"
#define ELIMINATE_DIFF_1
#define SIMD_WIDTH 32
// Byte-lane prefix masks for a SIMD_WIDTH-byte vector: row r has its first
// r + 1 bytes set to 0xff and the remaining bytes set to 0. SIMD_INIT loads
// each row into h_vec_mask[r], and SIMD_REMOVE_EXTRA ANDs results with
// h_vec_mask[end-j] to zero the lanes past the last cell of a partial block.
static const uint8_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
{0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
};
//static const uint8_t reverse_mask[SIMD_WIDTH] = {7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8};
#define permute_mask _MM_SHUFFLE(0, 1, 2, 3)
//const int permute_mask = _MM_SHUFFLE(0, 1, 2, 3);
// Initialise the SIMD working variables.
// Expands to declarations plus statements, so it must appear where both are
// allowed. It reads o_del, e_del, o_ins, e_ins, a, b and the loop variable i
// from the enclosing scope, and declares: oe_del / oe_ins (gap open + extend),
// byte-broadcast vectors for the gap penalties and for the match (a) and
// mismatch (b) scores, the constant vectors amb_sc_vec (all 1) and amb_vec
// (all 4) used by SIMD_CMP_SEQ, and h_vec_mask[], a vector copy of every row
// of h_vec_int_mask. max_vec is declared here but not initialised.
#define SIMD_INIT \
int oe_del = o_del + e_del, oe_ins = o_ins + e_ins; \
__m256i zero_vec; \
__m256i max_vec; \
__m256i oe_del_vec; \
__m256i oe_ins_vec; \
__m256i e_del_vec; \
__m256i e_ins_vec; \
__m256i h_vec_mask[SIMD_WIDTH]; \
zero_vec = _mm256_setzero_si256(); \
oe_del_vec = _mm256_set1_epi8(oe_del); \
oe_ins_vec = _mm256_set1_epi8(oe_ins); \
e_del_vec = _mm256_set1_epi8(e_del); \
e_ins_vec = _mm256_set1_epi8(e_ins); \
__m256i match_sc_vec = _mm256_set1_epi8(a); \
__m256i mis_sc_vec = _mm256_set1_epi8(b); \
__m256i amb_sc_vec = _mm256_set1_epi8(1); \
__m256i amb_vec = _mm256_set1_epi8(4); \
for (i = 0; i < SIMD_WIDTH; ++i) h_vec_mask[i] = _mm256_loadu_si256((__m256i *)(&h_vec_int_mask[i]));
/*
 * Naming used by the macros below:
 * e  - E scores (gap along the reference, "ref")
 * f  - F scores (gap along the query, "seq")
 * m  - M scores (match / mismatch)
 * h  - H scores (best of E, F and M)
 */
// Load one 32-byte block of inputs with unaligned loads.
// Reads from the enclosing scope: the score arrays mA1, eA1, fA1 and hA0, the
// sequences seq and ref, the indices j and i, and tlen. The "j1" names are the
// loads taken at offset j-1; ts_vec is loaded from ref[tlen - i].
#define SIMD_LOAD \
__m256i m1 = _mm256_loadu_si256((__m256i*) (&mA1[j])); \
__m256i e1 = _mm256_loadu_si256((__m256i*) (&eA1[j])); \
__m256i m1j1 = _mm256_loadu_si256((__m256i*) (&mA1[j-1])); \
__m256i f1j1 = _mm256_loadu_si256((__m256i*) (&fA1[j-1])); \
__m256i h0j1 = _mm256_loadu_si256((__m256i*) (&hA0[j-1])); \
__m256i qs_vec = _mm256_loadu_si256((__m256i*) (&seq[j-1])); \
__m256i ts_vec = _mm256_loadu_si256((__m256i*) (&ref[tlen - i]));
// Compare the reference and query bases lane by lane and derive the scores.
// After expansion, per byte lane:
//   match_score_vec = a   where the two bases are equal and neither equals 4
//   mis_score_vec   = b   where the bases differ and neither equals 4
//   mis_score_vec   = 1   where either base equals 4 (value of amb_sc_vec)
// and both are 0 in every other lane. Uses qs_vec / ts_vec from SIMD_LOAD and
// the constant vectors from SIMD_INIT.
#define SIMD_CMP_SEQ \
__m256i match_mask_vec = _mm256_cmpeq_epi8(qs_vec, ts_vec); \
__m256i mis_score_vec = _mm256_andnot_si256(match_mask_vec, mis_sc_vec); \
__m256i match_score_vec = _mm256_and_si256(match_sc_vec, match_mask_vec); \
__m256i q_amb_mask_vec = _mm256_cmpeq_epi8(qs_vec, amb_vec); \
__m256i t_amb_mask_vec = _mm256_cmpeq_epi8(ts_vec, amb_vec); \
__m256i amb_mask_vec = _mm256_or_si256(q_amb_mask_vec, t_amb_mask_vec); \
__m256i amb_score_vec = _mm256_and_si256(amb_mask_vec, amb_sc_vec); \
mis_score_vec = _mm256_andnot_si256(amb_mask_vec, mis_score_vec); \
mis_score_vec = _mm256_or_si256(amb_score_vec, mis_score_vec); \
match_score_vec = _mm256_andnot_si256(amb_mask_vec, match_score_vec);
// Vectorised computation of h, e, f and m for one block, on unsigned bytes.
// Each "max(x, p) then saturating-subtract p" pair yields max(x - p, 0):
//   en_vec = max(m1 - oe_del, e1 - e_del)
//   fn_vec = max(m1j1 - oe_ins, f1j1 - e_ins)
//   mn_vec = max(h0j1 + match_score - mis_score, 0), forced to 0 in lanes
//            where h0j1 is 0
//   hn_vec = max(en_vec, fn_vec, mn_vec)
// The addition uses _mm256_adds_epu8, so it saturates at 255.
#define SIMD_COMPUTE \
__m256i en_vec0 = _mm256_max_epu8(m1, oe_del_vec); \
en_vec0 = _mm256_subs_epu8(en_vec0, oe_del_vec); \
__m256i en_vec1 = _mm256_max_epu8(e1, e_del_vec); \
en_vec1 = _mm256_subs_epu8(en_vec1, e_del_vec); \
__m256i en_vec = _mm256_max_epu8(en_vec0, en_vec1); \
__m256i fn_vec0 = _mm256_max_epu8(m1j1, oe_ins_vec); \
fn_vec0 = _mm256_subs_epu8(fn_vec0, oe_ins_vec); \
__m256i fn_vec1 = _mm256_max_epu8(f1j1, e_ins_vec); \
fn_vec1 = _mm256_subs_epu8(fn_vec1, e_ins_vec); \
__m256i fn_vec = _mm256_max_epu8(fn_vec0, fn_vec1); \
__m256i mn_vec0 = _mm256_adds_epu8(h0j1, match_score_vec); \
mn_vec0 = _mm256_max_epu8(mn_vec0, mis_score_vec); \
mn_vec0 = _mm256_subs_epu8(mn_vec0, mis_score_vec); \
__m256i mn_mask = _mm256_cmpeq_epi8(h0j1, zero_vec); \
__m256i mn_vec = _mm256_andnot_si256(mn_mask, mn_vec0); \
__m256i hn_vec0 = _mm256_max_epu8(en_vec, fn_vec); \
__m256i hn_vec = _mm256_max_epu8(hn_vec0, mn_vec);
// Store the vector results: fold hn_vec into the running per-lane maximum
// max_vec, then write the new E, F, M and H blocks to eA2, fA2, mA2 and hA2
// at offset j (unaligned stores, 32 bytes each).
#define SIMD_STORE \
max_vec = _mm256_max_epu8(max_vec, hn_vec); \
_mm256_storeu_si256((__m256i*)&eA2[j], en_vec); \
_mm256_storeu_si256((__m256i*)&fA2[j], fn_vec); \
_mm256_storeu_si256((__m256i*)&mA2[j], mn_vec); \
_mm256_storeu_si256((__m256i*)&hA2[j], hn_vec);
// Remove the surplus lanes of a partial block: AND every result with
// h_vec_mask[end-j], which keeps the first end-j+1 byte lanes and zeroes the
// rest. Meant to be expanded between SIMD_COMPUTE and SIMD_STORE.
#define SIMD_REMOVE_EXTRA \
en_vec = _mm256_and_si256(en_vec, h_vec_mask[end-j]); \
fn_vec = _mm256_and_si256(fn_vec, h_vec_mask[end-j]); \
mn_vec = _mm256_and_si256(mn_vec, h_vec_mask[end-j]); \
hn_vec = _mm256_and_si256(hn_vec, h_vec_mask[end-j]);
// Horizontal byte maximum of a 256-bit vector. The byte shifts (8, 4, 2, 1)
// act inside each 128-bit lane, so afterwards byte 0 of each lane holds that
// lane's maximum; the last statement merges the two lanes through maxVal[],
// which must be a uint8_t view of the same vector in the enclosing scope.
#define __max_32(xx) \
do { \
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 8)); \
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 4)); \
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 2)); \
(xx) = _mm256_max_epu8((xx), _mm256_srli_si256((xx), 1)); \
maxVal[0] = MAX(maxVal[0], maxVal[16]); \
} while (0)
// Find the maximum value and its position (alternative to SIMD_FIND_MAX; not
// expanded anywhere in the code visible here).
// NOTE(review): after __max_32 only byte 0 of max_vec is guaranteed to hold the
// overall maximum, yet the scan below compares all 32 lanes of hA2 against
// max_vec as if it had been broadcast — verify before enabling this macro.
#define SIMD_FIND_MAX_NEW \
uint8_t *maxVal = (uint8_t *)&(max_vec); \
__max_32(max_vec); \
m = MAX(m, maxVal[0]); \
if (maxVal[0] > 0 && m >= max) { \
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) { \
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) { \
int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
} \
} \
}
// Find this round's maximum H value and where it occurred.
// The alignr steps rotate max_vec inside each 128-bit lane and the final
// permute2x128 swaps the two lanes, so afterwards every byte of max_vec holds
// the overall maximum (maxVal[0] reads it). m is raised to that value; when it
// is non-zero and m >= max, hA2[beg..end] is rescanned in 32-byte steps and
// mj / mi are set from the highest matching lane of each block that contains
// the maximum (a later block overwrites an earlier one).
// Declares maxVal in the enclosing scope and overwrites i and j when it scans.
#define SIMD_FIND_MAX \
uint8_t *maxVal = (uint8_t *)&max_vec; \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 1)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 2)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 3)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 4)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 5)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 6)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 7)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_alignr_epi8(max_vec, max_vec, 8)); \
max_vec = _mm256_max_epu8(max_vec, _mm256_permute2x128_si256(max_vec, max_vec, 0x01)); \
m = MAX(m, maxVal[0]); \
if (maxVal[0] > 0 && m >= max) { \
for (j = beg, i = iend; j <= end; j += SIMD_WIDTH, i -= SIMD_WIDTH) { \
__m256i h2_vec = _mm256_loadu_si256((__m256i *)(&hA2[j])); \
__m256i vcmp = _mm256_cmpeq_epi8(h2_vec, max_vec); \
uint32_t mask = _mm256_movemask_epi8(vcmp); \
if (mask > 0) { \
int pos = SIMD_WIDTH - 1 - __builtin_clz(mask); \
mj = j - 1 + pos; \
mi = i - 1 - pos; \
} \
} \
}
// Rotate the score arrays after each iteration: the H pointers shift by one
// generation (hA0 <- hA1 <- hA2, with the old hA0 storage reused as hA2), and
// the "1"/"2" pointers of E, F and M are swapped. Declares a local tmp, so it
// can be expanded only once per scope.
#define SWAP_DATA_POINTER \
uint8_t * tmp=hA0; \
hA0 = hA1; hA1 = hA2; hA2 = tmp; \
tmp = eA1; eA1 = eA2; eA2 = tmp; \
tmp = fA1; fA1 = fA2; fA2 = tmp; \
tmp = mA1; mA1 = mA2; mA2 = tmp;
/*
 * Banded extension alignment of `query` against `target`, computed with AVX2
 * on unsigned 8-bit scores, 32 cells per vector operation.
 *
 * The routine copies both sequences into the scratch buffer `buf` (reversing
 * the query when is_left is set, otherwise reversing the target), zeroes the
 * score arrays, narrows the band width w, and then iterates D from 1 up to
 * tlen + qlen - 1. Each iteration computes cells beg..end with the SIMD_*
 * macros, records the best score and its position, applies the z-drop test,
 * shrinks [beg, end] to the cells that still carry a value, and rotates the
 * score arrays.
 *
 * Returns the best score found (at least h0). The optional output pointers
 * receive the positions described next to each parameter; each may be NULL.
 *
 * Notes:
 *  - h0 must be > 0 (asserted).
 *  - mat is read only to find the largest entry for the band-width limit; the
 *    per-cell scores come from a, b and the constants set up in SIMD_INIT.
 *  - NOTE(review): all scores live in uint8_t lanes with saturating adds;
 *    presumably callers only use this variant when scores cannot exceed 255 —
 *    confirm against the call sites.
 *  - NOTE(review): the ELIMINATE_DIFF_1 branch compares against the literals
 *    4 and 3 (see its "b == 4" comment) rather than against b.
 */
int ksw_extend2_avx2_u8(int qlen, // query length: number of query bases to extend over
const uint8_t *query, // base sequence of the read
int tlen, // target length: length of the reference segment
const uint8_t *target, // reference sequence
int is_left, // non-zero when extending to the left
int m, // number of base kinds (5)
const int8_t *mat, // m*m matrix of scores for each query/target base pair
int o_del, // deletion gap-open penalty
int e_del, // deletion gap-extension penalty
int o_ins, // insertion gap-open penalty
int e_ins, // insertion gap-extension penalty
int a, // score for a matching base
int b, // penalty for a mismatching base (positive number)
int w, // early-pruning band width (w = 100): maximum distance between a match position and beg
int end_bonus,
int zdrop,
int h0, // initial score of this seed (number of exactly matched query bases)
int *_qle, // out: query position of the base that reached the overall best score
int *_tle, // out: reference position of the base that reached the overall best score
int *_gtle, // out: target length used when the whole query is aligned
int *_gscore, // out: end-to-end alignment score of the query
int *_max_off, // out: largest query/reference position difference seen when a best score was recorded
buf_t *buf) // scratch buffer allocated earlier by the caller (grown here when too small)
{
uint8_t *mA,*hA, *eA, *fA, *mA1, *mA2, *hA0, *hA1, *eA1, *fA1, *hA2, *eA2, *fA2; // hA0 holds H from the iteration before last; the others hold the previous H, E, F, M
uint8_t *seq, *ref;
uint8_t *mem, *qtmem, *vmem;
int seq_size = qlen + SIMD_WIDTH, ref_size = tlen + SIMD_WIDTH;
int i, ibeg, D, j, k, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
int Dloop = tlen + qlen; // loop exit bound
int span, beg1, end1; // used for the band boundary computation
int col_size = qlen + 2 + SIMD_WIDTH;
int val_mem_size = (col_size * 9 + 31) >> 5 << 5; // rounded up to a multiple of 32 bytes
int mem_size = seq_size + ref_size + val_mem_size;
SIMD_INIT; // set up the data used by the SIMD code
assert(h0 > 0);
// allocate memory
//mem = malloc(mem_size);
// NOTE(review): the realloc() result is not checked; on failure buf->addr
// becomes NULL (the old block is lost) and is written to just below.
if (buf->m < mem_size) {
buf->m = mem_size;
buf->addr = (uint8_t *)realloc(buf->addr, mem_size);
}
mem = buf->addr;
qtmem = &mem[0];
seq=(uint8_t*)&qtmem[0]; ref=(uint8_t*)&qtmem[seq_size];
// Copy the sequences; exactly one of them is reversed depending on direction.
if (is_left) {
for (i=0; i<qlen; ++i) seq[i] = query[qlen - 1 - i];
for (i=0; i<tlen; ++i) ref[i] = target[i];
} else {
for (i=0; i<qlen; ++i) seq[i] = query[i];
for (i=0; i<tlen; ++i) ref[i] = target[tlen - 1 - i];
}
// Zero all score arrays, then carve them out of vmem:
// 3 columns for H, 2 each for M, E and F (9 * col_size in total).
vmem = &ref[ref_size];
for (i=0; i<val_mem_size; i+=SIMD_WIDTH) {
_mm256_storeu_si256((__m256i*)&vmem[i], zero_vec);
}
hA = &vmem[0];
mA = &vmem[col_size * 3];
eA = &vmem[col_size * 5];
fA = &vmem[col_size * 7];
hA0 = &hA[0]; hA1 = &hA[col_size]; hA2 = &hA1[col_size];
mA1 = &mA[0]; mA2 = &mA[col_size];
eA1 = &eA[0]; eA2 = &eA[col_size];
fA1 = &fA[0]; fA2 = &fA[col_size];
// adjust $w if it is too large
k = m * m;
// get the max score
for (i = 0, max = 0; i < k; ++i) max = max > mat[i]? max : mat[i];
max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
max_ins = max_ins > 1? max_ins : 1;
w = w < max_ins? w : max_ins;
max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
max_del = max_del > 1? max_del : 1;
w = w < max_del? w : max_del; // TODO: is this necessary?
if (tlen < qlen) w = MIN(tlen - 1, w);
// DP loop
max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;;
max_off = 0;
beg = 1; end = qlen;
// init h0
hA0[0] = h0; // top-left corner
if (qlen == 0 || tlen == 0) Dloop = 0; // guard against unexpected (empty) input
if (w >= qlen) { max_ie = 0; gscore = 0; }
int m_last=0;
int iend;
#ifdef ELIMINATE_DIFF_1
int midx = 1, icheck = 0, checkspecial = 1;
int m3 = 0, m2 = 0, m1 = 0;
// int marr[10] = {0};
// int marr[b]; memset(marr, 0, 4 * b);
#endif
for (D = 1; LIKELY(D < Dloop); ++D) {
// Be careful with the boundary conditions: tlen may be greater than, equal to, or less than qlen
if (D > tlen) {
span = MIN(Dloop-D, w);
beg1 = MAX(D-tlen+1, ((D-w) / 2) + 1);
} else {
span = MIN(D-1, w);
beg1 = MAX(1, ((D-w) / 2) + 1);
}
end1 = MIN(qlen, beg1+span);
if (beg < beg1) beg = beg1;
if (end > end1) end = end1;
if (beg > end) break; // nothing left to compute, so leave now; otherwise hA2 would not be written and would still hold the previous round's hA0 values, which causes bugs
iend = D - (beg - 1); // reference position where this round starts (reverse order)
span = end - beg;
ibeg = iend - span - 1; // 0-based reference index
// values recorded for this round (this m shadows the parameter m)
int m = 0, mj = -1, mi = -1;
max_vec = zero_vec;
// the boundaries need special handling
// left boundary: handle f (insert)
if (ibeg == 0) { hA1[end] = MAX(0, h0 - (o_ins + e_ins * end)); m = hA1[end]; }
// top boundary
if (beg == 1) { hA1[0] = MAX(0, h0 - (o_del + e_del * iend)); }
else if (D & 1) {
hA1[beg - 1] = 0;
hA2[beg - 1] = 0;
}
for (j=beg, i=iend; j<=end+1-SIMD_WIDTH; j+=SIMD_WIDTH, i-=SIMD_WIDTH) {
// load the data
SIMD_LOAD;
// compare the sequences and compute the scores
SIMD_CMP_SEQ;
// compute
SIMD_COMPUTE;
// store the results
SIMD_STORE;
}
// the remaining cells (fewer than SIMD_WIDTH)
if (j <= end) {
// load the data
SIMD_LOAD;
// compare the sequences and compute the scores
SIMD_CMP_SEQ;
// compute
SIMD_COMPUTE;
// drop the lanes that were computed beyond `end`
SIMD_REMOVE_EXTRA;
// store the results
SIMD_STORE;
}
SIMD_FIND_MAX;
#ifdef ELIMINATE_DIFF_1
#if 0
if (hA1[0] < b && checkspecial) {
int mi;
if (hA1[0] == b - 1) {
icheck = iend + 1;
}
for (mi = 0; mi < b - 1; ++mi) {
if (midx - mi > 0)
marr[mi] = MAX(marr[mi], hA2[midx - mi]);
}
midx += 1;
if (ibeg > icheck)
{
int stopCalc = 0;
for (mi = 0; mi < b - 1; ++mi)
{
stopCalc |= !marr[mi];
}
if (stopCalc)
break;
else
checkspecial = 0;
}
}
#else
if (hA1[0] < 4 && checkspecial)
{ // b == 4
if (hA1[0] == 3)
{
icheck = iend + 1;
}
else if (midx == 2)
{
m2 = MAX(m2, hA2[midx - 1]);
}
else
{
m2 = MAX(m2, hA2[midx - 1]);
m1 = MAX(m1, hA2[midx - 2]);
}
m3 = MAX(m3, hA2[midx]);
midx += 1;
if (ibeg > icheck)
{
if (!m1 || !m2 || !m3)
break;
else
checkspecial = 0;
}
}
#endif
#endif
// note the value j has once the loops above are done
j = end + 1;
// The band reached the end of the query: track the best end-to-end score.
if (j == qlen + 1) {
max_ie = gscore > hA2[qlen] ? max_ie : ibeg;
gscore = gscore > hA2[qlen] ? gscore : hA2[qlen];
}
if (m == 0 && m_last==0) break; // take care: traversing by anti-diagonal differs from traversing by column here
if (m > max) {
max = m, max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi)? max_off : abs(mj - mi);
}
else if (m == max && max_i >= mi && mj > max_j) {
max_i = mi, max_j = mj;
max_off = max_off > abs(mj - mi) ? max_off : abs(mj - mi);
}
else if (zdrop > 0 && mi > -1) {
// z-drop: stop once the score has fallen too far below the best one
if (mi - max_i > mj - max_j) {
if (max - m - ((mi - max_i) - (mj - max_j)) * e_del > zdrop) break;
} else {
if (max - m - ((mj - max_j) - (mi - max_i)) * e_ins > zdrop) break;
}
}
// adjust the range to compute in the next round
for (j = beg; LIKELY(j <= end); ++j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; }
beg = j;
for (j = end+1; LIKELY(j >= beg); --j) { int has_val = hA1[j-1] | hA2[j]; if (has_val) break; else hA0[j-1]=0; }
end = j + 1 <= qlen? j + 1 : qlen;
m_last = m;
// swap m, h, e, f
SWAP_DATA_POINTER;
}
//free(mem);
if (_qle) *_qle = max_j + 1;
if (_tle) *_tle = max_i + 1;
if (_gtle) *_gtle = max_ie + 1;
if (_gscore) *_gscore = gscore;
if (_max_off) *_max_off = max_off;
return max;
}

8
main.c
View File

@ -56,8 +56,10 @@ int main_maxk(int argc, char *argv[]);
int bwa_bwt2kmer(int argc, char* argv[]); // create kmer-index from bwt
int bwa_bwt2fullbytesa(int argc, char* argv[]); // create full byte-based Suffix-Array
int bwa_bwt2hyb(int argc, char* argv[]); // create hybrid-index
int bwa_pac2hybpac(int argc, char* argv[]); // convert pac to hyb.pac
int bwa_extract_sa(int argc, char* argv[]); // extract suffix array from non-sampled suffix array
int bwa_extract_byte_sa(int argc, char* argv[]); // extract suffix array from non-sampled suffix array
int main_shm_hyb(int argc, char* argv[]); // manage hybrid index in shared memory
int hyb_test(int argc, char* argv[]); // for test
@ -86,8 +88,10 @@ static int usage()
fprintf(stderr, " bwt2fullbytesa generate SA(using byte array) from BWT and Occ\n");
fprintf(stderr, " bwt2kmer generate kmer hash index from bwt to accelarate the first 14 bases in seeding process.\n");
fprintf(stderr, " bwt2hyb generate hybrid index from BWT\n");
fprintf(stderr, " pac2hybpac convert pac to hyb.pac\n");
fprintf(stderr, " extractsa generate sa from full byte suffix array\n");
fprintf(stderr, " extractbytesa generate byte sa from full byte suffix array\n");
fprintf(stderr, " hybshm manage hybrid index in shared memory\n");
fprintf(stderr, "\n");
fprintf(stderr,
"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
@ -128,9 +132,11 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwt2fullbytesa") == 0) ret = bwa_bwt2fullbytesa(argc - 1, argv + 1);
else if (strcmp(argv[1], "bwt2kmer") == 0) ret = bwa_bwt2kmer(argc - 1, argv + 1);
else if (strcmp(argv[1], "bwt2hyb") == 0) ret = bwa_bwt2hyb(argc - 1, argv + 1);
else if (strcmp(argv[1], "pac2hybpac") == 0) ret = bwa_pac2hybpac(argc - 1, argv + 1);
else if (strcmp(argv[1], "extractsa") == 0) ret = bwa_extract_sa(argc - 1, argv + 1);
else if (strcmp(argv[1], "extractbytesa") == 0) ret = bwa_extract_byte_sa(argc - 1, argv + 1);
else if (strcmp(argv[1], "hybtest") == 0) ret = hyb_test(argc - 1, argv + 1);
else if (strcmp(argv[1], "hybshm") == 0) ret = main_shm_hyb(argc - 1, argv + 1);
else if (strcmp(argv[1], "hybtest") == 0) ret = hyb_test(argc - 1, argv + 1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;

View File

@ -222,8 +222,10 @@ int main_pemerge(int argc, char *argv[])
gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0;
pem_opt_t *opt;
int64_t seq_size = 0;
int m = 0;
opt = pem_opt_init();
opt = pem_opt_init();
while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) {
if (c == 'm') flag |= 1;
else if (c == 'u') flag |= 2;
@ -269,10 +271,11 @@ int main_pemerge(int argc, char *argv[])
}
memset(cnt, 0, 8 * (MAX_ERR+1));
while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) {
process_seqs(opt, n, bseq, cnt);
free(bseq);
}
bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2, 1, &seq_size, &m, &bseq);
while (n > 0) {
process_seqs(opt, n, bseq, cnt);
bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2, 1, &seq_size, &m, &bseq);
}
fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]);
for (i = 1; i <= MAX_ERR; ++i)

View File

@ -133,9 +133,11 @@ int display_stats(int nthreads)
fprintf(stderr, "time_ksw_loop: %0.2lf s\n", gprof[G_KSW_LOOP] * 1.0 / proc_freq);
fprintf(stderr, "time_ksw_end_loop: %0.2lf s\n", gprof[G_KSW_END_LOOP] * 1.0 / proc_freq);
#if SHOW_DATA_PERF
fprintf(stderr, "seq num: %ld\n", gdat[0]);
fprintf(stderr, "full num: %ld\n", gdat[1]);
fprintf(stderr, "percent: %0.2lf%c\n", (double)gdat[1] / gdat[0] * 100, '%');
#endif
fprintf(stderr, "all_match_len: %ld\n", all_match_len);
fprintf(stderr, "all_seq_num: %ld\n", all_seq_num);

View File

@ -9,6 +9,8 @@ Date : 2024/04/06
#ifndef PROFILING_H_
#define PROFILING_H_
#include <emmintrin.h>
#include <immintrin.h>
#include <stdint.h>
#define USE_RDTSC 1

167
share_mem.c 100644
View File

@ -0,0 +1,167 @@
#include "share_mem.h"
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "utils.h"
#define SHM_NAME_LIST "/shm_hybbwa_name_list"
#define SHM_HYB_PREFIX "/shm_hybbwa_"
#define SHM_NAME_LIST_SIZE 65535
// Convert a byte count to gibibytes (three successive divisions by 1024).
static inline double get_GB(double bytes) {
    const double step = 1024;
    double scaled = bytes;

    scaled = scaled / step;
    scaled = scaled / step;
    scaled = scaled / step;
    return scaled;
}
// Return the file-name part of a path: the text after the last '/', or the
// whole string when it contains no '/'. The result points into file_path.
const char* get_fn_from_path(const char* file_path) {
    const char* last_slash = strrchr(file_path, '/');

    return last_slash == NULL ? file_path : last_slash + 1;
}
// Copy one index file (idx_prefix + suffix) into its own shared-memory object
// and append a record to the name list mapped at shm_idx_list.
//
// Name-list layout, as used throughout this file: bytes 0-1 hold the number of
// records, bytes 2-3 the number of bytes in use, and records follow from
// offset 4, each an 8-byte file size followed by the NUL-terminated file name.
//
// Returns 0 on success and -1 on failure. A shared-memory object created by a
// failed call is unlinked again, and the record is only published once the
// data has been loaded.
static int shm_keep_one_file(const char* idx_prefix, const char* suffix, uint8_t* shm_idx_list) {
    char full_path[MAX_PATH];
    char share_name[MAX_PATH];
    uint16_t* shm_idx_cnt = (uint16_t*)shm_idx_list;         // number of indices in shared memory
    uint16_t* shm_byte_cnt = (uint16_t*)(shm_idx_list + 2);  // bytes of the name list in use
    const char* file_name;
    struct stat st;
    uint64_t idx_bytes;
    size_t record_len;
    FILE* fp;
    int shmid, n;
    double sec_time = realtime();

    // Build both names with bounds checking instead of strcpy/strcat.
    n = snprintf(full_path, sizeof full_path, "%s%s", idx_prefix, suffix);
    if (n < 0 || (size_t)n >= sizeof full_path)
        return -1;
    file_name = get_fn_from_path(full_path);
    n = snprintf(share_name, sizeof share_name, "%s%s", SHM_HYB_PREFIX, file_name);
    if (n < 0 || (size_t)n >= sizeof share_name)
        return -1;

    // O_EXCL: refuse to overwrite an index that is already kept.
    if ((shmid = shm_open(share_name, O_CREAT | O_RDWR | O_EXCL, 0644)) < 0) {
        perror("shm_open()");
        return -1;
    }
    err_check_true(stat(full_path, &st), 0);
    if (ftruncate(shmid, st.st_size) < 0)
        goto fail;

    record_len = 8 + strlen(file_name) + 1;
    if (record_len + *shm_byte_cnt > SHM_NAME_LIST_SIZE)
        goto fail;

    if (st.st_size > 0) {  // mmap() rejects a zero length
        uint8_t* mem = (uint8_t*)mmap(0, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, shmid, 0);
        if (mem == MAP_FAILED)
            goto fail;
        fp = xopen(full_path, "r");
        err_fread_noeof(mem, 1, st.st_size, fp);
        err_fclose(fp);
        munmap(mem, st.st_size);
    }
    close(shmid);

    // Store the size as a fixed 8-byte value regardless of the width of off_t.
    idx_bytes = (uint64_t)st.st_size;
    memcpy(shm_idx_list + *shm_byte_cnt, &idx_bytes, 8);
    memcpy(shm_idx_list + *shm_byte_cnt + 8, file_name, record_len - 8);
    *shm_byte_cnt += (uint16_t)record_len;
    *shm_idx_cnt += 1;

    fprintf(stderr, "%s, %0.2f GB, %0.2f s\n", file_name, get_GB(st.st_size), realtime() - sec_time);
    return 0;

fail:
    close(shmid);
    shm_unlink(share_name);
    return -1;
}

// Keep a hybrid index in shared memory.
//
// Opens the shared name list (creating and initialising it on first use) and
// loads the four files idx_prefix + HYB_PAC_SUFFIX / HYB_SA_SUFFIX /
// HYB_KMER_SUFFIX / HYB_DATA_SUFFIX, in that order, each into its own
// shared-memory object.
//
// Returns 0 on success and -1 as soon as any step fails; files loaded before
// the failure stay registered.
int shm_keep_hyb(const char* idx_prefix) {
    static const char* const suffixes[] = {HYB_PAC_SUFFIX, HYB_SA_SUFFIX, HYB_KMER_SUFFIX, HYB_DATA_SUFFIX};
    uint8_t* shm_idx_list;
    int shmid, init_shm = 0, rc = 0;
    size_t i;

    // Open the shared memory that stores the index names.
    if ((shmid = shm_open(SHM_NAME_LIST, O_RDWR, 0)) < 0) {
        // Not created yet: create and initialise it.
        shmid = shm_open(SHM_NAME_LIST, O_CREAT | O_RDWR | O_EXCL, 0644);
        init_shm = 1;
    }
    if (shmid < 0)
        return -1;
    if (ftruncate(shmid, SHM_NAME_LIST_SIZE) < 0) {
        close(shmid);
        return -1;
    }
    shm_idx_list = (uint8_t*)mmap(0, SHM_NAME_LIST_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shmid, 0);
    close(shmid);  // the mapping stays valid after the descriptor is closed
    if (shm_idx_list == MAP_FAILED)
        return -1;

    if (init_shm) {  // the index list in shared memory needs initialising
        memset(shm_idx_list, 0, SHM_NAME_LIST_SIZE);
        *(uint16_t*)(shm_idx_list + 2) = 4;  // records start after the two counters
    }

    for (i = 0; i < sizeof suffixes / sizeof suffixes[0]; ++i) {
        if (shm_keep_one_file(idx_prefix, suffixes[i], shm_idx_list) < 0) {
            rc = -1;
            break;
        }
    }
    munmap(shm_idx_list, SHM_NAME_LIST_SIZE);
    return rc;
}
// Remove every index kept in shared memory: unlink each object recorded in the
// shared name list, then unlink the name list itself.
// Returns 0 on success and -1 when the name list cannot be opened or mapped.
int shm_clear_hyb() {
    char share_name[MAX_PATH];
    uint16_t idx_cnt, i;
    char* shm_idx_list;
    char* ptr;
    int shmid, n;

    if ((shmid = shm_open(SHM_NAME_LIST, O_RDONLY, 0)) < 0)
        return -1;
    shm_idx_list = (char*)mmap(0, SHM_NAME_LIST_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
    close(shmid);  // the mapping stays valid after the descriptor is closed
    if (shm_idx_list == MAP_FAILED)
        return -1;

    memcpy(&idx_cnt, shm_idx_list, sizeof idx_cnt);  // record count lives in bytes 0-1
    for (i = 0, ptr = shm_idx_list + 4; i < idx_cnt; ++i) {
        ptr += 8;  // skip the 8-byte size field; ptr now points at the file name
        fprintf(stderr, "clear: %s\n", ptr);
        n = snprintf(share_name, sizeof share_name, "%s%s", SHM_HYB_PREFIX, ptr);
        if (n >= 0 && (size_t)n < sizeof share_name)
            shm_unlink(share_name);
        ptr += strlen(ptr) + 1;
    }
    munmap(shm_idx_list, SHM_NAME_LIST_SIZE);
    shm_unlink(SHM_NAME_LIST);
    return 0;
}
// Look up an index file in shared memory.
//
// full_path: path of the index file; only its file-name part is used as key.
//
// Returns a read-only mapping of the index data, or NULL when the name list is
// missing, the file is not registered, or any open/map step fails. The caller
// owns the returned mapping.
void* shm_get_index(const char* full_path) {
    char share_name[MAX_PATH];
    const char* file_name = get_fn_from_path(full_path);
    uint64_t idx_bytes = 0;
    uint16_t idx_cnt, i;
    char* shm_idx_list;
    char* ptr;
    void* index;
    int shmid, n;

    if ((shmid = shm_open(SHM_NAME_LIST, O_RDONLY, 0)) < 0)
        return NULL;
    shm_idx_list = (char*)mmap(0, SHM_NAME_LIST_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
    close(shmid);  // the mapping stays valid after the descriptor is closed
    if (shm_idx_list == MAP_FAILED)
        return NULL;

    // Walk the records: 8-byte size followed by a NUL-terminated file name.
    memcpy(&idx_cnt, shm_idx_list, sizeof idx_cnt);
    for (i = 0, ptr = shm_idx_list + 4; i < idx_cnt; ++i) {
        memcpy(&idx_bytes, ptr, 8);
        ptr += 8;
        if (strcmp(ptr, file_name) == 0)
            break;
        ptr += strlen(ptr) + 1;
    }
    // Release the name list on every path, including "not found".
    munmap(shm_idx_list, SHM_NAME_LIST_SIZE);
    if (i == idx_cnt)
        return NULL;

    n = snprintf(share_name, sizeof share_name, "%s%s", SHM_HYB_PREFIX, file_name);
    if (n < 0 || (size_t)n >= sizeof share_name)
        return NULL;
    if ((shmid = shm_open(share_name, O_RDONLY, 0)) < 0)
        return NULL;
    index = mmap(0, idx_bytes, PROT_READ, MAP_SHARED, shmid, 0);
    close(shmid);
    // mmap() reports failure with MAP_FAILED, not NULL; normalise it so that
    // callers have a single failure value to test.
    return index == MAP_FAILED ? NULL : index;
}
// Print the hybrid-index files currently kept in shared memory (name and size)
// to stderr. Returns 0 on success and -1 when the shared name list does not
// exist or cannot be mapped.
int list_shm_hyb_indices() {
    uint16_t idx_cnt, i;
    char* shm_idx_list;
    char* ptr;
    int shmid;

    if ((shmid = shm_open(SHM_NAME_LIST, O_RDONLY, 0)) < 0) {
        fprintf(stderr, "No shared hybrid index found.\n");
        return -1;
    }
    shm_idx_list = (char*)mmap(0, SHM_NAME_LIST_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
    close(shmid);  // the mapping stays valid after the descriptor is closed
    if (shm_idx_list == MAP_FAILED)
        return -1;

    memcpy(&idx_cnt, shm_idx_list, sizeof idx_cnt);  // record count lives in bytes 0-1
    fprintf(stderr, "Shared hybrid indices (%d):\n", idx_cnt);
    for (i = 0, ptr = shm_idx_list + 4; i < idx_cnt; ++i) {
        uint64_t idx_bytes;
        memcpy(&idx_bytes, ptr, 8);
        ptr += 8;  // ptr now points at the NUL-terminated file name
        fprintf(stderr, "%s, %0.2f GB\n", ptr, get_GB(idx_bytes));
        ptr += strlen(ptr) + 1;
    }
    munmap(shm_idx_list, SHM_NAME_LIST_SIZE);
    return 0;
}

20
share_mem.h 100644
View File

@ -0,0 +1,20 @@
#pragma once
#include "utils.h"
// File-name suffixes of the four files that make up a hybrid index; each is
// appended to the index prefix. The "#if 0" branch is a disabled alternative
// naming scheme.
#if 0
#define HYB_PAC_SUFFIX ".hyb.pac"
#define HYB_SA_SUFFIX ".hyb.bytesa"
#define HYB_KMER_SUFFIX ".hyb.kmer"
#define HYB_DATA_SUFFIX ".hyb.data"
#else
#define HYB_PAC_SUFFIX ".hybrid.pac"
#define HYB_SA_SUFFIX ".hybrid.sa"
#define HYB_KMER_SUFFIX ".hybrid.kmer"
#define HYB_DATA_SUFFIX ".hybrid.data"
#endif
// Load the four index files idx_prefix + suffix into POSIX shared memory and
// register them in a shared name list. Returns 0 on success, -1 on failure.
int shm_keep_hyb(const char* idx_prefix);
// Unlink every registered index object and the name list itself.
// Returns 0 on success, -1 when the name list cannot be opened.
int shm_clear_hyb();
// Map the shared copy of the index file named by the file-name part of
// full_path, read-only. Returns NULL when it is not available.
void* shm_get_index(const char* full_path);
// Print the registered index files and their sizes to stderr.
// Returns 0 on success, -1 when no name list exists.
int list_shm_hyb_indices();

115
utils.c
View File

@ -39,9 +39,28 @@
#endif
#include <sys/resource.h>
#include <sys/time.h>
#include "utils.h"
#include "khash.h"
#include "ksort.h"
#include "kvec.h"
#include "utils.h"
#include "yarn.h"
#define USE_ASYNC_READ
// Per-file state of the asynchronous reader, stored in the fHash table keyed
// by the gzFile handle.
typedef struct {
pthread_t tid; // reader thread created by start_async_read()
void* buf[2]; // two read buffers, each kBufSize bytes, used alternately
volatile int readSize[2]; // gzread() result for the matching buffer
uint64_t getIdx; // number of buffers handed out so far; getIdx % 2 selects the buffer to read from
uint64_t putIdx; // number of buffers filled so far; putIdx % 2 selects the buffer to fill
volatile int finish; // set to 1 by the reader thread once gzread() returns <= 0
lock_t* mtx; // yarn lock; the reader adds 1 per read and waits while the value is 2
} FileKV;
KHASH_MAP_INIT_INT64(fkv, FileKV);
static khash_t(fkv) * fHash = 0;
#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
KSORT_INIT(128, pair64_t, pair64_lt)
KSORT_INIT(64, uint64_t, ks_lt_generic)
@ -141,9 +160,38 @@ size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
int err_gzread(gzFile file, void *ptr, unsigned int len)
{
int ret = gzread(file, ptr, len);
int ret = 0;
PROF_START(read);
#ifdef USE_ASYNC_READ
khiter_t k = kh_get(fkv, fHash, (int64_t)file);
FileKV* val = &kh_value(fHash, k);
POSSESS(val->mtx);
WAIT_FOR(val->mtx, NOT_TO_BE, 0); // 等待有数据
RELEASE(val->mtx);
if (ret < 0)
int curIdx = val->getIdx % 2;
if (val->finish) {
if (val->getIdx < val->putIdx) {
ret = val->readSize[curIdx];
if (ret > 0)
memcpy(ptr, val->buf[curIdx], ret);
++val->getIdx;
return ret;
}
return 0;
}
ret = val->readSize[curIdx];
memcpy(ptr, val->buf[curIdx], ret);
POSSESS(val->mtx);
++val->getIdx;
TWIST(val->mtx, BY, -1);
#else
ret = gzread(file, ptr, len);
#endif
PROF_END(gprof[G_read_seq], read);
if (ret < 0)
{
int errnum = 0;
const char *msg = gzerror(file, &errnum);
@ -304,3 +352,64 @@ long peakrss(void)
return r.ru_maxrss;
#endif
}
// Size in bytes of each of the two read buffers (16777216 = 16 MiB); also the
// length requested from every gzread() call below.
static int64_t kBufSize = 16777216;
// Reader-thread entry point; `data` is the gzFile to read from.
// Repeatedly fills the buffer selected by putIdx % 2 with gzread(), records
// the byte count in readSize[], and adds 1 to the lock value. When gzread()
// returns <= 0 it sets `finish`, adds 1 to the lock value once more and exits.
// Always returns NULL.
static void* async_gzread(void* data) {
gzFile file = (gzFile)data;
khiter_t k = kh_get(fkv, fHash, (int64_t)file);
// NOTE(review): the kh_get() result is not compared with kh_end(fHash), and
// val points into the hash table's value storage — confirm that the file is
// always registered first and that no other kh_put() on fHash can run (and
// possibly resize the table) while this thread is alive.
FileKV* val = &kh_value(fHash, k);
int ret = 0;
while (1) {
POSSESS(val->mtx);
WAIT_FOR(val->mtx, NOT_TO_BE, 2); // wait until the lock value is not 2 (presumably: until one of the two buffers is free again)
RELEASE(val->mtx);
int curIdx = val->putIdx % 2;
ret = gzread(file, val->buf[curIdx], kBufSize);
val->readSize[curIdx] = ret;
if (ret <= 0) {
// End of file or read error: flag completion and wake any waiter.
POSSESS(val->mtx);
val->finish = 1;
TWIST(val->mtx, BY, 1);
break;
}
// One more buffer is ready.
POSSESS(val->mtx);
val->putIdx += 1;
TWIST(val->mtx, BY, 1);
}
return NULL;
}
// Register `file` for asynchronous reading and start its reader thread.
//
// Allocates the two kBufSize-byte buffers, inserts a FileKV entry keyed by the
// gzFile handle into fHash (creating the table on first use), and launches
// async_gzread() on the file.
//
// Returns 0 on success. Returns -1 when the hash table, the table slot or the
// buffers cannot be allocated, and otherwise the non-zero error code from
// pthread_create(). Without USE_ASYNC_READ it does nothing and returns 0.
int start_async_read(gzFile file) {
    int ret = 0;
#ifdef USE_ASYNC_READ
    void* buf0;
    void* buf1;
    khiter_t k;
    FileKV* fv;

    if (fHash == 0) {
        fHash = kh_init(fkv);
        if (fHash == 0)
            return -1;
    }
    // Allocate before touching the table so a failure leaves it unchanged.
    buf0 = malloc(kBufSize);
    buf1 = malloc(kBufSize);
    if (buf0 == NULL || buf1 == NULL) {
        free(buf0);
        free(buf1);
        return -1;
    }
    k = kh_put(fkv, fHash, (int64_t)file, &ret);
    if (ret < 0) {  // khash reports an allocation failure with a negative code
        free(buf0);
        free(buf1);
        return -1;
    }
    // NOTE(review): ret == 0 means this handle was already registered; its
    // previous buffers and lock are overwritten here, as before — confirm
    // whether such a re-registration can happen.
    kh_key(fHash, k) = (int64_t)file;
    fv = &kh_value(fHash, k);
    fv->mtx = NEW_LOCK(0);
    fv->getIdx = fv->putIdx = fv->finish = 0;
    fv->readSize[0] = fv->readSize[1] = 0;
    fv->buf[0] = buf0;
    fv->buf[1] = buf1;
    ret = pthread_create(&fv->tid, 0, async_gzread, file);
#endif
    return ret;
}
// Wait for the reader thread of `file` to finish.
//
// Returns 0 after joining the thread, or -1 when `file` was never registered
// with start_async_read() (previously that case read past the hash table).
// Without USE_ASYNC_READ it does nothing and returns 0.
//
// NOTE(review): the buffers, the lock and the table entry are intentionally
// left in place, because err_gzread() may still drain buffered data after the
// thread has ended — confirm where they should be released.
int stop_async_read(gzFile file) {
#ifdef USE_ASYNC_READ
    khiter_t k;
    FileKV* val;

    if (fHash == 0)
        return -1;
    k = kh_get(fkv, fHash, (int64_t)file);
    if (k == kh_end(fHash))
        return -1;
    val = &kh_value(fHash, k);
    pthread_join(val->tid, 0);
#endif
    return 0;
}

14
utils.h
View File

@ -28,11 +28,15 @@
#define LH3_UTILS_H
#include <getopt.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>
#include <zlib.h>
#include "debug.h"
#include "profiling.h"
#ifdef __GNUC__
// Tell GCC to validate printf format string and args
#define ATTRIBUTE(list) __attribute__ (list)
@ -121,6 +125,11 @@ typedef struct {
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
typedef struct { size_t n, m; pair64_t *a; } pair64_v;
// Reusable, growable scratch buffer passed into routines that need temporary
// memory, so that they can reuse one allocation across calls.
typedef struct {
size_t m; // capacity of addr in bytes
uint8_t* addr; // start of the buffer; the user reallocs it when m is too small
} buf_t;
#ifdef __cplusplus
extern "C" {
#endif
@ -158,8 +167,11 @@ extern "C" {
void ks_introsort_64 (size_t n, uint64_t *a);
void ks_introsort_128(size_t n, pair64_t *a);
int start_async_read(gzFile file);
int stop_async_read(gzFile file);
#ifdef __cplusplus
}
}
#endif
static inline uint64_t hash_64(uint64_t key)

398
yarn.c 100644
View File

@ -0,0 +1,398 @@
/* yarn.c -- generic thread operations implemented using pthread functions
* Copyright (C) 2008, 2011, 2012, 2015, 2018, 2019, 2020 Mark Adler
* Version 1.7 12 Apr 2020 Mark Adler
* For conditions of distribution and use, see copyright notice in yarn.h
*/
/* Basic thread operations implemented using the POSIX pthread library. All
pthread references are isolated within this module to allow alternate
implementations with other thread libraries. See yarn.h for the description
of these operations. */
/* Version history:
1.0 19 Oct 2008 First version
1.1 26 Oct 2008 No need to set the stack size -- remove
Add yarn_abort() function for clean-up on error exit
1.2 19 Dec 2011 (changes reversed in 1.3)
1.3 13 Jan 2012 Add large file #define for consistency with pigz.c
Update thread portability #defines per IEEE 1003.1-2008
Fix documentation in yarn.h for yarn_prefix
1.4 19 Jan 2015 Allow yarn_abort() to avoid error message to stderr
Accept and do nothing for NULL argument to free_lock()
1.5 8 May 2018 Remove destruct() to avoid use of pthread_cancel()
Normalize the code style
1.6 3 Apr 2019 Add debugging information to fail() error messages
1.7 12 Apr 2020 Fix use after free bug in ignition()
*/
// For thread portability.
#define _XOPEN_SOURCE 700
#define _POSIX_C_SOURCE 200809L
#define _THREAD_SAFE
// Use large file functions if available.
#define _FILE_OFFSET_BITS 64
// External libraries and entities referenced.
#include <stdio.h> // fprintf(), stderr
#include <stdlib.h> // exit(), malloc(), free(), NULL
#include <pthread.h> // pthread_t, pthread_create(), pthread_join(),
// pthread_attr_t, pthread_attr_init(), pthread_attr_destroy(),
// PTHREAD_CREATE_JOINABLE, pthread_attr_setdetachstate(),
// pthread_self(), pthread_equal(),
// pthread_mutex_t, PTHREAD_MUTEX_INITIALIZER, pthread_mutex_init(),
// pthread_mutex_lock(), pthread_mutex_unlock(), pthread_mutex_destroy(),
// pthread_cond_t, PTHREAD_COND_INITIALIZER, pthread_cond_init(),
// pthread_cond_broadcast(), pthread_cond_wait(), pthread_cond_destroy()
#include <errno.h> // EPERM, ESRCH, EDEADLK, ENOMEM, EBUSY, EINVAL, EAGAIN
// Interface definition.
#include "yarn.h"
// Constants.
#define local static // for non-exported functions and globals
// Error handling external globals, resettable by application.
char *yarn_prefix = (char*)"yarn";
void (*yarn_abort)(int) = NULL;
// Report an error that should never happen and terminate the process.
//   err   error code; selects the description printed and is the exit status
//   file  source file of the yarn call that detected the error
//   line  source line of that call
//   func  short name of the operation that failed
// The message written to stderr is "<yarn_prefix>: <description>
// (<file>:<line>:<func>)". If yarn_abort is set it is called with err before
// exit(err).
local void fail(int err, char const *file, long line, char const *func) {
    // pick the fixed description for the known codes, NULL for anything else
    char const *reason;
    switch (err) {
        case EPERM:
            reason = "already unlocked";
            break;
        case ESRCH:
            reason = "no such thread";
            break;
        case EDEADLK:
            reason = "resource deadlock";
            break;
        case ENOMEM:
            reason = "out of memory";
            break;
        case EBUSY:
            reason = "can't destroy locked resource";
            break;
        case EINVAL:
            reason = "invalid request";
            break;
        case EAGAIN:
            reason = "resource unavailable";
            break;
        default:
            reason = NULL;
            break;
    }

    fprintf(stderr, "%s: ", yarn_prefix);
    if (reason == NULL)
        fprintf(stderr, "internal error %d", err);
    else
        fputs(reason, stderr);
    fprintf(stderr, " (%s:%ld:%s)\n", file, line, func);

    // give the application its chance to clean up, then leave regardless
    if (yarn_abort != NULL)
        yarn_abort(err);
    exit(err);
}
// Memory handling routines provided by user. If none are provided, malloc()
// and free() are used, which are therefore assumed to be thread-safe.
typedef void *(*malloc_t)(size_t);
typedef void (*free_t)(void *);
static malloc_t my_malloc_f = malloc;   // current allocation routine
static free_t my_free = free;           // current deallocation routine

// Use user-supplied allocation routines instead of malloc() and free().
//   lease   replacement for malloc(), or NULL to select malloc()
//   vacate  replacement for free(), or NULL to select free()
// A NULL argument restores the corresponding default rather than being stored,
// so that a later allocation or free never calls through a null function
// pointer. The two routines must be a matching pair; that remains the caller's
// responsibility.
void yarn_mem(malloc_t lease, free_t vacate) {
    if (lease == NULL)
        lease = malloc;
    if (vacate == NULL)
        vacate = free;
    my_malloc_f = lease;
    my_free = vacate;
}
// Allocate size bytes with the currently installed allocation routine. The
// caller never sees NULL: if the routine returns NULL, fail() is called with
// ENOMEM and the caller's file and line, and fail() exits the process.
local void *my_malloc(size_t size, char const *file, long line) {
    void *mem = my_malloc_f(size);
    if (mem == NULL)
        fail(ENOMEM, file, line, "malloc");
    return mem;
}
// -- Lock functions --
// A lock: a mutex that guards an integer value, together with the condition
// variable that twist_() broadcasts on and wait_for_() waits on. The member
// order must not change, since threads_lock is initialized positionally.
struct lock_s {
pthread_mutex_t mutex; // taken by possess_(), dropped by release_() and twist_()
pthread_cond_t cond; // signals a change of value
long value; // the value set by twist_() and tested by wait_for_()
};
// Allocate a lock, initialize its mutex and condition variable with default
// attributes, and set its value to initial. file and line identify the caller
// for error reports. Any failure goes to fail(), so the return is never NULL.
lock_t *new_lock_(long initial, char const *file, long line) {
    lock_t *fresh = (lock_t *)my_malloc(sizeof(*fresh), file, line);

    int err = pthread_mutex_init(&fresh->mutex, NULL);
    if (err != 0)
        fail(err, file, line, "mutex_init");

    err = pthread_cond_init(&fresh->cond, NULL);
    if (err != 0)
        fail(err, file, line, "cond_init");

    fresh->value = initial;
    return fresh;
}
// Lock bolt's mutex, blocking until it is available. A non-zero result from
// pthread_mutex_lock() is reported through fail() with the caller's file and
// line.
void possess_(lock_t *bolt, char const *file, long line) {
    int const err = pthread_mutex_lock(&bolt->mutex);
    if (err != 0)
        fail(err, file, line, "mutex_lock");
}
// Unlock bolt's mutex. A non-zero result from pthread_mutex_unlock() is
// reported through fail() with the caller's file and line.
void release_(lock_t *bolt, char const *file, long line) {
    int const err = pthread_mutex_unlock(&bolt->mutex);
    if (err != 0)
        fail(err, file, line, "mutex_unlock");
}
// Change bolt's value (TO: set it to val, BY: add val to it), wake every
// thread waiting on bolt's condition variable, and then unlock bolt's mutex.
// Any other op leaves the value untouched but still broadcasts and unlocks.
// Errors from the pthread calls are reported through fail().
void twist_(lock_t *bolt, enum twist_op op, long val,
            char const *file, long line) {
    switch (op) {
        case TO:
            bolt->value = val;
            break;
        case BY:
            bolt->value += val;
            break;
        default:
            break;
    }

    int err = pthread_cond_broadcast(&bolt->cond);
    if (err != 0)
        fail(err, file, line, "cond_broadcast");

    err = pthread_mutex_unlock(&bolt->mutex);
    if (err != 0)
        fail(err, file, line, "mutex_unlock");
}
#define until(a) while(!(a))
void wait_for_(lock_t *bolt, enum wait_op op, long val,
char const *file, long line) {
switch (op) {
case TO_BE:
until (bolt->value == val) {
int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex));
if (ret)
fail(ret, file, line, "cond_wait");
}
break;
case NOT_TO_BE:
until (bolt->value != val) {
int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex));
if (ret)
fail(ret, file, line, "cond_wait");
}
break;
case TO_BE_MORE_THAN:
until (bolt->value > val) {
int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex));
if (ret)
fail(ret, file, line, "cond_wait");
}
break;
case TO_BE_LESS_THAN:
until (bolt->value < val) {
int ret = pthread_cond_wait(&(bolt->cond), &(bolt->mutex));
if (ret)
fail(ret, file, line, "cond_wait");
}
}
}
// Return bolt's current value. No locking or unlocking is done here.
long peek_lock(lock_t *bolt) {
    long const current = bolt->value;
    return current;
}
// Destroy bolt's condition variable and mutex and free bolt itself. A NULL
// bolt is accepted and ignored. Errors from the destroy calls are reported
// through fail() with the caller's file and line.
void free_lock_(lock_t *bolt, char const *file, long line) {
    if (bolt != NULL) {
        int err = pthread_cond_destroy(&bolt->cond);
        if (err != 0)
            fail(err, file, line, "cond_destroy");

        err = pthread_mutex_destroy(&bolt->mutex);
        if (err != 0)
            fail(err, file, line, "mutex_destroy");

        my_free(bolt);
    }
}
// -- Thread functions (uses the lock_t functions above) --
// Record kept for each thread created by launch_(). Records are linked into
// the threads list until join_() or join_all_() removes and frees them.
struct thread_s {
pthread_t id; // filled in by pthread_create() in launch_()
int done; // true if this thread has exited (set by reenter())
thread *next; // for list of all launched threads
};
// List of threads launched but not joined, and the count of threads exited but
// not joined. The count is the value of threads_lock; it is incremented by
// reenter(), which ignition() runs just before the thread exits, and
// decremented by join_() and join_all_(). The threads list is accessed with
// threads_lock held.
local lock_t threads_lock = {
PTHREAD_MUTEX_INITIALIZER,
PTHREAD_COND_INITIALIZER,
0 // number of threads exited but not joined
};
local thread *threads = NULL; // list of extant threads
// Structure in which to pass the probe and its payload to ignition().
// Allocated by launch_() and freed by reenter().
struct capsule {
void (*probe)(void *); // function that the new thread runs
void *payload; // argument passed to probe
char const *file; // source file of the launch_() caller, for error reports
long line; // source line of the launch_() caller, for error reports
};
// Cleanup handler installed by ignition(). arg is the thread's capsule. Marks
// the calling thread as done, moves its record to the head of the threads
// list, adds one to the count of exited-but-unjoined threads (which wakes
// anything waiting on threads_lock, such as join_all_()), and frees the
// capsule. Fails with ESRCH if the calling thread is not in the list.
local void reenter(void *arg) {
    struct capsule *cap = (struct capsule *)arg;

    // look up the calling thread's record by comparing thread ids
    pthread_t self = pthread_self();
    possess_(&threads_lock, cap->file, cap->line);
    thread **link = &threads;
    thread *entry = *link;
    while (entry != NULL && !pthread_equal(entry->id, self)) {
        link = &entry->next;
        entry = *link;
    }
    if (entry == NULL)
        fail(ESRCH, cap->file, cap->line, "reenter lost");

    // flag the record and, unless it is already first, relink it at the head
    // so that join_all_() finds it quickly
    entry->done = 1;
    if (entry != threads) {
        *link = entry->next;
        entry->next = threads;
        threads = entry;
    }

    // bump the ready-to-join count; twist_() also broadcasts and releases
    // threads_lock
    twist_(&threads_lock, BY, +1, cap->file, cap->line);

    // done with the capsule -- freed here so that it is released even if the
    // thread was cancelled (yarn itself never calls pthread_cancel())
    my_free(cap);
}
// Start routine of every thread created by launch_(). arg is the capsule that
// launch_() allocated. Runs the capsule's probe on its payload with reenter()
// registered as a cleanup handler, so that reenter() marks the thread done and
// frees the capsule whether probe returns normally or the thread is cancelled.
// Always returns NULL.
local void *ignition(void *arg) {
    struct capsule *cap = (struct capsule *)arg;

    // register reenter() for the way out
    pthread_cleanup_push(reenter, arg);

    // do the work that was asked for
    cap->probe(cap->payload);

    // pop and run reenter(): announces the exit and frees the capsule, so cap
    // must not be touched after this point
    pthread_cleanup_pop(1);

    return NULL;
}
// Create a thread that runs probe(payload) and return its record, to be
// passed to join_() or collected by join_all_(). file and line identify the
// caller for error reports. The thread is created explicitly joinable, since
// not all POSIX implementations make that the default. Any failure is reported
// through fail().
thread *launch_(void (*probe)(void *), void *payload,
                char const *file, long line) {
    // package the call for ignition() on the heap, so that it is still there
    // when the new thread starts running -- reenter() frees it when the
    // thread finishes
    struct capsule *cap = (struct capsule *)my_malloc(sizeof(*cap), file, line);
    cap->probe = probe;
    cap->payload = payload;
    cap->file = file;
    cap->line = line;

    // hold threads_lock across creation and list insertion, so that neither
    // reenter() in the new thread nor join_all_() can look at the list before
    // the new record is in it
    possess_(&threads_lock, file, line);

    // start the thread in ignition(), joinable
    thread *th = (thread *)my_malloc(sizeof(*th), file, line);
    pthread_attr_t attr;
    int err = pthread_attr_init(&attr);
    if (err != 0)
        fail(err, file, line, "attr_init");
    err = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
    if (err != 0)
        fail(err, file, line, "attr_setdetachstate");
    err = pthread_create(&th->id, &attr, ignition, cap);
    if (err != 0)
        fail(err, file, line, "create");
    err = pthread_attr_destroy(&attr);
    if (err != 0)
        fail(err, file, line, "attr_destroy");

    // push the new record on the front of the threads list
    th->done = 0;
    th->next = threads;
    threads = th;
    release_(&threads_lock, file, line);
    return th;
}
// Wait for the thread ally to exit, then remove its record from the threads
// list and free it. The count of exited-but-unjoined threads is decremented if
// the record is marked done. Fails with ESRCH if ally is not in the list, or
// with the pthread_join() error code. file and line identify the caller for
// error reports.
void join_(thread *ally, char const *file, long line) {
    // block until the thread has finished
    int err = pthread_join(ally->id, NULL);
    if (err != 0)
        fail(err, file, line, "join");

    // locate the link that points at ally
    possess_(&threads_lock, file, line);
    thread **link = &threads;
    while (*link != NULL && *link != ally)
        link = &(*link)->next;
    if (*link == NULL)
        fail(ESRCH, file, line, "join lost");

    // account for the exit, unlink the record, and free it
    if (ally->done)
        threads_lock.value--;
    *link = ally->next;
    release_(&threads_lock, file, line);
    my_free(ally);
}
// Join every thread still in the threads list and return how many were joined.
// Only threads that have already announced their exit through reenter() are
// joined, one at a time as they become available. With many threads this is
// faster than blocking on some arbitrary thread while others have long since
// finished. file and line identify the caller for error reports.
int join_all_(char const *file, long line) {
    int joined = 0;

    possess_(&threads_lock, file, line);
    while (threads != NULL) {
        // sleep until the exited-but-unjoined count is non-zero
        wait_for_(&threads_lock, NOT_TO_BE, 0, file, line);

        // pick the first record marked done -- reenter() moves finished
        // threads to the head, so this should be at or near the top
        thread **link = &threads;
        thread *th;
        for (th = *link; th != NULL && !th->done; th = *link)
            link = &th->next;
        if (th == NULL)
            fail(ESRCH, file, line, "join_all lost");

        // the thread has announced its exit, so this join is almost immediate
        int err = pthread_join(th->id, NULL);
        if (err != 0)
            fail(err, file, line, "join");

        // one fewer waiting to be joined; unlink and free the record
        threads_lock.value--;
        *link = th->next;
        my_free(th);
        joined++;
    }
    release_(&threads_lock, file, line);

    return joined;
}

139
yarn.h 100644
View File

@ -0,0 +1,139 @@
/* yarn.h -- generic interface for thread operations
* Copyright (C) 2008, 2011, 2012, 2015, 2018, 2019, 2020 Mark Adler
* Version 1.7 12 Apr 2020 Mark Adler
*/
/*
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler
madler@alumni.caltech.edu
*/
/* Basic thread operations
This interface isolates the local operating system implementation of threads
from the application in order to facilitate platform independent use of
threads. All of the implementation details are deliberately hidden.
Assuming adequate system resources and proper use, none of these functions
can fail. As a result, any errors encountered will cause an exit() to be
executed, or the execution of your own optionally-provided abort function.
These functions allow the simple launching and joining of threads, and the
locking of objects and synchronization of changes of objects. The latter is
implemented with a single lock_t type that contains an integer value. The
value can be ignored for simple exclusive access to an object, or the value
can be used to signal and wait for changes to an object.
-- Arguments --
thread *thread; identifier for launched thread, used by JOIN()
void probe(void *); pointer to function "probe", run when thread starts
void *payload; single argument passed to the probe function
lock_t *lock; a lock with a value -- used for exclusive access to
    an object and to synchronize threads waiting for
    changes to an object
long val; value to set the lock to, increment the lock by, or wait for
int n; number of threads joined
-- Thread functions --
thread = LAUNCH(probe, payload) - launch a thread -- exit via probe() return
JOIN(thread) - join a thread and by joining end it, waiting for the thread
    to exit if it hasn't already -- will free the resources allocated by
    LAUNCH() (don't try to join the same thread more than once)
n = JOIN_ALL() - join all threads launched by LAUNCH() that are not joined
    yet and free the resources allocated by the launches, usually to clean
    up when the thread processing is done -- JOIN_ALL() returns an int with
    the count of the number of threads joined (JOIN_ALL() should only be
    called from the main thread, and should only be called after any calls
    of JOIN() have completed)
-- Lock functions --
lock = NEW_LOCK(val) - create a new lock with initial value val (the lock is
    created in the released state)
POSSESS(lock) - acquire exclusive possession of a lock, waiting if necessary
TWIST(lock, [TO | BY], val) - set lock to or increment lock by val, signal
    all threads waiting on this lock and then release the lock -- must
    possess the lock before calling (TWIST() releases, so don't do a
    RELEASE() after a TWIST() on the same lock)
WAIT_FOR(lock, [TO_BE | NOT_TO_BE | TO_BE_MORE_THAN | TO_BE_LESS_THAN], val)
    - wait on lock value to be, not to be, be greater than, or be less than
    val -- must possess the lock before calling, will possess the lock on
    return but the lock is released while waiting to permit other threads
    to use TWIST() to change the value and signal the change (so make sure
    that the object is in a usable state when waiting)
RELEASE(lock) - release a possessed lock (do not try to release a lock that
    the current thread does not possess)
val = peek_lock(lock) - return the value of the lock (assumes that lock is
    already possessed, no possess or release is done by peek_lock())
FREE_LOCK(lock) - free the resources allocated by NEW_LOCK() (application
    must assure that the lock is released before calling FREE_LOCK())
-- Memory allocation ---
yarn_mem(better_malloc, better_free) - set the memory allocation and free
routines for use by the yarn routines where the supplied routines have
the same interface and operation as malloc() and free(), and may be
provided in order to supply thread-safe memory allocation routines or
for any other reason -- by default malloc() and free() will be used
-- Error control --
yarn_prefix - a char pointer to a string that will be the prefix for any
error messages that these routines generate before exiting -- if not
changed by the application, "yarn" will be used
yarn_abort - an external function that will be executed when there is an
internal yarn error, due to out of memory or misuse -- this function
may exit to abort the application, or if it returns, the yarn error
handler will exit (set to NULL by default for no action)
*/
#ifndef YARN_H
#define YARN_H

// The guard makes a second inclusion harmless; without it the enum
// definitions below would be redefined.
#include <stddef.h> // size_t, used by yarn_mem()

// -- Error control --

// Prefix for any error message generated before exiting ("yarn" by default).
extern char *yarn_prefix;
// Optional function executed on an internal yarn error (NULL by default). If
// it returns, the yarn error handler exits.
extern void (*yarn_abort)(int);

// -- Memory allocation --

// Set the allocation and free routines used by the yarn routines. They must
// have the same interface and operation as malloc() and free().
void yarn_mem(void *(*lease)(size_t), void (*vacate)(void *));

// -- Thread functions --
// The macros pass the caller's __FILE__ and __LINE__ on to the underscored
// functions.

// Opaque identifier for a launched thread.
typedef struct thread_s thread;

// Launch a thread running probe(payload); the thread exits when probe returns.
thread *launch_(void (*probe)(void *), void *payload,
                char const *file, long line);
#define LAUNCH(a, b) launch_(a, b, __FILE__, __LINE__)

// Join a thread, waiting for it to exit if it hasn't already, and free the
// resources allocated by its launch. Don't join the same thread twice.
void join_(thread *ally, char const *file, long line);
#define JOIN(a) join_(a, __FILE__, __LINE__)

// Join all launched threads that are not joined yet and return how many were
// joined.
int join_all_(char const *file, long line);
#define JOIN_ALL() join_all_(__FILE__, __LINE__)

// -- Lock functions --

// Opaque lock carrying a long value.
typedef struct lock_s lock_t;

// Create a new lock, in the released state, with the given initial value.
lock_t *new_lock_(long initial, char const *file, long line);
#define NEW_LOCK(a) new_lock_(a, __FILE__, __LINE__)

// Acquire exclusive possession of a lock, waiting if necessary.
void possess_(lock_t *bolt, char const *file, long line);
#define POSSESS(a) possess_(a, __FILE__, __LINE__)

// Release a lock possessed by the current thread.
void release_(lock_t *bolt, char const *file, long line);
#define RELEASE(a) release_(a, __FILE__, __LINE__)

// Set the lock's value TO val or increment it BY val, signal all threads
// waiting on the lock, and release it. The lock must be possessed before the
// call; don't release it again afterwards.
enum twist_op { TO, BY };
void twist_(lock_t *bolt, enum twist_op op, long val,
            char const *file, long line);
#define TWIST(a, b, c) twist_(a, b, c, __FILE__, __LINE__)

// Wait for the lock's value to be, not to be, be more than, or be less than
// val. The lock must be possessed before the call and is possessed again on
// return, but is released while waiting.
enum wait_op {
    TO_BE, /* or */ NOT_TO_BE, /* that is the question */
    TO_BE_MORE_THAN, TO_BE_LESS_THAN };
void wait_for_(lock_t *bolt, enum wait_op op, long val,
               char const *file, long line);
#define WAIT_FOR(a, b, c) wait_for_(a, b, c, __FILE__, __LINE__)

// Return the value of a lock that is already possessed; no possess or release
// is done.
long peek_lock(lock_t *bolt);

// Free the resources allocated by NEW_LOCK(). The lock must be released before
// the call.
void free_lock_(lock_t *bolt, char const *file, long line);
#define FREE_LOCK(a) free_lock_(a, __FILE__, __LINE__)

#endif // YARN_H