处理read seq的潜在的bug

This commit is contained in:
zzh 2026-01-25 23:13:49 +08:00
parent fd21021e6a
commit e701805337
8 changed files with 36 additions and 28 deletions

View File

@ -2,7 +2,7 @@ CC= gcc
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw -O3
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SAM_EXACT= #-DSAM_EXACT
SAM_EXACT= -DSAM_EXACT
SHOW_PERF= -DSHOW_PERF
SHOW_DATA_PERF= -DSHOW_DATA_PERF
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH

5
bwa.c
View File

@ -164,7 +164,10 @@ void bseq_read_pe_mt(int chunk_size, int *n_, void *ks1_, void *ks2_, int copy_c
*n_ = n; *seqs_ptr = seqs; *size_ = size; *m_ = n;
return;
}
m = (chunk_size + size / init_n_reads - 1) / (size / init_n_reads);
float bases_per_read = size * 1.0 / init_n_reads;
m = (chunk_size + bases_per_read * 100) / (bases_per_read); // 多开辟100个reads的冗余空间
// m = (chunk_size + size / init_n_reads - 1) / (size / init_n_reads);
#else
m = 50000;
#endif

2
bwa.h
View File

@ -85,7 +85,7 @@ typedef struct {
int l_seq, id;
int m_name, m_comment, m_seq, m_qual;
char *name, *comment, *seq, *qual;
msw_task_ptr_v msw;
// msw_task_ptr_v msw;
msw_seq_task_v msw_task;
// int_v msw;
// kstring_t sam;

View File

@ -1248,7 +1248,7 @@ static void worker_sam(void *data, int i, int tid)
}
}
static void worker_sam_single_end(void *data, int i, int tid)
static void worker_sam_single_end(void *data, long i, int tid)
{
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2], seq_sam_t ss[2], int tid);
extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
@ -1641,7 +1641,7 @@ void mem_core_process(const mem_opt_t *opt, const FMTIndex *fmt, const bntseq_t
}
// smemchainalign
static void worker_smem_align(void *data, int i, int tid)
static void worker_smem_align(void *data, long i, int tid)
{
mem_worker_t *w = (mem_worker_t *)data;
int start = i * w->opt->batch_size;
@ -1651,7 +1651,6 @@ static void worker_smem_align(void *data, int i, int tid)
void mem_process_seqs(const mem_opt_t *opt, mem_worker_t *w, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0, seq_sam_t *sams)
{
extern void kt_for(int n_threads, void (*func)(void *, int, int), void *data, int n);
PROF_START(mem_prepare);
mem_pestat_t pes[4];
int batch_size = opt->batch_size; // pair-end2pair-end

View File

@ -376,7 +376,7 @@ int main_mem(int argc, char *argv[])
PROF_START(all);
PROF_START(prepare);
mem_opt_t *opt, opt0;
int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0;
int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 2;
int fixed_chunk_size = -1;
gzFile fp, fp2 = 0;
char *p, *rg_line = 0, *hdr_line = 0;

View File

@ -7,9 +7,11 @@
#include "ksw_align_avx.h"
#include "kvec.h"
#include "debug.h"
#include "mate_sw.h"
#include "profiling.h"
// 原版生成sam函数
static void worker_sam(void* data, int i, int tid) {
static void worker_sam(void* data, long i, int tid) {
mem_worker_t* w = (mem_worker_t*)data;
if (bwa_verbose >= 4)
printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i << 1 | 0].name);
@ -140,7 +142,7 @@ static void get_matesw_tasks(const mem_opt_t* opt, const bntseq_t* bns, const ui
}
}
// 先计算哪些read需要做matesw
static void worker_matesw_tasks(void* data, int idx, int tid) {
static void worker_matesw_tasks(void* data, long idx, int tid) {
mem_worker_t* w = (mem_worker_t*)data;
const mem_opt_t* opt = w->opt;
int startIdx = idx * opt->batch_size;
@ -160,7 +162,7 @@ static void worker_matesw_tasks(void* data, int idx, int tid) {
int64_t i_s = id;
bseq1_t* s = &w->seqs[i_s];
mem_alnreg_v* a = &w->regs[i_s];
s->msw.n = 0; // 清空之前的matesw数据
//s->msw.n = 0; // 清空之前的matesw数据
int i, j;
_clear_vec(b[0]);
@ -259,7 +261,7 @@ static void msw_avx512_u8(const mem_opt_t* opt, int num_tasks, msw_buf_t* msw_bu
}
// 再多线程计算matesw利用inter-query的simd并行
static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) {
static void worker_calc_matesw_avx512_u8(void* data, long idx, int tid) {
mem_worker_t* w = (mem_worker_t*)data;
const mem_opt_t* opt = w->opt;
int startIdx = idx * w->opt->msw_batch_size;
@ -364,7 +366,7 @@ static void worker_calc_matesw_avx512_u8(void* data, int idx, int tid) {
kv_destroy(msw_task_ids);
}
static void worker_calc_matesw_avx512_i16(void* data, int i, int tid) {}
static void worker_calc_matesw_avx512_i16(void* data, long idx, int tid) {}
// 检测添加新的align
static int check_add_align(const mem_opt_t* opt, kswr_avx_t aln, int is_rev, int64_t l_pac, mem_alnreg_t* a, int l_ms, uint8_t* ms,
@ -570,7 +572,7 @@ end_clear:
}
// 最后再计算并生成sam数据
static void workder_gen_sam(void* data, int idx, int tid) {
static void workder_gen_sam(void* data, long idx, int tid) {
mem_worker_t* w = (mem_worker_t*)data;
const mem_opt_t* opt = w->opt;
const bntseq_t* bns = w->bns;
@ -689,8 +691,10 @@ static void workder_gen_sam(void* data, int idx, int tid) {
// 划分matesw任务
static void gather_matesw_task(mem_worker_t* w, msw_task_v** thread_tasks) {
_clear_vec(w->msw->p_msw_tasks_u8);
_clear_vec(w->msw->p_msw_tasks_i16);
//_clear_vec(w->msw->p_msw_tasks_u8);
//_clear_vec(w->msw->p_msw_tasks_i16);
_destory_clear_vec(w->msw->p_msw_tasks_u8);
_destory_clear_vec(w->msw->p_msw_tasks_i16);
int i = 0, j = 0;
for (i = 0; i < w->opt->n_threads; ++i) {
for (j = 0; j < thread_tasks[i][0].n; ++j) {
@ -756,8 +760,10 @@ void gen_paired_sam(mem_worker_t* w) {
// 清空一下任务数组,不能放到线程里去做
int i = 0;
for (i = 0; i < w->opt->n_threads; ++i) {
_clear_vec(w->msw->t_msw_tasks[i][0]);
_clear_vec(w->msw->t_msw_tasks[i][1]);
//_clear_vec(w->msw->t_msw_tasks[i][0]);
//_clear_vec(w->msw->t_msw_tasks[i][1]);
_destory_clear_vec(w->msw->t_msw_tasks[i][0]);
_destory_clear_vec(w->msw->t_msw_tasks[i][1]);
}
int batch_n = (w->n_reads + w->opt->batch_size - 1) / w->opt->batch_size;
@ -797,4 +803,4 @@ void gen_paired_sam(mem_worker_t* w) {
PROF_END(gprof[G_gen_sam], gen_sam);
}
fprintf(stderr, "zzh : u8: %ld i16: %ld\n", w->msw->p_msw_tasks_u8.n, w->msw->p_msw_tasks_i16.n);
}
}

View File

@ -23,7 +23,7 @@ extern int mem_sam_pe(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t*
extern void mem_reg2ovlp(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, bseq1_t* s, mem_alnreg_v* a);
extern void kt_for(int n_threads, void (*func)(void*, int, int), void* data, int n);
extern void kt_for(int n_threads, void (*func)(void*, long, int), void* data, int n);
extern int mem_sort_dedup_patch(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, uint8_t* query, int n, mem_alnreg_t* a);
@ -41,4 +41,4 @@ extern char** mem_gen_alt(const mem_opt_t* opt, const bntseq_t* bns, const uint8
extern void mem_reorder_primary5(int T, mem_alnreg_v* a);
extern int mem_pair(const mem_opt_t* opt, const bntseq_t* bns, const uint8_t* pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id,
int* sub, int* n_sub, int z[2], int n_pri[2]);
int* sub, int* n_sub, int z[2], int n_pri[2]);

16
run.sh
View File

@ -1,11 +1,11 @@
thread=1
thread=64
make clean; make -j 32
#n1=~/data/dataset/D2/n1.fq.gz
#n2=~/data/dataset/D2/n2.fq.gz
n1=~/data/dataset/D2/d1.fq
n2=~/data/dataset/D2/d2.fq
n1=~/data/dataset/D2/n1.fq.gz
n2=~/data/dataset/D2/n2.fq.gz
#n1=~/data/dataset/D2/d1.fq
#n2=~/data/dataset/D2/d2.fq
#n1=~/data/SRR25735656_1.fastq.gz
#n2=~/data/SRR25735656_2.fastq.gz
@ -15,10 +15,10 @@ n2=~/data/dataset/D2/d2.fq
reference=~/data/reference/fmt/human_g1k_v37_decoy.fasta
#out=/dev/null
out=/dev/null
#out=./oldsam-D2-out.sam
#out=./D2-1.sam
out=./d.sam
#out=./d.sam
prog=./fastalign
#prog=/home/zzh/fastbwa
@ -26,4 +26,4 @@ time $prog mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:n
$reference \
$n1 \
$n2 \
-o $out -2
-o $out