改了几个bug,比如gen_sam需要全局id,dedup-sort需要一个一个做,现在还有一个问题,就是dedup-sort之后有些skip结果会变

This commit is contained in:
zzh 2026-01-16 22:16:16 +08:00
parent fb0cd0f663
commit 67cbb5b8e9
5 changed files with 41 additions and 18 deletions

1
.gitignore vendored
View File

@ -1,6 +1,7 @@
*.log
*.sam
output/
scripts/
# Prerequisites
*.d

6
.vscode/launch.json vendored
View File

@ -18,10 +18,10 @@
"-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/data/reference/fmt/human_g1k_v37_decoy.fasta",
"~/data/dataset/D2/n1.fq.gz",
"~/data/dataset/D2/n2.fq.gz",
//"~/data/dataset/D2/n1.fq.gz", "~/data/dataset/D2/n2.fq.gz",
"~/data/dataset/D2/d1.fq", "~/data/dataset/D2/d2.fq",
"-o",
"D1-out.sam",
"/dev/null", // "D1-out.sam",
],
"cwd": "${workspaceFolder}", //
},

View File

@ -1,5 +1,5 @@
CC= gcc
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw -O3
CFLAGS= -g -Wall -Wno-unused-function -mavx2 -mavx512bw #-O3
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= -DSHOW_PERF

View File

@ -579,7 +579,13 @@ static void workder_gen_sam(void* data, int idx, int tid) {
mem_alnreg_v* a = &w->regs[id];
seq_sam_t* ss = &w->sams[id];
// int cmp = strcmp("ERR194147.1548", s[0].name);
int cmp = strcmp("ERR194147.42331121", s[0].name) == 0;
// int cmp = (id + w->n_processed) == 202924;
if (cmp == 1) {
fprintf(stderr, "%s\n", s[0].name);
fflush(stderr);
//exit(0);
}
int n[2] = {0}, nn[2] = {0};
for (i = 0; i < 2; ++i) {
@ -600,37 +606,49 @@ static void workder_gen_sam(void* data, int idx, int tid) {
skip |= 1 << r;
}
}
if ((t->skip | skip) != 15) { // 检查新添加的aln是不是把当前的任务skip掉了
kswr_avx_t aln = t->aln; fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id +
w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb);
//if ((t->skip | skip) != 15) { // 检查新添加的aln是不是把当前的任务skip掉了
kswr_avx_t aln = t->aln; // fprintf(gf[0], "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id + w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb);
if (cmp) {
fprintf(stderr, "id-%ld score-%d te-%d qe-%d score2-%d te2-%d tb-%d qb-%d\n", s[si].id + w->n_processed, aln.score, aln.te, aln.qe, aln.score2, aln.te2, aln.tb, aln.qb);
fflush(stderr);
}
nn[si] += 1;
n[si] += check_add_align(t->aln, opt->min_seed_len, t->is_rev, bns->l_pac, b, s[si].l_seq, (uint8_t*)s[si].seq, ma, t->rb);
}
//}
}
}
// 处理完2个pair read之后再排序因为上面操作有插入排序后之前记录的索引就无效了所以上面不能排序要两个pair read处理完之后再排序
for (i = 0; i < 2; ++i) {
int tmp;
if (nn[i] > 0) {
for (j = 0; j < n[i]; ++j) { // 在这里排序
int origin_n = a[i].n;
int start_n = a[i].n - n[i];
for (j = 0; j < n[i]; ++j) { // 在这里排序
// move b s.t. ma is sorted
int nidx = a[i].n - n[i] + j;
int nidx = origin_n - n[i] + j;
mem_alnreg_t b = a[i].a[nidx]; // 用指针一定要慎重,注意指向的地址内容会变
for (k = 0; k < nidx; ++k) // find the insertion point
if (a[i].a[k].score < b.score)
break;
tmp = k;
for (k = nidx; k > tmp; --k) a[i].a[k] = a[i].a[k - 1];
for (k = start_n; k > tmp; --k) a[i].a[k] = a[i].a[k - 1];
a[i].a[k] = b;
// 每添加一个aln都要sort-dedup一次
#if 1
start_n = mem_sort_dedup_patch(opt, 0, 0, 0, start_n + 1, a[i].a);
a[i].n = start_n;
#else
start_n += 1;
#endif
}
a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a);
// a[i].n = mem_sort_dedup_patch(opt, 0, 0, 0, a[i].n, a[i].a);
}
}
////////////////////////////////////////////////////////////
generate_sam(opt, bns, pac, pes, id >> 1, s, a, ss, w->n_processed, tid);
// 这里需要传入全局id
generate_sam(opt, bns, pac, pes, (w->n_processed + id) >> 1, s, a, ss, w->n_processed, tid);
}
}

10
run.sh
View File

@ -2,16 +2,20 @@ thread=1
make clean; make -j 32
n1=~/data/dataset/D2/n1.fq.gz
n2=~/data/dataset/D2/n2.fq.gz
#n1=~/data/dataset/D2/n1.fq.gz
#n2=~/data/dataset/D2/n2.fq.gz
n1=~/data/dataset/D2/d1.fq
n2=~/data/dataset/D2/d2.fq
#n1=/file-data/zzh/dataset/D1/45mr1.fq.gz
#n2=/file-data/zzh/dataset/D1/45mr2.fq.gz
reference=~/data/reference/fmt/human_g1k_v37_decoy.fasta
out=/dev/null
#out=/dev/null
#out=./oldsam-D2-out.sam
#out=./D2-1.sam
out=./d.sam
prog=./fastalign
#prog=/home/zzh/fastbwa