修复了创建bwt索引时一同创建fmt相关索引的bug,现在可以正常地一起创建这些索引了。接下来还可以将sa和bytesa一起创建,以减少建索引的时间。

This commit is contained in:
zzh 2024-04-06 15:05:20 +08:00
parent dd7db7beb6
commit 3e20d7ee0f
5 changed files with 105 additions and 12 deletions

2
.vscode/launch.json vendored
View File

@ -17,7 +17,7 @@
"-M", "-M",
"-R", "-R",
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'", "'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
"~/reference/human_g1k_v37_decoy.fasta", "~/reference/bwa/human_g1k_v37_decoy.fasta",
"~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq", "~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq",
"~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq", "~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq",
"-o", "-o",

View File

@ -7,7 +7,7 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
SHOW_PERF= -DSHOW_PERF SHOW_PERF= -DSHOW_PERF
SHOW_DATA_PERF= #-DSHOW_DATA_PERF SHOW_DATA_PERF= #-DSHOW_DATA_PERF
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
USE_MT_READ= #-DUSE_MT_READ USE_MT_READ= -DUSE_MT_READ
AR= ar AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL
@ -22,7 +22,7 @@ PROG= fastbwa
INCLUDES= INCLUDES=
LIBS= -lm -lz -lpthread -ldl LIBS= -lm -lz -lpthread -ldl
SUBDIRS= . SUBDIRS= .
JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a JE_MALLOC=#/home/zzh/work/jemalloc/lib/libjemalloc.a
ifeq ($(shell uname -s),Linux) ifeq ($(shell uname -s),Linux)
LIBS += -lrt LIBS += -lrt

View File

@ -367,24 +367,28 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
bwt = bwt_restore_bwt(str); bwt = bwt_restore_bwt(str);
bwt_cal_sa(bwt, 32); bwt_cal_sa(bwt, 32);
bwt_dump_sa(str3, bwt); bwt_dump_sa(str3, bwt);
bwt_destroy(bwt); // bwt_destroy(bwt);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
{ {
t = clock();
// build FMT-Index // build FMT-Index
t = clock();
FMTIndex *fmt; FMTIndex *fmt;
strcpy(str, prefix); strcat(str, ".fmt"); strcpy(str, prefix); strcat(str, ".fmt");
fmt = create_fmt_from_bwt(bwt); fmt = create_fmt_from_bwt(bwt);
dump_fmt(str, fmt); dump_fmt(str, fmt);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
// create Kmer-Hash // create Kmer-Hash
t = clock();
fmt_create_kmer_index(fmt); fmt_create_kmer_index(fmt);
strcpy(str, prefix); strcat(str, ".kmer"); strcpy(str, prefix); strcat(str, ".kmer");
fmt_dump_kmer_idx(str, &fmt->kmer_hash); fmt_dump_kmer_idx(str, &fmt->kmer_hash);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
// create byte sa // create byte sa
t = clock();
bwt_cal_byte_sa(bwt, 4); bwt_cal_byte_sa(bwt, 4);
strcpy(str, prefix); strcat(str, ".bytesa"); strcpy(str, prefix); strcat(str, ".bytesa");
bwt_dump_byte_sa(str, bwt); bwt_dump_byte_sa(str, bwt);
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
} }
} }
free(str3); free(str2); free(str); free(str3); free(str2); free(str);

View File

@ -1258,7 +1258,7 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_
} }
// k, k1, k2都是bwt矩阵对应的行 // k, k1, k2都是bwt矩阵对应的行
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2) inline static void fmt_previous_line_old(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
{ {
uint8_t b1, b2; uint8_t b1, b2;
bwtint_t tk[4], kk; bwtint_t tk[4], kk;
@ -1269,6 +1269,68 @@ inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *
*k2 = fmt->L2[b2] + tk[3]; *k2 = fmt->L2[b2] + tk[3];
} }
// Given row k, compute two result rows in a single pass over one occurrence block:
//   *k1 = fmt->L2[b1] + cnt[1]
//   *k2 = fmt->L2[b2] + cnt[3]
// where b1/b2 are the two 2-bit bases packed in the 4-bit entry for k.
// NOTE(review): judging by the function name, k1 presumably is the row one LF-step
// before k and k2 the row two steps before k -- confirm against the callers.
// Parameters:
//   fmt - the FMT index (reads primary, sec_primary, first_base, last_base, L2 and the occ data)
//   k   - input row; it is adjusted locally for the primary row before use
//   k1  - output, written unconditionally
//   k2  - output, written unconditionally
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
{
uint8_t b1, b2;
uint32_t x = 0; // packed running counts; bits 8..15 and 24..31 are consumed at the end
bwtint_t cnt[4]; // only cnt[1] (count for b1) and cnt[3] (count for the b1/b2 pair) are used
k = k - (k >= fmt->primary); // convert k from a BWT-matrix row to a BWT-string position: the string omits '$', so rows at or past primary shift down by 1
uint32_t *p, *pocc, *ptocc, tmp;
uint8_t base2;
// str_line: adjusted position of k; cp_line: k rounded down to the start of its checkpoint interval
bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK);
// Step 1: locate the checkpoint
pocc = fmt_occ_intv(fmt, k); // start of the checkpoint
p = pocc + 20; // start of the BWT bases (20 uint32_t words past the checkpoint start)
// Step 2: locate the mid checkpoint
int mk = k & FMT_OCC_INTV_MASK; // offset of k inside its checkpoint interval
int n_mintv = mk >> FMT_MID_INTV_SHIFT; // number of whole mid intervals preceding k in this block
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // skip the BWT bases of the preceding mid intervals (each stride also includes 4 count words)
ptocc = p; // first base word of the mid interval that contains k
// Step 3: locate the specific uint32_t
p += (k & FMT_MID_INTV_MASK) >> 3; // each uint32_t holds 8 bases and 8 second-to-last BWT bases
// Step 4: extract the bases
base2 = *p >> ((~(k) & 0x7) << 2) & 0xf; // 4-bit entry for k; entry 0 sits in the highest nibble
b2 = base2 >> 2 & 3; // upper 2 bits of the entry
b1 = base2 & 3; // lower 2 bits of the entry
// Seed both counts from the checkpoint: words 0..3 are indexed by b1,
// the words from offset 4 on are indexed by (b1, b2).
cnt[1] = pocc[b1];
cnt[3] = (pocc + 4 + b1 * 4)[b2];
if (n_mintv > 0) {
// Add the counts stored in the 4 words just before this mid interval's bases.
ptocc -= 4;
x = *(ptocc + b1);
cnt[1] += __fmt_mid_sum(x); // NOTE(review): presumably sums the four bytes of x -- confirm in the helper
cnt[3] += x >> (b2 << 3) & 0xff; // byte number b2 of the word
x = 0;
ptocc += 4;
}
// Accumulate over the full words that precede the word containing k inside this mid interval.
uint32_t *end = ptocc + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3));
int ti = b1 << 2 | b2; // combined 4-bit index passed to the counting helper
for (; ptocc < end; ++ptocc)
{
x += __fmt_occ_e2_aux2(fmt, ti, *ptocc);
}
// Last word: keep the entries up to and including k, zero the nibbles after it.
tmp = *ptocc & ~((1U << ((~k & 7) << 2)) - 1);
x += __fmt_occ_e2_aux2(fmt, ti, tmp);
if (b1 == 0)
{
// NOTE(review): the (~k & 7) zeroed nibbles presumably get counted as base 0 by the helper,
// so they are subtracted again here -- confirm against __fmt_occ_e2_aux2.
x -= (~k & 7) << 8;
if (b2 == 0)
x -= (~k & 7) << 24;
}
// NOTE(review): this looks like a correction for one spurious (first_base, last_base) pair
// when sec_primary lies after the checkpoint start and at or before k -- confirm.
if (b1 == fmt->first_base && b2 == fmt->last_base && cp_line < fmt->sec_primary && str_line >= fmt->sec_primary)
{
cnt[3] -= 1;
}
cnt[1] += x >> 8 & 0xff; // bits 8..15 of x are added to the b1 count
cnt[3] += x >> 24 & 0xff; // bits 24..31 of x are added to the (b1, b2) count
*k1 = fmt->L2[b1] + cnt[1];
*k2 = fmt->L2[b2] + cnt[3];
}
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k) bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
{ {
bwtint_t sa = 0, mask = fmt->sa_intv - 1; bwtint_t sa = 0, mask = fmt->sa_intv - 1;
@ -1277,6 +1339,7 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
{ {
++sa; ++sa;
fmt_previous_line(fmt, k, &k1, &k2); fmt_previous_line(fmt, k, &k1, &k2);
//fmt_previous_line_old(fmt, k, &k1, &k2);
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2); __builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
if (!(k1 & mask)) if (!(k1 & mask))
{ {
@ -1284,6 +1347,11 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
break; break;
} }
++sa; ++sa;
if (!(k2 & mask))
{
k = k2;
break;
}
k = k2; k = k2;
} }
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv); sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
@ -1305,6 +1373,11 @@ bwtint_t fmt_sa_offset(const FMTIndex *fmt, bwtint_t k)
break; break;
} }
++sa; ++sa;
if (!(k2 & mask))
{
k = k2;
break;
}
k = k2; k = k2;
} }
sa = (sa << 48) | (k / fmt->sa_intv); sa = (sa << 48) | (k / fmt->sa_intv);

28
run.sh
View File

@ -1,28 +1,44 @@
thread=1 thread=64
## d1 ## d1
n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq #n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq
n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq #n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq
#n_r1=~/data/fastq/dataset/na12878_wes_144/45m_r1.fq
#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq
#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz
## d2 ## d2
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq #n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq #n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq
n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq
# d3 # d3
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq #n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq #n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
#n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz
## d4 ## d4
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq #n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq #n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
#n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz
## d5 ## d5
#n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz
#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq #n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
#n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq #n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq
#n_r1=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq
#n_r2=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq
reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta
#reference=~/reference/bwa/human_g1k_v37_decoy.fasta #reference=~/reference/bwa/human_g1k_v37_decoy.fasta
reference=~/data/reference/human_g1k_v37_decoy.fasta #reference=~/data/reference/human_g1k_v37_decoy.fasta
#out=./all.sam #out=./all.sam
#out=./sn.sam #out=./sn.sam
#out=./ssn-x1.sam #out=./ssn-x1.sam
#out=./out.sam out=~/data1/d2-45m.sam
out=/dev/null #out=~/data/out1.sam
#out=/dev/null
#out=./na12878.sam #out=./na12878.sam
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ #time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \ # /home/zzh/data/reference/human_g1k_v37_decoy.fasta \