修复了创建bwt索引时一同创建fmt相关索引所出现的bug,现在可以正常地一起创建这些索引了;接下来还可以将sa和bytesa一起创建,以减少建立索引的时间
This commit is contained in:
parent
dd7db7beb6
commit
3e20d7ee0f
|
|
@ -17,7 +17,7 @@
|
||||||
"-M",
|
"-M",
|
||||||
"-R",
|
"-R",
|
||||||
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
"'@RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa'",
|
||||||
"~/reference/human_g1k_v37_decoy.fasta",
|
"~/reference/bwa/human_g1k_v37_decoy.fasta",
|
||||||
"~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq",
|
"~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq",
|
||||||
"~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq",
|
"~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq",
|
||||||
"-o",
|
"-o",
|
||||||
|
|
|
||||||
4
Makefile
4
Makefile
|
|
@ -7,7 +7,7 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||||
SHOW_PERF= -DSHOW_PERF
|
SHOW_PERF= -DSHOW_PERF
|
||||||
SHOW_DATA_PERF= #-DSHOW_DATA_PERF
|
SHOW_DATA_PERF= #-DSHOW_DATA_PERF
|
||||||
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
|
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
|
||||||
USE_MT_READ= #-DUSE_MT_READ
|
USE_MT_READ= -DUSE_MT_READ
|
||||||
|
|
||||||
AR= ar
|
AR= ar
|
||||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL
|
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL
|
||||||
|
|
@ -22,7 +22,7 @@ PROG= fastbwa
|
||||||
INCLUDES=
|
INCLUDES=
|
||||||
LIBS= -lm -lz -lpthread -ldl
|
LIBS= -lm -lz -lpthread -ldl
|
||||||
SUBDIRS= .
|
SUBDIRS= .
|
||||||
JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a
|
JE_MALLOC=#/home/zzh/work/jemalloc/lib/libjemalloc.a
|
||||||
|
|
||||||
ifeq ($(shell uname -s),Linux)
|
ifeq ($(shell uname -s),Linux)
|
||||||
LIBS += -lrt
|
LIBS += -lrt
|
||||||
|
|
|
||||||
|
|
@ -367,24 +367,28 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
|
||||||
bwt = bwt_restore_bwt(str);
|
bwt = bwt_restore_bwt(str);
|
||||||
bwt_cal_sa(bwt, 32);
|
bwt_cal_sa(bwt, 32);
|
||||||
bwt_dump_sa(str3, bwt);
|
bwt_dump_sa(str3, bwt);
|
||||||
bwt_destroy(bwt);
|
// bwt_destroy(bwt);
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
{
|
{
|
||||||
t = clock();
|
|
||||||
// build FMT-Index
|
// build FMT-Index
|
||||||
|
t = clock();
|
||||||
FMTIndex *fmt;
|
FMTIndex *fmt;
|
||||||
strcpy(str, prefix); strcat(str, ".fmt");
|
strcpy(str, prefix); strcat(str, ".fmt");
|
||||||
fmt = create_fmt_from_bwt(bwt);
|
fmt = create_fmt_from_bwt(bwt);
|
||||||
dump_fmt(str, fmt);
|
dump_fmt(str, fmt);
|
||||||
|
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
// create Kmer-Hash
|
// create Kmer-Hash
|
||||||
|
t = clock();
|
||||||
fmt_create_kmer_index(fmt);
|
fmt_create_kmer_index(fmt);
|
||||||
strcpy(str, prefix); strcat(str, ".kmer");
|
strcpy(str, prefix); strcat(str, ".kmer");
|
||||||
fmt_dump_kmer_idx(str, &fmt->kmer_hash);
|
fmt_dump_kmer_idx(str, &fmt->kmer_hash);
|
||||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
// create byte sa
|
// create byte sa
|
||||||
|
t = clock();
|
||||||
bwt_cal_byte_sa(bwt, 4);
|
bwt_cal_byte_sa(bwt, 4);
|
||||||
strcpy(str, prefix); strcat(str, ".bytesa");
|
strcpy(str, prefix); strcat(str, ".bytesa");
|
||||||
bwt_dump_byte_sa(str, bwt);
|
bwt_dump_byte_sa(str, bwt);
|
||||||
|
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
free(str3); free(str2); free(str);
|
free(str3); free(str2); free(str);
|
||||||
|
|
|
||||||
75
fmt_idx.c
75
fmt_idx.c
|
|
@ -1258,7 +1258,7 @@ inline static void fmt_get_previous_base(const FMTIndex *fmt, bwtint_t k, uint8_
|
||||||
}
|
}
|
||||||
|
|
||||||
// k, k1, k2都是bwt矩阵对应的行
|
// k, k1, k2都是bwt矩阵对应的行
|
||||||
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
inline static void fmt_previous_line_old(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
||||||
{
|
{
|
||||||
uint8_t b1, b2;
|
uint8_t b1, b2;
|
||||||
bwtint_t tk[4], kk;
|
bwtint_t tk[4], kk;
|
||||||
|
|
@ -1269,6 +1269,68 @@ inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *
|
||||||
*k2 = fmt->L2[b2] + tk[3];
|
*k2 = fmt->L2[b2] + tk[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline static void fmt_previous_line(const FMTIndex *fmt, bwtint_t k, bwtint_t *k1, bwtint_t *k2)
|
||||||
|
{
|
||||||
|
uint8_t b1, b2;
|
||||||
|
uint32_t x = 0;
|
||||||
|
bwtint_t cnt[4];
|
||||||
|
k = k - (k >= fmt->primary); // k由bwt矩阵对应的行转换成bwt字符串对应的行(去掉了$,所以大于$的行,都减掉1)
|
||||||
|
uint32_t *p, *pocc, *ptocc, tmp;
|
||||||
|
uint8_t base2;
|
||||||
|
bwtint_t str_line = k, cp_line = k & (~FMT_OCC_INTV_MASK);
|
||||||
|
// 第一步,找到check point位置
|
||||||
|
pocc = fmt_occ_intv(fmt, k); // check point起始位置
|
||||||
|
p = pocc + 20; // bwt碱基起始位置
|
||||||
|
// 第二步,找到mid check point位置
|
||||||
|
int mk = k & FMT_OCC_INTV_MASK;
|
||||||
|
int n_mintv = mk >> FMT_MID_INTV_SHIFT;
|
||||||
|
p += n_mintv * (4 + (FMT_MID_INTERVAL >> 3)); // 跳过mid间隔的bwt碱基位置
|
||||||
|
ptocc = p;
|
||||||
|
// 第三步,找到具体的uint32_t
|
||||||
|
p += (k & FMT_MID_INTV_MASK) >> 3; // 每个uint32_t包含8个碱基(和8个倒数第二bwt碱基)
|
||||||
|
// 第四步,获取碱基
|
||||||
|
base2 = *p >> ((~(k) & 0x7) << 2) & 0xf;
|
||||||
|
b2 = base2 >> 2 & 3;
|
||||||
|
b1 = base2 & 3;
|
||||||
|
|
||||||
|
cnt[1] = pocc[b1];
|
||||||
|
cnt[3] = (pocc + 4 + b1 * 4)[b2];
|
||||||
|
if (n_mintv > 0) {
|
||||||
|
ptocc -= 4;
|
||||||
|
x = *(ptocc + b1);
|
||||||
|
cnt[1] += __fmt_mid_sum(x);
|
||||||
|
cnt[3] += x >> (b2 << 3) & 0xff;
|
||||||
|
x = 0;
|
||||||
|
ptocc += 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t *end = ptocc + ((k >> 3) - ((k & ~FMT_MID_INTV_MASK) >> 3));
|
||||||
|
int ti = b1 << 2 | b2;
|
||||||
|
for (; ptocc < end; ++ptocc)
|
||||||
|
{
|
||||||
|
x += __fmt_occ_e2_aux2(fmt, ti, *ptocc);
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp = *ptocc & ~((1U << ((~k & 7) << 2)) - 1);
|
||||||
|
x += __fmt_occ_e2_aux2(fmt, ti, tmp);
|
||||||
|
|
||||||
|
if (b1 == 0)
|
||||||
|
{
|
||||||
|
x -= (~k & 7) << 8;
|
||||||
|
if (b2 == 0)
|
||||||
|
x -= (~k & 7) << 24;
|
||||||
|
}
|
||||||
|
if (b1 == fmt->first_base && b2 == fmt->last_base && cp_line < fmt->sec_primary && str_line >= fmt->sec_primary)
|
||||||
|
{
|
||||||
|
cnt[3] -= 1;
|
||||||
|
}
|
||||||
|
cnt[1] += x >> 8 & 0xff;
|
||||||
|
cnt[3] += x >> 24 & 0xff;
|
||||||
|
|
||||||
|
*k1 = fmt->L2[b1] + cnt[1];
|
||||||
|
*k2 = fmt->L2[b2] + cnt[3];
|
||||||
|
}
|
||||||
|
|
||||||
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
|
bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
|
||||||
{
|
{
|
||||||
bwtint_t sa = 0, mask = fmt->sa_intv - 1;
|
bwtint_t sa = 0, mask = fmt->sa_intv - 1;
|
||||||
|
|
@ -1277,6 +1339,7 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
|
||||||
{
|
{
|
||||||
++sa;
|
++sa;
|
||||||
fmt_previous_line(fmt, k, &k1, &k2);
|
fmt_previous_line(fmt, k, &k1, &k2);
|
||||||
|
//fmt_previous_line_old(fmt, k, &k1, &k2);
|
||||||
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
|
__builtin_prefetch(fmt_occ_intv(fmt, k2), 0, 2);
|
||||||
if (!(k1 & mask))
|
if (!(k1 & mask))
|
||||||
{
|
{
|
||||||
|
|
@ -1284,6 +1347,11 @@ bwtint_t fmt_sa(const FMTIndex *fmt, bwtint_t k)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
++sa;
|
++sa;
|
||||||
|
if (!(k2 & mask))
|
||||||
|
{
|
||||||
|
k = k2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
k = k2;
|
k = k2;
|
||||||
}
|
}
|
||||||
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
|
sa += bwt_get_sa(fmt->sa, k / fmt->sa_intv);
|
||||||
|
|
@ -1305,6 +1373,11 @@ bwtint_t fmt_sa_offset(const FMTIndex *fmt, bwtint_t k)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
++sa;
|
++sa;
|
||||||
|
if (!(k2 & mask))
|
||||||
|
{
|
||||||
|
k = k2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
k = k2;
|
k = k2;
|
||||||
}
|
}
|
||||||
sa = (sa << 48) | (k / fmt->sa_intv);
|
sa = (sa << 48) | (k / fmt->sa_intv);
|
||||||
|
|
|
||||||
28
run.sh
28
run.sh
|
|
@ -1,28 +1,44 @@
|
||||||
thread=1
|
thread=64
|
||||||
|
|
||||||
## d1
|
## d1
|
||||||
n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq
|
#n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq
|
||||||
n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq
|
#n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq
|
||||||
|
#n_r1=~/data/fastq/dataset/na12878_wes_144/45m_r1.fq
|
||||||
|
#n_r2=~/data/fastq/dataset/na12878_wes_144/45m_r2.fq
|
||||||
|
#n_r1=~/data/fastq/dataset/na12878_wes_144/45mr1.fq.gz
|
||||||
|
#n_r2=~/data/fastq/dataset/na12878_wes_144/45mr2.fq.gz
|
||||||
## d2
|
## d2
|
||||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
|
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
|
||||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
|
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
|
||||||
|
n_r1=~/data/fastq/dataset/na12878_wgs_101/45m_r1.fq
|
||||||
|
n_r2=~/data/fastq/dataset/na12878_wgs_101/45m_r2.fq
|
||||||
# d3
|
# d3
|
||||||
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
|
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
|
||||||
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
|
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
|
||||||
|
#n_r1=~/data/fastq/dataset/na12878_wgs_150/45mr1.fq.gz
|
||||||
|
#n_r2=~/data/fastq/dataset/na12878_wgs_150/45mr2.fq.gz
|
||||||
## d4
|
## d4
|
||||||
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
|
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
|
||||||
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
|
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
|
||||||
|
#n_r1=~/data/fastq/dataset/zy_wes/45mr1.fq.gz
|
||||||
|
#n_r2=~/data/fastq/dataset/zy_wes/45mr2.fq.gz
|
||||||
## d5
|
## d5
|
||||||
|
#n_r1=~/data/fastq/dataset/zy_wgs/45mr1.fq.gz
|
||||||
|
#n_r2=~/data/fastq/dataset/zy_wgs/45mr2.fq.gz
|
||||||
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
|
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
|
||||||
#n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq
|
#n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq
|
||||||
|
#n_r1=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_1.fq
|
||||||
|
#n_r2=~/data1/fastq/dataset/zy_wgs/E150010395_L01_690_2.fq
|
||||||
|
|
||||||
|
reference=~/data1/fmt_ref/human_g1k_v37_decoy.fasta
|
||||||
#reference=~/reference/bwa/human_g1k_v37_decoy.fasta
|
#reference=~/reference/bwa/human_g1k_v37_decoy.fasta
|
||||||
reference=~/data/reference/human_g1k_v37_decoy.fasta
|
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||||
#out=./all.sam
|
#out=./all.sam
|
||||||
#out=./sn.sam
|
#out=./sn.sam
|
||||||
#out=./ssn-x1.sam
|
#out=./ssn-x1.sam
|
||||||
#out=./out.sam
|
out=~/data1/d2-45m.sam
|
||||||
out=/dev/null
|
#out=~/data/out1.sam
|
||||||
|
#out=/dev/null
|
||||||
#out=./na12878.sam
|
#out=./na12878.sam
|
||||||
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
#time ./bwa mem -t 12 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||||
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
# /home/zzh/data/reference/human_g1k_v37_decoy.fasta \
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue