diff --git a/Makefile b/Makefile index 3169076..b23534e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CC= gcc #CC= clang --analyze # CFLAGS= -g -Wall -Wno-unused-function -O2 -CFLAGS= -g -Wall -Wno-unused-function #-O2 +CFLAGS= -g -Wall -Wno-unused-function -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) @@ -10,7 +10,8 @@ LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_p AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ bwape.o kopen.o pemerge.o maxk.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o fastmap.o bwtsw2_pair.o + bwtsw2_chain.o fastmap.o bwtsw2_pair.o \ + fmt_idx.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread diff --git a/bwa.h b/bwa.h index 95c324b..90e18d2 100644 --- a/bwa.h +++ b/bwa.h @@ -30,6 +30,7 @@ #include #include "bntseq.h" #include "bwt.h" +#include "fmt_idx.h" #define BWA_IDX_BWT 0x1 #define BWA_IDX_BNS 0x2 @@ -47,6 +48,7 @@ typedef struct { bwt_t *bwt; // FM-index + bwtd_t *bwtd;// FMT-index bntseq_t *bns; // information on the reference sequences uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base diff --git a/fastmap.c b/fastmap.c index be7ba0e..3511263 100644 --- a/fastmap.c +++ b/fastmap.c @@ -38,6 +38,7 @@ #include "utils.h" #include "bntseq.h" #include "kseq.h" +#include "fmt_idx.h" KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; @@ -390,6 +391,9 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } } + + BuildBwtdFromBwt(aux.idx->bwt, &aux.idx->bwtd); + bwa_print_sam_hdr(aux.idx->bns, hdr_line); aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3); diff --git a/fmt_idx.c b/fmt_idx.c new file mode 100644 index 0000000..f35e3ab --- /dev/null +++ b/fmt_idx.c @@ -0,0 +1,27 @@ +/* +Description: 通过fmt-idx数据结构对seed过程进行加速(fm-index twice search in one time) + +Copyright : All right reserved by ICT + +Author : Zhang Zhonghai +Date : 2023/12/24 +*/ + +#include +#include +#include "fmt_idx.h" + +// 创建fmt-index索引数据 +void BuildBwtdFromBwt(bwt_t *bwt, bwtd_t **bwtd_p) { + *bwtd_p = (bwtd_t *)malloc(sizeof(bwtd_t)); + bwtd_t *bwtd = *bwtd_p; + + const int baseLen = 12; + const int kmerSize = 1 << (baseLen << 1); + + bwtd->kmer_range = (kmer_range_t *)malloc(kmerSize * sizeof(kmer_range_t)); + + printf("kmer size: %ld M\n", kmerSize * sizeof(kmer_range_t) / 1024 / 1024); + + exit(0); +} \ No newline at end of file diff --git a/fmt_idx.h b/fmt_idx.h new file mode 100644 index 0000000..edafff7 --- /dev/null +++ b/fmt_idx.h @@ -0,0 +1,44 @@ +/* +Description: 通过fmt-idx数据结构对seed过程进行加速(fm-index twice search in one time) + +Copyright : All right reserved by ICT + +Author : Zhang Zhonghai +Date : 2023/12/24 +*/ +#ifndef BWA_FMT_IDX_H +#define BWA_FMT_IDX_H +#include +#include +#include "bwt.h" + +typedef uint64_t bwtint_t; + +/* + ignore the first 12 bases, and give the entrence directly +*/ +typedef struct { + uint8_t range[9]; // 前36位表示起始行,后36位表示结束行(fm-index索引行) +} kmer_range_t; + +typedef struct +{ + bwtint_t primary; // S^{-1}(0), or the primary index of BWT + bwtint_t L2[5]; // C(), cumulative count + bwtint_t seq_len; // sequence length + bwtint_t bwt_size; // size of bwt, about seq_len/4 + uint32_t *bwt; // BWT + // kmer entry + kmer_range_t *kmer_range; + // occurance array, separated to two parts + uint32_t cnt_table[256]; + // suffix array + int sa_intv; + bwtint_t n_sa; + bwtint_t *sa; +} bwtd_t; + +// 创建fmt-index索引数据 +void BuildBwtdFromBwt(bwt_t *bwt, bwtd_t **bwtd_p); + +#endif \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..5664607 --- /dev/null +++ b/run.sh @@ -0,0 +1,25 @@ +time ./bwa mem -t 1 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ + /home/zzh/data/reference/human_g1k_v37_decoy.fasta \ + /home/zzh/data/fastq/nm1.fq \ + /home/zzh/data/fastq/nm2.fq #-o /dev/null + +# time ./bwa mem -t 1 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ +# /public/home/zzh/data/reference/human_g1k_v37_decoy.fasta \ +# /public/home/zzh/data/fastq/ZY2003109152013000/nm1.fq \ +# /public/home/zzh/data/fastq/ZY2003109152013000/nm2.fq \ +# -o /dev/null + +# time ./bwa mem -t 64 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \ +# /public/home/zzh/data/reference/human_g1k_v37_decoy.fasta \ +# /public/home/zzh/data/fastq/ZY2003109152013000/nm1.fq \ +# /public/home/zzh/data/fastq/ZY2003109152013000/nm2.fq \ +# -o /dev/null + +#/public/home/zzh/data/fastq/n1.fq \ +#/public/home/zzh/data/fastq/n2.fq \ +#/share_nas3/zyseq-release-v1.1.3/zyseq/wes/resource/reference/human_g1k_v37_decoy.fasta \ +#/share_nas3/zyseq-release-v1.1.3/zyseq/data/n1.fq \ +#/share_nas3/zyseq-release-v1.1.3/zyseq/data/n2.fq \ +#-o reads_mapping.sam + +