所有代码都合并了,还差一点建立索引的时候,一起都建立了
This commit is contained in:
parent
dd03596997
commit
20e072f6af
|
|
@ -5,6 +5,7 @@
|
|||
*.fa
|
||||
dataset/
|
||||
bwa
|
||||
fastbwa
|
||||
test
|
||||
test64
|
||||
.*.swp
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@
|
|||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/bwa",
|
||||
"program": "${workspaceRoot}/fastbwa",
|
||||
"args": [
|
||||
"mem",
|
||||
"-t",
|
||||
|
|
@ -30,10 +30,23 @@
|
|||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/bwa",
|
||||
"program": "${workspaceRoot}/fastbwa",
|
||||
"args": [
|
||||
"index",
|
||||
"/mnt/d/data/reference/human_g1k_v37_decoy.fasta"
|
||||
"~/data/reference/human_g1k_v37_decoy.fasta"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
},
|
||||
{
|
||||
"name": "buildkmer",
|
||||
"preLaunchTask": "Build",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/fastbwa",
|
||||
"args": [
|
||||
"buildkmer",
|
||||
"~/data/reference/human_g1k_v37_decoy.fasta.256.64.fmt",
|
||||
"~/data/reference/human_g1k_v37_decoy.fasta.kmer"
|
||||
],
|
||||
"cwd": "${workspaceFolder}", // 当前工作路径:当前文件所在的工作空间
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,6 +16,12 @@
|
|||
"emmintrin.h": "c",
|
||||
"bwamem.h": "c",
|
||||
"utils.h": "c",
|
||||
"stdio.h": "c"
|
||||
"stdio.h": "c",
|
||||
"kvec.h": "c",
|
||||
"string.h": "c",
|
||||
"stdlib.h": "c",
|
||||
"array": "c",
|
||||
"initializer_list": "c",
|
||||
"utility": "c"
|
||||
}
|
||||
}
|
||||
10
Makefile
10
Makefile
|
|
@ -6,13 +6,11 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
|||
|
||||
SHOW_PERF= -DSHOW_PERF
|
||||
SHOW_DATA_PERF= #-DSHOW_DATA_PERF
|
||||
DEBUG_OUTPUT= #-DDEBUG_OUTPUT
|
||||
DEBUG_SW_EXTEND= #-DDEBUG_SW_EXTEND
|
||||
FILTER_FULL_MATCH= #-DFILTER_FULL_MATCH
|
||||
USE_MT_READ= #-DUSE_MT_READ
|
||||
|
||||
AR= ar
|
||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(DEBUG_OUTPUT) $(DEBUG_SW_EXTEND) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL
|
||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) $(SHOW_PERF) $(SHOW_DATA_PERF) $(FILTER_FULL_MATCH) $(USE_MT_READ) -DUSE_AVX2 -DKSW_EQUAL
|
||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
|
||||
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o yarn.o
|
||||
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
|
|
@ -20,11 +18,11 @@ AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
|||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o \
|
||||
fmt_idx.o ksw_extend2_avx2.o ksw_extend2_avx2_u8.o
|
||||
PROG= bwa
|
||||
PROG= fastbwa
|
||||
INCLUDES=
|
||||
LIBS= -lm -lz -lpthread -ldl
|
||||
SUBDIRS= .
|
||||
JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a # -ljemalloc -L/home/zzh/work/jemalloc/lib/
|
||||
JE_MALLOC=/home/zzh/work/jemalloc/lib/libjemalloc.a
|
||||
|
||||
ifeq ($(shell uname -s),Linux)
|
||||
LIBS += -lrt
|
||||
|
|
@ -40,7 +38,7 @@ all:$(PROG)
|
|||
#bwa:libbwa.a $(AOBJS) main.o
|
||||
# $(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
bwa:libbwa.a $(AOBJS) main.o
|
||||
$(PROG):libbwa.a $(AOBJS) main.o
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) $(AOBJS) $(JE_MALLOC) main.o -o $@ -L. -lbwa $(LIBS)
|
||||
|
||||
bwamem-lite:libbwa.a example.o
|
||||
|
|
|
|||
2
bntseq.c
2
bntseq.c
|
|
@ -342,7 +342,7 @@ int bwa_fa2pac(int argc, char *argv[])
|
|||
}
|
||||
}
|
||||
if (argc == optind) {
|
||||
fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
|
||||
fprintf(stderr, "Usage: fastbwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
|
||||
return 1;
|
||||
}
|
||||
fp = xzopen(argv[optind], "r");
|
||||
|
|
|
|||
3
bwa.c
3
bwa.c
|
|
@ -455,9 +455,11 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
|
|||
//kmer_bit_fn = malloc(l_hint + 32);
|
||||
sa_fn = malloc(l_hint + 32);
|
||||
sprintf(suffix, ".256.%d.fmt", FMT_MID_INTERVAL);
|
||||
// sprintf(suffix, ".fmt");
|
||||
strcpy(fmt_idx_fn, hint);
|
||||
strcpy(fmt_idx_fn + l_hint, suffix);
|
||||
sprintf(suffix, ".%d.kmer", HASH_KMER_LEN);
|
||||
// sprintf(suffix, ".kmer");
|
||||
strcpy(kmer_idx_fn, hint);
|
||||
strcpy(kmer_idx_fn + l_hint, suffix);
|
||||
|
||||
|
|
@ -473,6 +475,7 @@ FMTIndex *bwa_idx_load_fmt(const char *hint)
|
|||
|
||||
strcpy(sa_fn, hint);
|
||||
sprintf(suffix, ".33.%d.sa", SA_INTV);
|
||||
// sprintf(suffix, ".bytesa");
|
||||
strcpy(sa_fn + l_hint, suffix); // partial suffix array (SA)
|
||||
fmt_restore_sa(sa_fn, fmt);
|
||||
|
||||
|
|
|
|||
18
bwamem.c
18
bwamem.c
|
|
@ -1447,9 +1447,17 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
while (x < len) {
|
||||
if (seq[x] < 4) {
|
||||
start_flag = 0;
|
||||
//int last_x = x;
|
||||
#ifdef DEBUG_OUTPUT
|
||||
#ifdef COUNT_SEED_LENGTH
|
||||
int last_x = x;
|
||||
#endif
|
||||
#endif
|
||||
x = fmt_smem(fmt, len, seq, x, start_width, opt->min_seed_len, 0, &a->mem1, a->tmpv[0]);
|
||||
//fprintf(gfp1, "%d\n", x - last_x);
|
||||
#ifdef DEBUG_OUTPUT
|
||||
#ifdef COUNT_SEED_LENGTH
|
||||
fprintf(gfp1, "%d\n", x - last_x);
|
||||
#endif
|
||||
#endif
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
int slen = (uint32_t)p->info - (p->info >> 32); // seed length
|
||||
|
|
@ -1458,7 +1466,9 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
kv_push(bwtintv_t, smem->mem, *p);
|
||||
}
|
||||
}
|
||||
//break; // for test full match time
|
||||
#ifdef COUNT_SEED_LENGTH
|
||||
break; // for test full match time
|
||||
#endif
|
||||
} else {
|
||||
++x;
|
||||
if (start_flag) ++start_N_num;
|
||||
|
|
@ -1486,9 +1496,11 @@ static void mem_collect_intv_batch(const mem_opt_t *opt, const FMTIndex *fmt, in
|
|||
|
||||
#ifdef DEBUG_OUTPUT
|
||||
if (start_N_num == 0) {
|
||||
#ifdef GET_FULL_MATCH_READ
|
||||
for (i = 0; i < len; ++i)
|
||||
fprintf(gfp1, "%c", "ACGT"[seq[i]]);
|
||||
fprintf(gfp1, "\n");
|
||||
#endif
|
||||
#ifdef SHOW_DATA_PERF
|
||||
gdat[2]++;
|
||||
if (gdat[2] % 100 == 0) {
|
||||
|
|
|
|||
2
bwape.c
2
bwape.c
|
|
@ -757,7 +757,7 @@ int bwa_sai2sam_pe(int argc, char *argv[])
|
|||
|
||||
if (optind + 5 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
|
||||
fprintf(stderr, "Usage: fastbwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
|
||||
fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
|
||||
fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
|
||||
fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);
|
||||
|
|
|
|||
2
bwase.c
2
bwase.c
|
|
@ -593,7 +593,7 @@ int bwa_sai2sam_se(int argc, char *argv[])
|
|||
}
|
||||
|
||||
if (optind + 3 > argc) {
|
||||
fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
|
||||
fprintf(stderr, "Usage: fastbwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
|
||||
return 1;
|
||||
}
|
||||
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
|
||||
|
|
|
|||
2
bwashm.c
2
bwashm.c
|
|
@ -186,7 +186,7 @@ int main_shm(int argc, char *argv[])
|
|||
else if (c == 'f') tmpfn = optarg;
|
||||
}
|
||||
if (optind == argc && !to_list && !to_drop) {
|
||||
fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
|
||||
fprintf(stderr, "\nUsage: fastbwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
|
||||
fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
|
||||
fprintf(stderr, " -l list names of indices in shared memory\n");
|
||||
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
|
||||
|
|
|
|||
2
bwtaln.c
2
bwtaln.c
|
|
@ -273,7 +273,7 @@ int bwa_aln(int argc, char *argv[])
|
|||
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
|
||||
fprintf(stderr, "Usage: fastbwa aln [options] <prefix> <in.fq>\n\n");
|
||||
fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
|
||||
BWA_AVG_ERR, opt->fnr);
|
||||
fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
|
||||
|
|
|
|||
52
bwtindex.c
52
bwtindex.c
|
|
@ -136,7 +136,7 @@ int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt
|
|||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
||||
fprintf(stderr, "Usage: fastbwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_pac2bwt(argv[optind], use_is);
|
||||
|
|
@ -175,7 +175,7 @@ int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
|
|||
{
|
||||
bwt_t *bwt;
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
||||
fprintf(stderr, "Usage: fastbwa bwtupdate <the.bwt>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[1]);
|
||||
|
|
@ -196,7 +196,7 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
|
|||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||
fprintf(stderr, "Usage: fastbwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
|
|
@ -206,7 +206,7 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
|
|||
return 0;
|
||||
}
|
||||
|
||||
int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command
|
||||
int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2bytesa" command
|
||||
{
|
||||
bwt_t *bwt;
|
||||
int c, sa_intv = 32;
|
||||
|
|
@ -217,7 +217,7 @@ int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command
|
|||
}
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa bwt2bytesa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||
fprintf(stderr, "Usage: fastbwa bwt2bytesa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
|
|
@ -227,6 +227,37 @@ int bwa_bwt2bytesa(int argc, char *argv[]) // the "bwt2sa" command
|
|||
return 0;
|
||||
}
|
||||
|
||||
int bwa_bwt2fmt(int argc, char *argv[]) // create fmt index
|
||||
{
|
||||
bwt_t *bwt;
|
||||
char buf[1024];
|
||||
if (optind + 1 > argc) {
|
||||
fprintf(stderr, "Usage: fastbwa bwt2fmt <in.bwt> <out.fmt>\n");
|
||||
return 1;
|
||||
}
|
||||
bwt = bwt_restore_bwt(argv[optind]);
|
||||
FMTIndex *fmt;
|
||||
fmt = create_fmt_from_bwt(bwt);
|
||||
dump_fmt(argv[optind + 1], fmt);
|
||||
// sprintf(buf, "%s.kmer", argv[optind + 1]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_build_kmer(int argc, char *argv[])
|
||||
{
|
||||
char buf[1024];
|
||||
if (optind + 1 > argc)
|
||||
{
|
||||
fprintf(stderr, "Usage: fastbwa build_kmerhash <in.fmt> <out.kmerhash>\n");
|
||||
return 1;
|
||||
}
|
||||
FMTIndex *fmt = fmt_restore_fmt(argv[optind]);
|
||||
fmt_create_kmer_index(fmt);
|
||||
sprintf(buf, "%s", argv[optind + 1]);
|
||||
fmt_dump_kmer_idx(buf, &fmt->kmer_hash);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_index(int argc, char *argv[]) // the "index" command
|
||||
{
|
||||
int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000;
|
||||
|
|
@ -253,7 +284,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
|
|||
|
||||
if (optind + 1 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
|
||||
fprintf(stderr, "Usage: fastbwa index [options] <in.fasta>\n\n");
|
||||
fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw, is or rb2 [auto]\n");
|
||||
fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
|
||||
fprintf(stderr, " -b INT block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
|
||||
|
|
@ -338,6 +369,15 @@ int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_s
|
|||
bwt_dump_sa(str3, bwt);
|
||||
bwt_destroy(bwt);
|
||||
if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||
{
|
||||
// build FMT-Index
|
||||
FMTIndex *fmt;
|
||||
strcpy(str, prefix); strcat(str, ".fmt");
|
||||
fmt = create_fmt_from_bwt(bwt);
|
||||
dump_fmt(str, fmt);
|
||||
// create Kmer-Hash
|
||||
|
||||
}
|
||||
}
|
||||
free(str3); free(str2); free(str);
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ int bwa_bwtsw2(int argc, char *argv[])
|
|||
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa bwasw [options] <target.prefix> <query.fa> [query2.fa]\n\n");
|
||||
fprintf(stderr, "Usage: fastbwa bwasw [options] <target.prefix> <query.fa> [query2.fa]\n\n");
|
||||
fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a);
|
||||
fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b);
|
||||
fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q);
|
||||
|
|
|
|||
2
debug.sh
2
debug.sh
|
|
@ -35,7 +35,7 @@ out=./out.sam
|
|||
# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \
|
||||
# -o /dev/null
|
||||
|
||||
time ./bwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
time ./fastbwa mem -t $thread -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
$reference \
|
||||
$n_r1 \
|
||||
$n_r2 \
|
||||
|
|
|
|||
|
|
@ -570,7 +570,7 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
if (optind + 1 >= argc || optind + 3 < argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
|
||||
fprintf(stderr, "Usage: fastbwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
|
||||
fprintf(stderr, "Algorithm options:\n\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -b INT batch size of reads to process at one time [%d]\n", opt->batch_size);
|
||||
|
|
@ -849,7 +849,7 @@ int main_fastmap(int argc, char *argv[])
|
|||
}
|
||||
if (optind + 1 >= argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
|
||||
fprintf(stderr, "Usage: fastbwa fastmap [options] <idxbase> <in.fq>\n\n");
|
||||
fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
|
||||
fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth);
|
||||
fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);
|
||||
|
|
|
|||
197
fmt_idx.c
197
fmt_idx.c
|
|
@ -15,22 +15,23 @@ Date : 2023/12/24
|
|||
#include "utils.h"
|
||||
#include "bntseq.h"
|
||||
#include "kvec.h"
|
||||
#include "kstring.h"
|
||||
|
||||
const static char BASE[4] = {'A', 'C', 'G', 'T'};
|
||||
//const static char BASE[4] = {'A', 'C', 'G', 'T'};
|
||||
|
||||
// 生成所有KMER_LEN长度的序列,字符串表示
|
||||
void gen_all_seq(char **seq_arr, int kmer_len)
|
||||
{
|
||||
uint32_t seq_up_val = (1 << (kmer_len << 1));
|
||||
for (uint32_t i = 0; i < seq_up_val; ++i)
|
||||
{
|
||||
seq_arr[i] = (char *)malloc(kmer_len);
|
||||
for (int j = kmer_len - 1; j >= 0; --j)
|
||||
{
|
||||
seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
|
||||
}
|
||||
}
|
||||
}
|
||||
//void gen_all_seq(char **seq_arr, int kmer_len)
|
||||
//{
|
||||
// uint32_t seq_up_val = (1 << (kmer_len << 1));
|
||||
// for (uint32_t i = 0; i < seq_up_val; ++i)
|
||||
// {
|
||||
// seq_arr[i] = (char *)malloc(kmer_len);
|
||||
// for (int j = kmer_len - 1; j >= 0; --j)
|
||||
// {
|
||||
// seq_arr[i][kmer_len - 1 - j] = BASE[(i >> (j << 1)) & 3];
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
// 生成occ,每个字节对应一个pattern
|
||||
void fmt_gen_cnt_occ(FMTIndex *fmt)
|
||||
|
|
@ -123,8 +124,12 @@ void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh)
|
|||
err_fwrite(kh->ke10, 1, (1 << (10 << 1)) * sizeof(KmerEntryArr), fp);
|
||||
err_fwrite(kh->ke11, 1, (1 << (11 << 1)) * sizeof(KmerEntry), fp);
|
||||
err_fwrite(kh->ke12, 1, (1 << (12 << 1)) * sizeof(KmerEntry), fp);
|
||||
#if HASH_KMER_LEN > 12
|
||||
err_fwrite(kh->ke13, 1, (1 << (13 << 1)) * sizeof(KmerEntry), fp);
|
||||
#endif
|
||||
#if HASH_KMER_LEN > 13
|
||||
err_fwrite(kh->ke14, 1, (1 << (14 << 1)) * sizeof(KmerEntry), fp);
|
||||
#endif
|
||||
err_fflush(fp);
|
||||
err_fclose(fp);
|
||||
}
|
||||
|
|
@ -158,6 +163,156 @@ KmerHash fmt_restore_kmer_idx(const char *fn)
|
|||
return khash;
|
||||
}
|
||||
|
||||
// 生成所有KMER_LEN长度的序列,字符串表示
|
||||
void gen_kmer_base(uint8_t **seq_arr, uint64_t *kmer_arr_size, int kmer_len)
|
||||
{
|
||||
uint64_t i;
|
||||
uint64_t seq_up_val = (1 << (kmer_len << 1));
|
||||
*kmer_arr_size = (uint64_t)seq_up_val;
|
||||
*seq_arr = realloc(*seq_arr, seq_up_val * (uint64_t)kmer_len);
|
||||
for (i = 0; i < seq_up_val; ++i)
|
||||
{
|
||||
const uint64_t base_idx = i * kmer_len;
|
||||
for (int j = kmer_len - 1; j >= 0; --j)
|
||||
{
|
||||
(*seq_arr)[base_idx + kmer_len - 1 - j] = (i >> (j << 1)) & 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t global_num = 0;
|
||||
|
||||
// 利用fmt搜索seed,完整搜索,只需要单向搜索
|
||||
bwtintv_t fmt_search(FMTIndex *fmt, const uint8_t *q, int qlen)
|
||||
{
|
||||
bwtintv_t ik;
|
||||
bwtintv_t ok1;
|
||||
bwtintv_t ok2;
|
||||
int i, c1, c2, x = 0;
|
||||
|
||||
fmt_set_intv(fmt, q[x], ik);
|
||||
ik.info = x + 1;
|
||||
for (i = x + 1; i + 1 < qlen; i += 2)
|
||||
{
|
||||
if (q[i] < 4 && q[i + 1] < 4)
|
||||
{
|
||||
c1 = 3 - q[i];
|
||||
c2 = 3 - q[i + 1];
|
||||
fmt_extend2(fmt, &ik, &ok1, &ok2, 0, c1, c2);
|
||||
ik = ok2;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i < qlen && q[i] < 4)
|
||||
{ // 最后一次扩展
|
||||
c1 = 3 - q[i];
|
||||
fmt_extend1(fmt, &ik, &ok1, 0, c1);
|
||||
//if (qlen == 14) fprintf(stderr, "%ld %ld %ld\n", ok1.x[0], ok1.x[1], ok1.x[2]);
|
||||
//if (qlen == 14) ++global_num;
|
||||
//if (qlen == 14 && global_num % 10000 == 0) fprintf(stderr, "%ld\n", global_num);
|
||||
ik = ok1;
|
||||
ik.info = i + 1;
|
||||
}
|
||||
return ik;
|
||||
}
|
||||
|
||||
// 创建xmer hash索引
|
||||
void fmt_create_kmer_index(FMTIndex *fmt) {
|
||||
uint64_t kmer_arr_size = 0;
|
||||
uint8_t *seq_arr = 0;
|
||||
gen_kmer_base(&seq_arr, &kmer_arr_size, 10);
|
||||
bwtintv_t ik;
|
||||
uint64_t j;
|
||||
int i, c1, c2;
|
||||
int qlen = 10;
|
||||
bwtint_t tk[4], tl[4];
|
||||
KmerHash *kh = &fmt->kmer_hash;
|
||||
|
||||
kh->ke10 = (KmerEntryArr *)malloc((1 << (10 << 1)) * sizeof(KmerEntryArr));
|
||||
kh->ke11 = (KmerEntry *)malloc((1 << (11 << 1)) * sizeof(KmerEntry));
|
||||
kh->ke12 = (KmerEntry *)malloc((1 << (12 << 1)) * sizeof(KmerEntry));
|
||||
kh->ke13 = (KmerEntry *)malloc((1 << (13 << 1)) * sizeof(KmerEntry));
|
||||
kh->ke14 = (KmerEntry *)malloc((1 << (14 << 1)) * sizeof(KmerEntry));
|
||||
|
||||
// 长度为10的kmer
|
||||
for (j = 0; j < kmer_arr_size; ++j)
|
||||
{
|
||||
uint8_t *q = &seq_arr[j * 10];
|
||||
uint8_t *mem_addr = kh->ke10[j].intv_arr;
|
||||
fmt_set_intv(fmt, q[0], ik);
|
||||
kmer_setval_at(mem_addr, ik, 0);
|
||||
|
||||
// 每次扩展两个碱基
|
||||
for (i = 1; i + 1 < qlen; i += 2)
|
||||
{
|
||||
c1 = 3 - q[i];
|
||||
c2 = 3 - q[i + 1];
|
||||
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
|
||||
// 第一次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
|
||||
ik.x[2] = tl[1] - tk[1];
|
||||
kmer_setval_at(mem_addr, ik, i);
|
||||
|
||||
// 第二次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[2] - tk[2];
|
||||
ik.x[1] = fmt->L2[c2] + 1 + tk[3];
|
||||
ik.x[2] = tl[3] - tk[3];
|
||||
kmer_setval_at(mem_addr, ik, i + 1);
|
||||
}
|
||||
{ // 最后一次扩展
|
||||
c1 = 3 - q[i];
|
||||
c2 = 3;
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1, c1, c2, tk);
|
||||
fmt_e2_occ(fmt, ik.x[1] - 1 + ik.x[2], c1, c2, tl);
|
||||
// 第一次扩展的结果
|
||||
ik.x[0] = ik.x[0] + (ik.x[1] <= fmt->primary && ik.x[1] + ik.x[2] - 1 >= fmt->primary) + tl[0] - tk[0];
|
||||
ik.x[1] = fmt->L2[c1] + 1 + tk[1];
|
||||
ik.x[2] = tl[1] - tk[1];
|
||||
kmer_setval_at(mem_addr, ik, i);
|
||||
}
|
||||
}
|
||||
// 长度11的kmer
|
||||
gen_kmer_base(&seq_arr, &kmer_arr_size, 11);
|
||||
for (j = 0; j < kmer_arr_size; ++j)
|
||||
{
|
||||
bwtintv_t p = fmt_search(fmt, &seq_arr[j * 11], 11);
|
||||
kmer_setval_at(fmt->kmer_hash.ke11[j].intv_arr, p, 0);
|
||||
}
|
||||
|
||||
// 长度12的kmer
|
||||
gen_kmer_base(&seq_arr, &kmer_arr_size, 12);
|
||||
for (j = 0; j < kmer_arr_size; ++j)
|
||||
{
|
||||
bwtintv_t p = fmt_search(fmt, &seq_arr[j * 12], 12);
|
||||
kmer_setval_at(fmt->kmer_hash.ke12[j].intv_arr, p, 0);
|
||||
}
|
||||
gen_kmer_base(&seq_arr, &kmer_arr_size, 13);
|
||||
for (j = 0; j < kmer_arr_size; ++j)
|
||||
{
|
||||
bwtintv_t p = fmt_search(fmt, &seq_arr[j * 13], 13);
|
||||
kmer_setval_at(fmt->kmer_hash.ke13[j].intv_arr, p, 0);
|
||||
}
|
||||
|
||||
// 长度14的kmer
|
||||
gen_kmer_base(&seq_arr, &kmer_arr_size, 14);
|
||||
//fprintf(stderr, "14-size:%ld\n", kmer_arr_size);
|
||||
for (j = 0; j < kmer_arr_size; ++j)
|
||||
{
|
||||
//if (j % 10000 == 0) fprintf(stderr, "arr_size: %ld, %ld\n", j, kmer_arr_size);
|
||||
bwtintv_t p = fmt_search(fmt, &seq_arr[j * 14], 14);
|
||||
kmer_setval_at(fmt->kmer_hash.ke14[j].intv_arr, p, 0);
|
||||
}
|
||||
free(seq_arr);
|
||||
}
|
||||
|
||||
// 读取sa数据
|
||||
void fmt_restore_sa(const char *fn, FMTIndex *fmt)
|
||||
{
|
||||
|
|
@ -241,7 +396,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
|
|||
if (i % 16 == 0) // 每个32位整数可以包含16个碱基,每次需要处理16个碱基,也就是间隔最小可以设置为16
|
||||
{
|
||||
uint32_t pre_bwt_16_seq = 0; // 16个pre-bwt碱基串
|
||||
uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 4; // 这里加4还是加8要看保存occ的是是uint32还是uint64,bwt字符串i对应的基准行,因为原始的bwt-cp(check point)包含由4个uint32_t(8个uint32_t)组成的occ信息
|
||||
uint32_t *bwt_addr = bwt_occ_intv(bwt, i) + 8;//4; // 这里加4还是加8要看保存occ的是是uint32还是uint64,bwt字符串i对应的基准行,因为原始的bwt-cp(check point)包含由4个uint32_t(8个uint32_t)组成的occ信息
|
||||
int offset = (i % OCC_INTERVAL) / 16; // 每OCC_INTERVAL个碱基共享同一个基准地址,每16个碱基共用一个uint32整型,因此需要偏移量来获取当前碱基串的首地址
|
||||
uint32_t bwt_16_seq = *(bwt_addr + offset); // 待处理的当前16个碱基串的首地址
|
||||
for (j = 0; j < 16; ++j) // 对于bwt碱基串,一个一个碱基分别处理
|
||||
|
|
@ -335,7 +490,7 @@ FMTIndex *create_fmt_from_bwt(bwt_t *bwt)
|
|||
k += 4;
|
||||
memcpy(buf + k, c2, sizeof(uint32_t) * 16);
|
||||
k += 16;
|
||||
xassert(k == fmt->bwt_size, "inconsistent bwt_size");
|
||||
xassert(k == fmt->bwt_size, "inconsistent fmt_size");
|
||||
// update fmt
|
||||
fmt->bwt = buf;
|
||||
return fmt;
|
||||
|
|
@ -708,10 +863,18 @@ int fmt_smem_forward(const FMTIndex *fmt, int len, const uint8_t *q, int x, int
|
|||
#if 1
|
||||
if (min_intv == 1 && ok2.x[2] == min_intv)
|
||||
{
|
||||
// fprintf(gfp1, "%d\t", i + 2 - x);
|
||||
#ifdef DEBUG_OUTPUT
|
||||
#ifdef COUNT_SEED_LENGTH
|
||||
fprintf(gfp1, "%d\t", i + 2 - x);
|
||||
#endif
|
||||
#endif
|
||||
direct_extend(fmt, len, q, x, i + 2, ok2.x[0], &mt);
|
||||
//mt.x[0] = ok2.x[0];
|
||||
//fprintf(gfp1, "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
||||
#ifdef DEBUG_OUTPUT
|
||||
#if 0
|
||||
fprintf(gfp1, "mt %ld %ld\n", ok2.x[0], ok2.x[1]);
|
||||
#endif
|
||||
#endif
|
||||
kv_push(bwtintv_t, *mem, mt);
|
||||
ret = (uint32_t)mt.info;
|
||||
goto fmt_smem_forward_end;
|
||||
|
|
|
|||
|
|
@ -80,6 +80,7 @@ typedef struct
|
|||
void dump_fmt(const char *fn, const FMTIndex *fmt);
|
||||
// 从文件中读取fmt结构数据
|
||||
FMTIndex *fmt_restore_fmt(const char *fn);
|
||||
void fmt_create_kmer_index(FMTIndex *fmt);
|
||||
// 将kmer hash数据写入到文件
|
||||
void fmt_dump_kmer_idx(const char *fn, const KmerHash *kh);
|
||||
// 从文件中读取kmer hash信息
|
||||
|
|
@ -100,7 +101,7 @@ void fmt_extend2(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok1, bwtintv_t *
|
|||
void fmt_direct_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1);
|
||||
void fmt_extend1(const FMTIndex *fmt, bwtintv_t *ik, bwtintv_t *ok, int is_back, int b1);
|
||||
// 生成所有KMER_LEN长度的序列,字符串表示
|
||||
void gen_all_seq(char **seq_arr, int kmer_len);
|
||||
// void gen_all_seq(char **seq_arr, int kmer_len);
|
||||
// 设置kmer第pos个碱基对应的fmt匹配信息
|
||||
void kmer_setval_at(uint8_t *mem_addr, bwtintv_t ik, int pos);
|
||||
// 获取kmer的fmt匹配信息
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include <stdio.h>
|
||||
#include <immintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#include "utils.h"
|
||||
|
||||
#ifdef DEBUG_OUTPUT
|
||||
extern FILE *gfp1, *gfp2, *gfp3;
|
||||
|
|
@ -21,13 +22,13 @@ extern FILE *gfq[4], *gft[4], *gfi[4];
|
|||
#define UNLIKELY(x) (x)
|
||||
#endif
|
||||
|
||||
#undef MAX
|
||||
#undef MIN
|
||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
//#undef MAX
|
||||
//#undef MIN
|
||||
//#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
//#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
#define SIMD_WIDTH 16
|
||||
|
||||
typedef struct { size_t m; uint8_t *addr; } buf_t;
|
||||
// typedef struct { size_t m; uint8_t *addr; } buf_t;
|
||||
|
||||
extern int ksw_extend2_avx2_u8(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int is_left, int m, const int8_t *mat, int o_del, int e_del,
|
||||
int o_ins, int e_ins, int a, int b, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off, buf_t *buf);
|
||||
|
|
@ -175,6 +176,7 @@ static const uint16_t h_vec_int_mask[SIMD_WIDTH][SIMD_WIDTH] = {
|
|||
|
||||
void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int h0, int fnum)
|
||||
{
|
||||
#ifdef DEBUG_OUTPUT
|
||||
// 写到三个文件里,query.fa,target.fa,每行一个序列,info.txt,包含前缀得分h0,和长度信息qlen,tlen
|
||||
FILE *query_f = gfq[fnum],
|
||||
*target_f = gft[fnum],
|
||||
|
|
@ -191,6 +193,7 @@ void write_query_target_sequence(int qlen, const uint8_t *query, int tlen, const
|
|||
fprintf(target_f, "\n");
|
||||
// 处理其他信息
|
||||
fprintf(info_f, "%-8d%-8d%-8d\n", qlen, tlen, h0);
|
||||
#endif
|
||||
}
|
||||
|
||||
int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度
|
||||
|
|
@ -221,6 +224,7 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度
|
|||
|
||||
#ifdef DEBUG_OUTPUT
|
||||
//fprintf(gfp1, "%d\n", qlen);
|
||||
#ifdef GET_DIFFERENT_EXTENSION_LENGTH
|
||||
if (qlen <= 30) {
|
||||
write_query_target_sequence(qlen, query, tlen, target, h0, 0);
|
||||
} else if (qlen < 60) {
|
||||
|
|
@ -230,6 +234,7 @@ int ksw_extend2_avx2(int qlen, // query length 待匹配段碱基的query长度
|
|||
} else {
|
||||
write_query_target_sequence(qlen, query, tlen, target, h0, 3);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (qlen * a + h0 < 255) return ksw_extend2_avx2_u8(qlen, query, tlen, target, is_left, m, mat, o_del, e_del, o_ins, e_ins, a, b, w, end_bonus, zdrop, h0, _qle, _tle, _gtle, _gscore, _max_off, buf);
|
||||
|
|
@ -654,7 +659,7 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
|
|||
#endif
|
||||
|
||||
#ifdef DEBUG_OUTPUT
|
||||
// fprintf(gfp1, "start\n");
|
||||
#ifdef COUNT_CALC_NUM
|
||||
int bsw_cal_num = 0;
|
||||
int real_cal_num = 0;
|
||||
for (i = 0; i < tlen; ++i)
|
||||
|
|
@ -664,7 +669,12 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
|
|||
if (beg >= end) break;
|
||||
bsw_cal_num += end - beg;
|
||||
}
|
||||
// fprintf(gfp1, "%d\n", bsw_cal_num);
|
||||
fprintf(gfp1, "start\n%d\n", bsw_cal_num);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ELIMINATE_DIFF_3
|
||||
int prun_end = qlen; // for test diff_3
|
||||
#endif
|
||||
|
||||
for (i = 0; LIKELY(i < tlen); ++i) {
|
||||
|
|
@ -683,8 +693,10 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
|
|||
for (j = beg; LIKELY(j < end); ++j) {
|
||||
|
||||
#ifdef DEBUG_OUTPUT
|
||||
#ifdef COUNT_CALC_NUM
|
||||
real_cal_num++;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_SW_EXTEND
|
||||
ins[i+1][j+1] = f;
|
||||
|
|
@ -700,6 +712,9 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
|
|||
M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M",保证分值不小于0,sw和nw的区别
|
||||
h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0
|
||||
h = h > f? h : f;
|
||||
#ifdef ELIMINATE_DIFF_3
|
||||
if (j >= prun_end && h==0) break; // for test diff_3
|
||||
#endif
|
||||
h1 = h; // save H(i,j) to h1 for the next column
|
||||
|
||||
#ifdef DEBUG_SW_EXTEND
|
||||
|
|
@ -710,10 +725,13 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
|
|||
t = M - oe_del;
|
||||
t = t > 0? t : 0;
|
||||
e -= e_del;
|
||||
#ifdef DEBUG_SW_EXTEND
|
||||
del[i + 1][j + 1] = e;
|
||||
#endif
|
||||
e = e > t? e : t; // computed E(i+1,j)
|
||||
|
||||
#ifdef DEBUG_SW_EXTEND
|
||||
del[i+1][j+1] = e;
|
||||
// del[i+1][j+1] = e;
|
||||
#endif
|
||||
p->e = e; // save E(i+1,j) for the next row
|
||||
t = M - oe_ins;
|
||||
|
|
@ -742,11 +760,15 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
|
|||
for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); // 这里为什么不考虑f(insert score)
|
||||
beg = j;
|
||||
for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
|
||||
#ifdef ELIMINATE_DIFF_3
|
||||
prun_end = j + 2 < qlen ? j + 2 : qlen; end = qlen; // for test diff_3
|
||||
#else
|
||||
end = j + 2 < qlen? j + 2 : qlen;
|
||||
//beg = 0; end = qlen; // uncomment this line for debugging
|
||||
//if (print_flag) {
|
||||
#endif
|
||||
// beg = 0; end = qlen; // uncomment this line for debugging
|
||||
// if (print_flag) {
|
||||
// fprintf(stderr, "beg: %d; end: %d\n", beg, end);
|
||||
//}
|
||||
// }
|
||||
}
|
||||
#ifdef DEBUG_OUTPUT
|
||||
#ifdef DEBUG_SW_EXTEND
|
||||
|
|
@ -797,7 +819,9 @@ int ksw_extend2_origin(int qlen, // query length 待匹配段碱基的query长
|
|||
#endif
|
||||
|
||||
#ifdef DEBUG_OUTPUT
|
||||
//fprintf(gfp1, "%d\nend\n", real_cal_num);
|
||||
#ifdef COUNT_CALC_NUM
|
||||
fprintf(gfp1, "%d\nend\n", real_cal_num);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
free(eh); free(qp); free(qmem);
|
||||
|
|
|
|||
13
main.c
13
main.c
|
|
@ -30,7 +30,7 @@
|
|||
#include "utils.h"
|
||||
|
||||
#ifndef PACKAGE_VERSION
|
||||
#define PACKAGE_VERSION "0.7.17-r1198-dirty"
|
||||
#define PACKAGE_VERSION "v1.0-0.7.17"
|
||||
#endif
|
||||
|
||||
int bwa_fa2pac(int argc, char *argv[]);
|
||||
|
|
@ -38,6 +38,8 @@ int bwa_pac2bwt(int argc, char *argv[]);
|
|||
int bwa_bwtupdate(int argc, char *argv[]);
|
||||
int bwa_bwt2sa(int argc, char *argv[]);
|
||||
int bwa_bwt2bytesa(int argc, char *argv[]);
|
||||
int bwa_bwt2fmt(int argc, char *argv[]);
|
||||
int bwa_build_kmer(int argc, char *argv[]);
|
||||
int bwa_index(int argc, char *argv[]);
|
||||
int bwt_bwtgen_main(int argc, char *argv[]);
|
||||
|
||||
|
|
@ -57,10 +59,11 @@ int main_maxk(int argc, char *argv[]);
|
|||
static int usage()
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n");
|
||||
fprintf(stderr, "Program: fast-bwa-mem (alignment via Burrows-Wheeler transformation)\n");
|
||||
fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
|
||||
fprintf(stderr, "Contact: Heng Li <hli@ds.dfci.harvard.edu>\n\n");
|
||||
fprintf(stderr, "Usage: bwa <command> [options]\n\n");
|
||||
fprintf(stderr, "Contact: Heng Li <hli@ds.dfci.harvard.edu> (for bwa)\n\n");
|
||||
fprintf(stderr, "Contact: Zhonghai Zhang <zhangzhonghai@ict.ac.cn> (for fast-bwa-mem)\n\n");
|
||||
fprintf(stderr, "Usage: fastbwa <command> [options]\n\n");
|
||||
fprintf(stderr, "Command: index index sequences in the FASTA format\n");
|
||||
fprintf(stderr, " mem BWA-MEM algorithm\n");
|
||||
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
||||
|
|
@ -103,6 +106,8 @@ int main(int argc, char *argv[])
|
|||
else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2bytesa") == 0) ret = bwa_bwt2bytesa(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "bwt2fmt") == 0) ret = bwa_bwt2fmt(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "buildkmer") == 0) ret = bwa_build_kmer(argc - 1, argv + 1);
|
||||
else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
|
||||
|
|
|
|||
2
maxk.c
2
maxk.c
|
|
@ -23,7 +23,7 @@ int main_maxk(int argc, char *argv[])
|
|||
if (c == 's') self = 1;
|
||||
}
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "Usage: bwa maxk [-s] <index.prefix> <seq.fa>\n");
|
||||
fprintf(stderr, "Usage: fastbwa maxk [-s] <index.prefix> <seq.fa>\n");
|
||||
return 1;
|
||||
}
|
||||
fp = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "rb") : gzdopen(fileno(stdin), "rb");
|
||||
|
|
|
|||
|
|
@ -238,7 +238,7 @@ int main_pemerge(int argc, char *argv[])
|
|||
|
||||
if (optind == argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
|
||||
fprintf(stderr, "Usage: fastbwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
|
||||
fprintf(stderr, "Options: -m output merged reads only\n");
|
||||
fprintf(stderr, " -u output unmerged reads only\n");
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
|
|
|
|||
74
run.sh
74
run.sh
|
|
@ -1,63 +1,23 @@
|
|||
thread=1
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_150/bn1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_150/bn2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wes_144/SRR25735653_1.fastq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wes_144/SRR25735653_2.fastq
|
||||
|
||||
## d1
|
||||
n_r1=~/data/fastq/dataset/na12878_wes_144/s_1.fq
|
||||
n_r2=~/data/fastq/dataset/na12878_wes_144/s_2.fq
|
||||
## d2
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/s_2.fq
|
||||
# d3
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_150/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_150/s_2.fq
|
||||
#n_r1=~/data/fastq/dataset/na12878_wgs_101/na_1.fq
|
||||
#n_r2=~/data/fastq/dataset/na12878_wgs_101/na_2.fq
|
||||
n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
|
||||
n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
|
||||
#n_r1=~/data/fastq/dataset/zy_wes/bloodgDNA_r1.fastq
|
||||
#n_r2=~/data/fastq/dataset/zy_wes/bloodgDNA_r2.fastq
|
||||
## d4
|
||||
#n_r1=~/data/fastq/dataset/zy_wes/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/zy_wes/s_2.fq
|
||||
## d5
|
||||
#n_r1=~/data/fastq/dataset/zy_wgs/s_1.fq
|
||||
#n_r2=~/data/fastq/dataset/zy_wgs/s_2.fq
|
||||
|
||||
|
||||
#n_r1=~/data/fastq/zy/wgs/n1.fq
|
||||
#n_r2=~/data/fastq/zy/wgs/n2.fq
|
||||
#n_r1=~/data/fastq/ds/n1.fq
|
||||
#n_r2=~/data/fastq/ds/n2.fq
|
||||
#n_r1=~/data/fastq/platinum/n1.fq
|
||||
#n_r2=~/data/fastq/platinum/n2.fq
|
||||
#n_r1=~/data/fastq/zy/t1.fq
|
||||
#n_r2=~/data/fastq/zy/t2.fq
|
||||
#n_r1=~/data/fastq/zy/n1.fq
|
||||
#n_r2=~/data/fastq/zy/n2.fq
|
||||
#n_r1=~/data/fastq/ds/d1_1.fq
|
||||
#n_r2=~/data/fastq/ds/d1_2.fq
|
||||
#n_r1=~/data/fastq/ds/d2_1.fq
|
||||
#n_r2=~/data/fastq/ds/d2_2.fq
|
||||
#n_r1=~/data/fastq/ds/wes/n1.fq
|
||||
#n_r2=~/data/fastq/ds/wes/n2.fq
|
||||
#n_r1=~/fastq/ZY2105177532213010_L4_1.fq
|
||||
#n_r2=~/fastq/ZY2105177532213010_L4_2.fq
|
||||
#n_r1=~/data/fastq/na12878/nas_1.fq
|
||||
#n_r2=~/data/fastq/na12878/nas_2.fq
|
||||
#n_r1=~/data/fastq/na12878/na_1.fq
|
||||
#n_r2=~/data/fastq/na12878/na_2.fq
|
||||
#n_r1=~/fastq/na12878/na12878_r1.fq
|
||||
#n_r2=~/fastq/na12878/na12878_r2.fq
|
||||
#n_r1=~/fastq/n_r1.fq
|
||||
#n_r2=~/fastq/n_r2.fq
|
||||
#n_r1=~/data/fastq/ZY2105177532213000/n_r1.fq
|
||||
#n_r2=~/data/fastq/ZY2105177532213000/n_r2.fq
|
||||
#n_r1=~/data/fastq/ZY2105177532213000/sn_r1.fq
|
||||
#n_r2=~/data/fastq/ZY2105177532213000/sn_r2.fq
|
||||
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||
#n_r1=~/data/fastq/sn_r1.fq
|
||||
#n_r2=~/data/fastq/sn_r2.fq
|
||||
#n_r1=~/data/fastq/ssn_r1.fq
|
||||
#n_r2=~/data/fastq/ssn_r2.fq
|
||||
#n_r1=~/fastq/ERR194147_1_120w.fastq
|
||||
#n_r2=~/fastq/ERR194147_2_120w.fastq
|
||||
#n_r1=~/fastq/tiny_n_r1.fq
|
||||
#n_r2=~/fastq/tiny_n_r2.fq
|
||||
#n_r1=~/fastq/diff_r1.fq
|
||||
#n_r2=~/fastq/diff_r2.fq
|
||||
#n_r1=~/fastq/d_r1.fq
|
||||
#n_r2=~/fastq/d_r2.fq
|
||||
reference=~/reference/human_g1k_v37_decoy.fasta
|
||||
#reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||
#reference=~/reference/bwa/human_g1k_v37_decoy.fasta
|
||||
reference=~/data/reference/human_g1k_v37_decoy.fasta
|
||||
#out=./all.sam
|
||||
#out=./sn.sam
|
||||
#out=./ssn-x1.sam
|
||||
|
|
@ -75,7 +35,7 @@ out=/dev/null
|
|||
# /mnt/d/data/fastq/ZY2105177532213000/ZY2105177532213010_L4_2.fq.gz \
|
||||
# -o /dev/null
|
||||
|
||||
time ./bwa mem -t $thread -b 256 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
time ./fastbwa mem -t $thread -b 256 -M -R @RG\\tID:normal\\tSM:normal\\tPL:illumina\\tLB:normal\\tPG:bwa \
|
||||
$reference \
|
||||
$n_r1 \
|
||||
$n_r2 \
|
||||
|
|
|
|||
12
utils.h
12
utils.h
|
|
@ -31,6 +31,17 @@
|
|||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
|
||||
// for debug and test
|
||||
|
||||
//#define DEBUG_OUTPUT // 打开gfp1-4文件,并记录debug信息
|
||||
//#define COUNT_SEED_LENGTH // 记录seed匹配数量降低到1时的长度,以及最终扩展的长度
|
||||
//#define GET_FULL_MATCH_READ // 获取完全匹配的reads
|
||||
//#define COUNT_CALC_NUM // 统计BSW的剪枝后的计算量和未剪枝前的计算量
|
||||
// #define GET_DIFFERENT_EXTENSION_LENGTH // 获取不同长度extension的query,target,和其他用于计算的数据
|
||||
//#define DEBUG_SW_EXTEND // 将bsw的分值输入到debug文件里
|
||||
|
||||
////////////////////
|
||||
|
||||
#define USE_RDTSC 1
|
||||
|
||||
#ifdef SHOW_PERF
|
||||
|
|
@ -96,6 +107,7 @@ typedef struct {
|
|||
} pair64_t;
|
||||
|
||||
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
|
||||
typedef struct { size_t n, m; uint32_t *a; } uint32_v;
|
||||
typedef struct { size_t n, m; pair64_t *a; } pair64_v;
|
||||
|
||||
typedef struct { size_t m; uint8_t *addr; } buf_t;
|
||||
|
|
|
|||
Loading…
Reference in New Issue