diff --git a/Makefile b/Makefile index 726cdf2..7151435 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,10 @@ CFLAGS= -g -Wall -Wno-unused-function -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o -AOBJS= QSufSort.o bwt_gen.o rope.o rle.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtindex.o bwape.o kopen.o pemerge.o maxk.o \ +LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \ + QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o +AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ + bwape.o kopen.o pemerge.o maxk.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa @@ -63,7 +64,7 @@ bwt_gen.o: QSufSort.h malloc_wrap.h bwt_lite.o: bwt_lite.h malloc_wrap.h bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h -bwtindex.o: bntseq.h bwt.h utils.h rle.h rope.h malloc_wrap.h +bwtindex.o: bntseq.h bwa.h bwt.h utils.h rle.h rope.h malloc_wrap.h bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h diff --git a/bwa.h b/bwa.h index 1541c1c..aa21725 100644 --- a/bwa.h +++ b/bwa.h @@ -12,6 +12,11 @@ #define BWA_CTL_SIZE 0x10000 +#define BWTALGO_AUTO 0 +#define BWTALGO_RB2 1 +#define BWTALGO_BWTSW 2 +#define BWTALGO_IS 3 + typedef struct { bwt_t *bwt; // FM-index bntseq_t *bns; // information on the reference sequences @@ -41,6 +46,8 @@ extern "C" { uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); + int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size); + char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); diff --git a/bwtindex.c b/bwtindex.c index 826bb5a..fb32231 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -32,6 +32,7 @@ #include #include #include "bntseq.h" +#include "bwa.h" #include "bwt.h" #include "utils.h" #include "rle.h" @@ -208,19 +209,14 @@ int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command int bwa_index(int argc, char *argv[]) // the "index" command { - extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); - - char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 0, is_64 = 0, block_size = 10000000; - clock_t t; - int64_t l_pac; - + int c, algo_type = BWTALGO_AUTO, is_64 = 0, block_size = 10000000; + char *prefix = 0, *str; while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later - if (strcmp(optarg, "rb2") == 0) algo_type = 1; - else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2; - else if (strcmp(optarg, "is") == 0) algo_type = 3; + if (strcmp(optarg, "rb2") == 0) algo_type = BWTALGO_RB2; + else if (strcmp(optarg, "bwtsw") == 0) algo_type = BWTALGO_BWTSW; + else if (strcmp(optarg, "is") == 0) algo_type = BWTALGO_IS; else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; case 'p': prefix = strdup(optarg); break; @@ -252,16 +248,29 @@ int bwa_index(int argc, char *argv[]) // the "index" command strcpy(prefix, argv[optind]); if (is_64) strcat(prefix, ".64"); } + bwa_idx_build(argv[optind], prefix, algo_type, block_size); + free(prefix); + return 0; +} + +int bwa_idx_build(const char *fa, const char *prefix, int algo_type, int block_size) +{ + extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); + + char *str, *str2, *str3; + clock_t t; + int64_t l_pac; + str = (char*)calloc(strlen(prefix) + 10, 1); str2 = (char*)calloc(strlen(prefix) + 10, 1); str3 = (char*)calloc(strlen(prefix) + 10, 1); { // nucleotide indexing - gzFile fp = xzopen(argv[optind], "r"); + gzFile fp = xzopen(fa, "r"); t = clock(); - fprintf(stderr, "[bwa_index] Pack FASTA... "); + if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 0); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); err_gzclose(fp); } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT @@ -269,7 +278,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command strcpy(str, prefix); strcat(str, ".pac"); strcpy(str2, prefix); strcat(str2, ".bwt"); t = clock(); - fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); + if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); if (algo_type == 2) bwt_bwtgen2(str, str2, block_size); else if (algo_type == 1 || algo_type == 3) { bwt_t *bwt; @@ -277,25 +286,25 @@ int bwa_index(int argc, char *argv[]) // the "index" command bwt_dump_bwt(str2, bwt); bwt_destroy(bwt); } - fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); + if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); } { bwt_t *bwt; strcpy(str, prefix); strcat(str, ".bwt"); t = clock(); - fprintf(stderr, "[bwa_index] Update BWT... "); + if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Update BWT... "); bwt = bwt_restore_bwt(str); bwt_bwtupdate_core(bwt); bwt_dump_bwt(str, bwt); bwt_destroy(bwt); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } { - gzFile fp = xzopen(argv[optind], "r"); + gzFile fp = xzopen(fa, "r"); t = clock(); - fprintf(stderr, "[bwa_index] Pack forward-only FASTA... "); + if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 1); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); err_gzclose(fp); } { @@ -303,13 +312,13 @@ int bwa_index(int argc, char *argv[]) // the "index" command strcpy(str, prefix); strcat(str, ".bwt"); strcpy(str3, prefix); strcat(str3, ".sa"); t = clock(); - fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... "); + if (bwa_verbose >= 3) fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... "); bwt = bwt_restore_bwt(str); bwt_cal_sa(bwt, 32); bwt_dump_sa(str3, bwt); bwt_destroy(bwt); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + if (bwa_verbose >= 3) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } - free(str3); free(str2); free(str); free(prefix); + free(str3); free(str2); free(str); return 0; }