From 72563c38f383b4d3df444caaf5cfb8327ab629a3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 9 Jun 2011 17:33:25 -0400 Subject: [PATCH] automatically choose the algorithm for BWT --- bntseq.c | 5 ++++- bntseq.h | 2 +- bwtindex.c | 14 ++++++++++---- main.c | 2 +- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/bntseq.c b/bntseq.c index 86888c1..21ba91f 100644 --- a/bntseq.c +++ b/bntseq.c @@ -163,7 +163,7 @@ void bns_destroy(bntseq_t *bns) } } -void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) +int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) { kseq_t *seq; char name[1024]; @@ -172,6 +172,7 @@ void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) int l_buf; unsigned char buf[0x10000]; int32_t m_seqs, m_holes, l, i; + int64_t ret = -1; FILE *fp; // initialization @@ -235,6 +236,7 @@ void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) bns->l_pac += seq->seq.l; } xassert(bns->l_pac, "zero length sequence."); + ret = bns->l_pac; { // finalize .pac file ubyte_t ct; fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp); @@ -251,6 +253,7 @@ void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) bns_dump(bns, prefix); bns_destroy(bns); kseq_destroy(seq); + return ret; } int bwa_fa2pac(int argc, char *argv[]) diff --git a/bntseq.h b/bntseq.h index 21b831e..189e017 100644 --- a/bntseq.h +++ b/bntseq.h @@ -70,7 +70,7 @@ extern "C" { bntseq_t *bns_restore(const char *prefix); bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); - void bns_fasta2bntseq(gzFile fp_fa, const char *prefix); + int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix); int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq); #ifdef __cplusplus diff --git a/bwtindex.c b/bwtindex.c index 68792f7..c752a2f 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -42,12 +42,13 @@ void bwa_pac_rev_core(const char *fn, const char *fn_rev); int bwa_index(int argc, char *argv[]) { char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 3, is_color = 0; + int c, algo_type = 0, is_color = 0; clock_t t; + int64_t l_pac; while ((c = getopt(argc, argv, "ca:p:")) >= 0) { switch (c) { - case 'a': + case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2; else if (strcmp(optarg, "is") == 0) algo_type = 3; @@ -79,7 +80,7 @@ int bwa_index(int argc, char *argv[]) gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... "); - bns_fasta2bntseq(fp, prefix); + l_pac = bns_fasta2bntseq(fp, prefix); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); } else { // color indexing @@ -87,7 +88,7 @@ int bwa_index(int argc, char *argv[]) strcat(strcpy(str, prefix), ".nt"); t = clock(); fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... "); - bns_fasta2bntseq(fp, str); + l_pac = bns_fasta2bntseq(fp, str); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); { @@ -99,6 +100,11 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } } + if (l_pac > 0xffffffffu) { + fprintf(stderr, "[%s] BWA only works with reference sequences shorter than 4GB in total. Abort!\n", __func__); + return 1; + } + if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { strcpy(str, prefix); strcat(str, ".pac"); strcpy(str2, prefix); strcat(str2, ".rpac"); diff --git a/main.c b/main.c index b22552e..0e545a7 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9-r19-dev" +#define PACKAGE_VERSION "0.5.9-r20-dev" #endif static int usage()