From 6230f86799f30b836495beea1d81ed2d384c61b4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 17:23:34 -0500 Subject: [PATCH] merged bwtmisc.c to bwtindex.c bwtmisc.c implements routines related to indexing --- Makefile | 2 +- bwtindex.c | 149 +++++++++++++++++++++++++++++++++++++++++++- bwtmisc.c | 179 ----------------------------------------------------- 3 files changed, 147 insertions(+), 183 deletions(-) delete mode 100644 bwtmisc.c diff --git a/Makefile b/Makefile index f14d906..bfed694 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtmisc.o bwtindex.o bwape.o \ + is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa diff --git a/bwtindex.c b/bwtindex.c index c01fa95..298153d 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -36,11 +36,154 @@ #include "main.h" #include "utils.h" -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is); -void bwa_pac_rev_core(const char *fn, const char *fn_rev); +#ifdef _DIVBWT +#include "divsufsort.h" +#endif -int bwa_index(int argc, char *argv[]) +int is_bwt(ubyte_t *T, int n); + +int64_t bwa_seq_len(const char *fn_pac) { + FILE *fp; + int64_t pac_len; + ubyte_t c; + fp = xopen(fn_pac, "rb"); + fseek(fp, -1, SEEK_END); + pac_len = ftell(fp); + fread(&c, 1, 1, fp); + fclose(fp); + return (pac_len - 1) * 4 + (int)c; +} + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) +{ + bwt_t *bwt; + ubyte_t *buf, *buf2; + int i, pac_size; + FILE *fp; + + // initialization + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + bwt->seq_len = bwa_seq_len(fn_pac); + bwt->bwt_size = (bwt->seq_len + 15) >> 4; + fp = xopen(fn_pac, "rb"); + + // prepare sequence + pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); + buf2 = (ubyte_t*)calloc(pac_size, 1); + fread(buf2, 1, pac_size, fp); + fclose(fp); + memset(bwt->L2, 0, 5 * 4); + buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); + for (i = 0; i < bwt->seq_len; ++i) { + buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; + ++bwt->L2[1+buf[i]]; + } + for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; + free(buf2); + + // Burrows-Wheeler Transform + if (use_is) { + bwt->primary = is_bwt(buf, bwt->seq_len); + } else { +#ifdef _DIVBWT + bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); +#else + err_fatal_simple("libdivsufsort is not compiled in."); +#endif + } + bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); + for (i = 0; i < bwt->seq_len; ++i) + bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); + free(buf); + return bwt; +} + +int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required! +{ + bwt_t *bwt; + int c, use_is = 1; + while ((c = getopt(argc, argv, "d")) >= 0) { + switch (c) { + case 'd': use_is = 0; break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); + return 1; + } + bwt = bwt_pac2bwt(argv[optind], use_is); + bwt_dump_bwt(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +void bwt_bwtupdate_core(bwt_t *bwt) +{ + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size + buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + for (i = k = 0; i < bwt->seq_len; ++i) { + if (i % OCC_INTERVAL == 0) { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) + } + if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 + ++c[bwt_B00(bwt, i)]; + } + // the last element + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); + // update bwt + free(bwt->bwt); bwt->bwt = buf; +} + +int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command +{ + bwt_t *bwt; + if (argc < 2) { + fprintf(stderr, "Usage: bwa bwtupdate \n"); + return 1; + } + bwt = bwt_restore_bwt(argv[1]); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(argv[1], bwt); + bwt_destroy(bwt); + return 0; +} + +int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command +{ + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_sa(bwt, sa_intv); + bwt_dump_sa(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +int bwa_index(int argc, char *argv[]) // the "index" command +{ + extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); + char *prefix = 0, *str, *str2, *str3; int c, algo_type = 0, is_64 = 0; clock_t t; diff --git a/bwtmisc.c b/bwtmisc.c deleted file mode 100644 index de96dc2..0000000 --- a/bwtmisc.c +++ /dev/null @@ -1,179 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#include -#include -#include -#include -#include "bntseq.h" -#include "utils.h" -#include "main.h" -#include "bwt.h" - -#ifdef _DIVBWT -#include "divsufsort.h" -#endif - -int is_bwt(ubyte_t *T, int n); - -int64_t bwa_seq_len(const char *fn_pac) -{ - FILE *fp; - int64_t pac_len; - ubyte_t c; - fp = xopen(fn_pac, "rb"); - fseek(fp, -1, SEEK_END); - pac_len = ftell(fp); - fread(&c, 1, 1, fp); - fclose(fp); - return (pac_len - 1) * 4 + (int)c; -} - -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) -{ - bwt_t *bwt; - ubyte_t *buf, *buf2; - int i, pac_size; - FILE *fp; - - // initialization - bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); - bwt->seq_len = bwa_seq_len(fn_pac); - bwt->bwt_size = (bwt->seq_len + 15) >> 4; - fp = xopen(fn_pac, "rb"); - - // prepare sequence - pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); - buf2 = (ubyte_t*)calloc(pac_size, 1); - fread(buf2, 1, pac_size, fp); - fclose(fp); - memset(bwt->L2, 0, 5 * 4); - buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); - for (i = 0; i < bwt->seq_len; ++i) { - buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; - ++bwt->L2[1+buf[i]]; - } - for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; - free(buf2); - - // Burrows-Wheeler Transform - if (use_is) { - bwt->primary = is_bwt(buf, bwt->seq_len); - } else { -#ifdef _DIVBWT - bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); -#else - err_fatal_simple("libdivsufsort is not compiled in."); -#endif - } - bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); - for (i = 0; i < bwt->seq_len; ++i) - bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); - free(buf); - return bwt; -} - -int bwa_pac2bwt(int argc, char *argv[]) -{ - bwt_t *bwt; - int c, use_is = 1; - while ((c = getopt(argc, argv, "d")) >= 0) { - switch (c) { - case 'd': use_is = 0; break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); - return 1; - } - bwt = bwt_pac2bwt(argv[optind], use_is); - bwt_dump_bwt(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -} - -#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) - -void bwt_bwtupdate_core(bwt_t *bwt) -{ - bwtint_t i, k, c[4], n_occ; - uint32_t *buf; - - n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; - bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size - buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt - c[0] = c[1] = c[2] = c[3] = 0; - for (i = k = 0; i < bwt->seq_len; ++i) { - if (i % OCC_INTERVAL == 0) { - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) - } - if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 - ++c[bwt_B00(bwt, i)]; - } - // the last element - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); - // update bwt - free(bwt->bwt); bwt->bwt = buf; -} - -int bwa_bwtupdate(int argc, char *argv[]) -{ - bwt_t *bwt; - if (argc < 2) { - fprintf(stderr, "Usage: bwa bwtupdate \n"); - return 1; - } - bwt = bwt_restore_bwt(argv[1]); - bwt_bwtupdate_core(bwt); - bwt_dump_bwt(argv[1], bwt); - bwt_destroy(bwt); - return 0; -} - -int bwa_bwt2sa(int argc, char *argv[]) -{ - bwt_t *bwt; - int c, sa_intv = 32; - while ((c = getopt(argc, argv, "i:")) >= 0) { - switch (c) { - case 'i': sa_intv = atoi(optarg); break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); - return 1; - } - bwt = bwt_restore_bwt(argv[optind]); - bwt_cal_sa(bwt, sa_intv); - bwt_dump_sa(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -}