merged bwtmisc.c to bwtindex.c
bwtmisc.c implements routines related to indexing
This commit is contained in:
parent
545fb87feb
commit
6230f86799
2
Makefile
2
Makefile
|
|
@ -5,7 +5,7 @@ AR= ar
|
||||||
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
|
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
|
||||||
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o
|
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o
|
||||||
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||||
is.o bwtmisc.o bwtindex.o bwape.o \
|
is.o bwtindex.o bwape.o \
|
||||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||||
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
bwtsw2_chain.o fastmap.o bwtsw2_pair.o
|
||||||
PROG= bwa
|
PROG= bwa
|
||||||
|
|
|
||||||
149
bwtindex.c
149
bwtindex.c
|
|
@ -36,11 +36,154 @@
|
||||||
#include "main.h"
|
#include "main.h"
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
|
#ifdef _DIVBWT
|
||||||
void bwa_pac_rev_core(const char *fn, const char *fn_rev);
|
#include "divsufsort.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
int bwa_index(int argc, char *argv[])
|
int is_bwt(ubyte_t *T, int n);
|
||||||
|
|
||||||
|
int64_t bwa_seq_len(const char *fn_pac)
|
||||||
{
|
{
|
||||||
|
FILE *fp;
|
||||||
|
int64_t pac_len;
|
||||||
|
ubyte_t c;
|
||||||
|
fp = xopen(fn_pac, "rb");
|
||||||
|
fseek(fp, -1, SEEK_END);
|
||||||
|
pac_len = ftell(fp);
|
||||||
|
fread(&c, 1, 1, fp);
|
||||||
|
fclose(fp);
|
||||||
|
return (pac_len - 1) * 4 + (int)c;
|
||||||
|
}
|
||||||
|
|
||||||
|
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
||||||
|
{
|
||||||
|
bwt_t *bwt;
|
||||||
|
ubyte_t *buf, *buf2;
|
||||||
|
int i, pac_size;
|
||||||
|
FILE *fp;
|
||||||
|
|
||||||
|
// initialization
|
||||||
|
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
|
||||||
|
bwt->seq_len = bwa_seq_len(fn_pac);
|
||||||
|
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
|
||||||
|
fp = xopen(fn_pac, "rb");
|
||||||
|
|
||||||
|
// prepare sequence
|
||||||
|
pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
|
||||||
|
buf2 = (ubyte_t*)calloc(pac_size, 1);
|
||||||
|
fread(buf2, 1, pac_size, fp);
|
||||||
|
fclose(fp);
|
||||||
|
memset(bwt->L2, 0, 5 * 4);
|
||||||
|
buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
|
||||||
|
for (i = 0; i < bwt->seq_len; ++i) {
|
||||||
|
buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
|
||||||
|
++bwt->L2[1+buf[i]];
|
||||||
|
}
|
||||||
|
for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
|
||||||
|
free(buf2);
|
||||||
|
|
||||||
|
// Burrows-Wheeler Transform
|
||||||
|
if (use_is) {
|
||||||
|
bwt->primary = is_bwt(buf, bwt->seq_len);
|
||||||
|
} else {
|
||||||
|
#ifdef _DIVBWT
|
||||||
|
bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
|
||||||
|
#else
|
||||||
|
err_fatal_simple("libdivsufsort is not compiled in.");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
|
||||||
|
for (i = 0; i < bwt->seq_len; ++i)
|
||||||
|
bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
|
||||||
|
free(buf);
|
||||||
|
return bwt;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required!
|
||||||
|
{
|
||||||
|
bwt_t *bwt;
|
||||||
|
int c, use_is = 1;
|
||||||
|
while ((c = getopt(argc, argv, "d")) >= 0) {
|
||||||
|
switch (c) {
|
||||||
|
case 'd': use_is = 0; break;
|
||||||
|
default: return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (optind + 2 > argc) {
|
||||||
|
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
bwt = bwt_pac2bwt(argv[optind], use_is);
|
||||||
|
bwt_dump_bwt(argv[optind+1], bwt);
|
||||||
|
bwt_destroy(bwt);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
||||||
|
|
||||||
|
void bwt_bwtupdate_core(bwt_t *bwt)
|
||||||
|
{
|
||||||
|
bwtint_t i, k, c[4], n_occ;
|
||||||
|
uint32_t *buf;
|
||||||
|
|
||||||
|
n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
|
||||||
|
bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
|
||||||
|
buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
|
||||||
|
c[0] = c[1] = c[2] = c[3] = 0;
|
||||||
|
for (i = k = 0; i < bwt->seq_len; ++i) {
|
||||||
|
if (i % OCC_INTERVAL == 0) {
|
||||||
|
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
||||||
|
k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
|
||||||
|
}
|
||||||
|
if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
|
||||||
|
++c[bwt_B00(bwt, i)];
|
||||||
|
}
|
||||||
|
// the last element
|
||||||
|
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
||||||
|
xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
|
||||||
|
// update bwt
|
||||||
|
free(bwt->bwt); bwt->bwt = buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
|
||||||
|
{
|
||||||
|
bwt_t *bwt;
|
||||||
|
if (argc < 2) {
|
||||||
|
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
bwt = bwt_restore_bwt(argv[1]);
|
||||||
|
bwt_bwtupdate_core(bwt);
|
||||||
|
bwt_dump_bwt(argv[1], bwt);
|
||||||
|
bwt_destroy(bwt);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
|
||||||
|
{
|
||||||
|
bwt_t *bwt;
|
||||||
|
int c, sa_intv = 32;
|
||||||
|
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
||||||
|
switch (c) {
|
||||||
|
case 'i': sa_intv = atoi(optarg); break;
|
||||||
|
default: return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (optind + 2 > argc) {
|
||||||
|
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
bwt = bwt_restore_bwt(argv[optind]);
|
||||||
|
bwt_cal_sa(bwt, sa_intv);
|
||||||
|
bwt_dump_sa(argv[optind+1], bwt);
|
||||||
|
bwt_destroy(bwt);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bwa_index(int argc, char *argv[]) // the "index" command
|
||||||
|
{
|
||||||
|
extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
|
||||||
|
|
||||||
char *prefix = 0, *str, *str2, *str3;
|
char *prefix = 0, *str, *str2, *str3;
|
||||||
int c, algo_type = 0, is_64 = 0;
|
int c, algo_type = 0, is_64 = 0;
|
||||||
clock_t t;
|
clock_t t;
|
||||||
|
|
|
||||||
179
bwtmisc.c
179
bwtmisc.c
|
|
@ -1,179 +0,0 @@
|
||||||
/* The MIT License
|
|
||||||
|
|
||||||
Copyright (c) 2008 Genome Research Ltd (GRL).
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
|
||||||
a copy of this software and associated documentation files (the
|
|
||||||
"Software"), to deal in the Software without restriction, including
|
|
||||||
without limitation the rights to use, copy, modify, merge, publish,
|
|
||||||
distribute, sublicense, and/or sell copies of the Software, and to
|
|
||||||
permit persons to whom the Software is furnished to do so, subject to
|
|
||||||
the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be
|
|
||||||
included in all copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
||||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
||||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
||||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
||||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include "bntseq.h"
|
|
||||||
#include "utils.h"
|
|
||||||
#include "main.h"
|
|
||||||
#include "bwt.h"
|
|
||||||
|
|
||||||
#ifdef _DIVBWT
|
|
||||||
#include "divsufsort.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int is_bwt(ubyte_t *T, int n);
|
|
||||||
|
|
||||||
int64_t bwa_seq_len(const char *fn_pac)
|
|
||||||
{
|
|
||||||
FILE *fp;
|
|
||||||
int64_t pac_len;
|
|
||||||
ubyte_t c;
|
|
||||||
fp = xopen(fn_pac, "rb");
|
|
||||||
fseek(fp, -1, SEEK_END);
|
|
||||||
pac_len = ftell(fp);
|
|
||||||
fread(&c, 1, 1, fp);
|
|
||||||
fclose(fp);
|
|
||||||
return (pac_len - 1) * 4 + (int)c;
|
|
||||||
}
|
|
||||||
|
|
||||||
bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
|
|
||||||
{
|
|
||||||
bwt_t *bwt;
|
|
||||||
ubyte_t *buf, *buf2;
|
|
||||||
int i, pac_size;
|
|
||||||
FILE *fp;
|
|
||||||
|
|
||||||
// initialization
|
|
||||||
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
|
|
||||||
bwt->seq_len = bwa_seq_len(fn_pac);
|
|
||||||
bwt->bwt_size = (bwt->seq_len + 15) >> 4;
|
|
||||||
fp = xopen(fn_pac, "rb");
|
|
||||||
|
|
||||||
// prepare sequence
|
|
||||||
pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
|
|
||||||
buf2 = (ubyte_t*)calloc(pac_size, 1);
|
|
||||||
fread(buf2, 1, pac_size, fp);
|
|
||||||
fclose(fp);
|
|
||||||
memset(bwt->L2, 0, 5 * 4);
|
|
||||||
buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
|
|
||||||
for (i = 0; i < bwt->seq_len; ++i) {
|
|
||||||
buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
|
|
||||||
++bwt->L2[1+buf[i]];
|
|
||||||
}
|
|
||||||
for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
|
|
||||||
free(buf2);
|
|
||||||
|
|
||||||
// Burrows-Wheeler Transform
|
|
||||||
if (use_is) {
|
|
||||||
bwt->primary = is_bwt(buf, bwt->seq_len);
|
|
||||||
} else {
|
|
||||||
#ifdef _DIVBWT
|
|
||||||
bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
|
|
||||||
#else
|
|
||||||
err_fatal_simple("libdivsufsort is not compiled in.");
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
|
|
||||||
for (i = 0; i < bwt->seq_len; ++i)
|
|
||||||
bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
|
|
||||||
free(buf);
|
|
||||||
return bwt;
|
|
||||||
}
|
|
||||||
|
|
||||||
int bwa_pac2bwt(int argc, char *argv[])
|
|
||||||
{
|
|
||||||
bwt_t *bwt;
|
|
||||||
int c, use_is = 1;
|
|
||||||
while ((c = getopt(argc, argv, "d")) >= 0) {
|
|
||||||
switch (c) {
|
|
||||||
case 'd': use_is = 0; break;
|
|
||||||
default: return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (optind + 2 > argc) {
|
|
||||||
fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
bwt = bwt_pac2bwt(argv[optind], use_is);
|
|
||||||
bwt_dump_bwt(argv[optind+1], bwt);
|
|
||||||
bwt_destroy(bwt);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
|
|
||||||
|
|
||||||
void bwt_bwtupdate_core(bwt_t *bwt)
|
|
||||||
{
|
|
||||||
bwtint_t i, k, c[4], n_occ;
|
|
||||||
uint32_t *buf;
|
|
||||||
|
|
||||||
n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
|
|
||||||
bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
|
|
||||||
buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
|
|
||||||
c[0] = c[1] = c[2] = c[3] = 0;
|
|
||||||
for (i = k = 0; i < bwt->seq_len; ++i) {
|
|
||||||
if (i % OCC_INTERVAL == 0) {
|
|
||||||
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
|
||||||
k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
|
|
||||||
}
|
|
||||||
if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
|
|
||||||
++c[bwt_B00(bwt, i)];
|
|
||||||
}
|
|
||||||
// the last element
|
|
||||||
memcpy(buf + k, c, sizeof(bwtint_t) * 4);
|
|
||||||
xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
|
|
||||||
// update bwt
|
|
||||||
free(bwt->bwt); bwt->bwt = buf;
|
|
||||||
}
|
|
||||||
|
|
||||||
int bwa_bwtupdate(int argc, char *argv[])
|
|
||||||
{
|
|
||||||
bwt_t *bwt;
|
|
||||||
if (argc < 2) {
|
|
||||||
fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
bwt = bwt_restore_bwt(argv[1]);
|
|
||||||
bwt_bwtupdate_core(bwt);
|
|
||||||
bwt_dump_bwt(argv[1], bwt);
|
|
||||||
bwt_destroy(bwt);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int bwa_bwt2sa(int argc, char *argv[])
|
|
||||||
{
|
|
||||||
bwt_t *bwt;
|
|
||||||
int c, sa_intv = 32;
|
|
||||||
while ((c = getopt(argc, argv, "i:")) >= 0) {
|
|
||||||
switch (c) {
|
|
||||||
case 'i': sa_intv = atoi(optarg); break;
|
|
||||||
default: return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (optind + 2 > argc) {
|
|
||||||
fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
bwt = bwt_restore_bwt(argv[optind]);
|
|
||||||
bwt_cal_sa(bwt, sa_intv);
|
|
||||||
bwt_dump_sa(argv[optind+1], bwt);
|
|
||||||
bwt_destroy(bwt);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
Loading…
Reference in New Issue