diff --git a/README.md b/README.md index 250a009..cfc84c6 100644 --- a/README.md +++ b/README.md @@ -71,11 +71,11 @@ algorithm and setting may vary. The following list gives the recommended settings: * Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly - contigs up to a few megabases mapped to a close related reference genome: + contigs up to a few megabases mapped to a closely related reference genome: bwa mem ref.fa reads.fq > aln.sam -* Illumina single-end reads no longer than ~70bp: +* Illumina single-end reads shorter than ~70bp: bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam @@ -83,24 +83,21 @@ settings: bwa mem ref.fa read1.fq read2.fq > aln-pe.sam -* Illumina paired-end reads no longer than ~70bp: +* Illumina paired-end reads shorter than ~70bp: bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam -* PacBio subreads to a reference genome: +* PacBio subreads or Oxford Nanopore reads to a reference genome: bwa mem -x pacbio ref.fa reads.fq > aln.sam - -* Oxford Nanopore reads to a reference genome: - bwa mem -x ont2d ref.fa reads.fq > aln.sam BWA-MEM is recommended for query sequences longer than ~70bp for a variety of error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with errors given longer query sequences as the chance of missing all seeds is small. -As is shown above, with non-default settings, BWA-MEM works with PacBio subreads -with a sequencing error rate as high as ~15%. +As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore +reads with a sequencing error rate over 20%. ####2. Why does a read appear multiple times in the output SAM? diff --git a/bwa.c b/bwa.c index 495b197..e2e1376 100644 --- a/bwa.c +++ b/bwa.c @@ -259,12 +259,80 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) void bwa_idx_destroy(bwaidx_t *idx) { if (idx == 0) return; - if (idx->bwt) bwt_destroy(idx->bwt); - if (idx->bns) bns_destroy(idx->bns); - if (idx->pac) free(idx->pac); + if (idx->mem == 0) { + if (idx->bwt) bwt_destroy(idx->bwt); + if (idx->bns) bns_destroy(idx->bns); + if (idx->pac) free(idx->pac); + } else free((uint8_t*)idx->mem); free(idx); } +int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx) +{ + int64_t k = 0, x; + int i; + + // generate idx->bwt + x = sizeof(bwt_t); idx->bwt = (bwt_t*)(mem + k); k += x; + x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; + x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x; + + // generate idx->bns and idx->pac + x = sizeof(bntseq_t); idx->bns = (bntseq_t*)(mem + k); k += x; + x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x; + x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = (bntann1_t*)(mem + k); k += x; + for (i = 0; i < idx->bns->n_seqs; ++i) { + idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1; + idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1; + } + idx->pac = (uint8_t*)(mem + k); + + idx->l_mem = k; idx->mem = mem; + return 0; +} + +int bwa_idx2mem(bwaidx_t *idx) +{ + int i; + int64_t k = 0, x, tmp; + uint8_t *mem = 0; + + // copy idx->bwt + mem = malloc(sizeof(bwt_t) + idx->bwt->n_sa * sizeof(bwtint_t)); + x = sizeof(bwt_t); memcpy(mem + k, idx->bwt, x); k += x; + x = idx->bwt->n_sa * sizeof(bwtint_t); memcpy(mem + k, idx->bwt->sa, x); k += x; + free(idx->bwt->sa); + mem = realloc(mem, k + idx->bwt->bwt_size * 4); + x = idx->bwt->bwt_size * 4; memcpy(mem + k, idx->bwt->bwt, x); k += x; + free(idx->bwt->bwt); + free(idx->bwt); idx->bwt = 0; + + // copy idx->bns + tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t); + for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory + tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2; + mem = realloc(mem, k + sizeof(bntseq_t) + tmp); + x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x; + x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x; + free(idx->bns->ambs); + x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x; + for (i = 0; i < idx->bns->n_seqs; ++i) { + x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x; + x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x; + free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno); + } + free(idx->bns->anns); + + // copy idx->pac + x = idx->bns->l_pac/4+1; + mem = realloc(mem, k + x); + memcpy(mem + k, idx->pac, x); k += x; + free(idx->bns); idx->bns = 0; + free(idx->pac); idx->pac = 0; + + return bwa_mem2idx(k, mem, idx); +} + /*********************** * SAM header routines * ***********************/ diff --git a/bwa.h b/bwa.h index bbc2525..7e2c153 100644 --- a/bwa.h +++ b/bwa.h @@ -14,6 +14,9 @@ typedef struct { bwt_t *bwt; // FM-index bntseq_t *bns; // information on the reference sequences uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base + + int64_t l_mem; + const uint8_t *mem; } bwaidx_t; typedef struct { @@ -39,6 +42,8 @@ extern "C" { bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); + int bwa_idx2mem(bwaidx_t *idx); + int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx); void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); char *bwa_set_rg(const char *s); diff --git a/main.c b/main.c index 77cace1..9108c49 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r896-dirty" +#define PACKAGE_VERSION "0.7.10-r898-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);