r898: read the index into a single memory block

Prepare for shared memory. Not used now.
This commit is contained in:
Heng Li 2014-10-15 12:27:45 -04:00
parent c44357926f
commit c5e859b49f
4 changed files with 83 additions and 13 deletions

View File

@ -71,11 +71,11 @@ algorithm and setting may vary. The following list gives the recommended
settings:
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
contigs up to a few megabases mapped to a close related reference genome:
contigs up to a few megabases mapped to a closely related reference genome:
bwa mem ref.fa reads.fq > aln.sam
* Illumina single-end reads no longer than ~70bp:
* Illumina single-end reads shorter than ~70bp:
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
@ -83,24 +83,21 @@ settings:
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
* Illumina paired-end reads no longer than ~70bp:
* Illumina paired-end reads shorter than ~70bp:
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
* PacBio subreads to a reference genome:
* PacBio subreads or Oxford Nanopore reads to a reference genome:
bwa mem -x pacbio ref.fa reads.fq > aln.sam
* Oxford Nanopore reads to a reference genome:
bwa mem -x ont2d ref.fa reads.fq > aln.sam
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
errors given longer query sequences as the chance of missing all seeds is small.
As is shown above, with non-default settings, BWA-MEM works with PacBio subreads
with a sequencing error rate as high as ~15%.
As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
reads with a sequencing error rate over 20%.
####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?

74
bwa.c
View File

@ -259,12 +259,80 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
void bwa_idx_destroy(bwaidx_t *idx)
{
if (idx == 0) return;
if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
if (idx->mem == 0) {
if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
} else free((uint8_t*)idx->mem);
free(idx);
}
int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx)
{
int64_t k = 0, x;
int i;
// generate idx->bwt
x = sizeof(bwt_t); idx->bwt = (bwt_t*)(mem + k); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
// generate idx->bns and idx->pac
x = sizeof(bntseq_t); idx->bns = (bntseq_t*)(mem + k); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = (bntann1_t*)(mem + k); k += x;
for (i = 0; i < idx->bns->n_seqs; ++i) {
idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
}
idx->pac = (uint8_t*)(mem + k);
idx->l_mem = k; idx->mem = mem;
return 0;
}
int bwa_idx2mem(bwaidx_t *idx)
{
int i;
int64_t k = 0, x, tmp;
uint8_t *mem = 0;
// copy idx->bwt
mem = malloc(sizeof(bwt_t) + idx->bwt->n_sa * sizeof(bwtint_t));
x = sizeof(bwt_t); memcpy(mem + k, idx->bwt, x); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); memcpy(mem + k, idx->bwt->sa, x); k += x;
free(idx->bwt->sa);
mem = realloc(mem, k + idx->bwt->bwt_size * 4);
x = idx->bwt->bwt_size * 4; memcpy(mem + k, idx->bwt->bwt, x); k += x;
free(idx->bwt->bwt);
free(idx->bwt); idx->bwt = 0;
// copy idx->bns
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
free(idx->bns->ambs);
x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
for (i = 0; i < idx->bns->n_seqs; ++i) {
x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
}
free(idx->bns->anns);
// copy idx->pac
x = idx->bns->l_pac/4+1;
mem = realloc(mem, k + x);
memcpy(mem + k, idx->pac, x); k += x;
free(idx->bns); idx->bns = 0;
free(idx->pac); idx->pac = 0;
return bwa_mem2idx(k, mem, idx);
}
/***********************
* SAM header routines *
***********************/

5
bwa.h
View File

@ -14,6 +14,9 @@ typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
int64_t l_mem;
const uint8_t *mem;
} bwaidx_t;
typedef struct {
@ -39,6 +42,8 @@ extern "C" {
bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);
int bwa_idx2mem(bwaidx_t *idx);
int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx);
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
char *bwa_set_rg(const char *s);

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r896-dirty"
#define PACKAGE_VERSION "0.7.10-r898-dirty"
#endif
int bwa_fa2pac(int argc, char *argv[]);