r898: read the index into a single memory block

Prepare for shared memory. Not used now.
This commit is contained in:
Heng Li 2014-10-15 12:27:45 -04:00
parent c44357926f
commit c5e859b49f
4 changed files with 83 additions and 13 deletions

View File

@ -71,11 +71,11 @@ algorithm and setting may vary. The following list gives the recommended
settings: settings:
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly * Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
contigs up to a few megabases mapped to a close related reference genome: contigs up to a few megabases mapped to a closely related reference genome:
bwa mem ref.fa reads.fq > aln.sam bwa mem ref.fa reads.fq > aln.sam
* Illumina single-end reads no longer than ~70bp: * Illumina single-end reads shorter than ~70bp:
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
@ -83,24 +83,21 @@ settings:
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
* Illumina paired-end reads no longer than ~70bp: * Illumina paired-end reads shorter than ~70bp:
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
* PacBio subreads to a reference genome: * PacBio subreads or Oxford Nanopore reads to a reference genome:
bwa mem -x pacbio ref.fa reads.fq > aln.sam bwa mem -x pacbio ref.fa reads.fq > aln.sam
* Oxford Nanopore reads to a reference genome:
bwa mem -x ont2d ref.fa reads.fq > aln.sam bwa mem -x ont2d ref.fa reads.fq > aln.sam
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
errors given longer query sequences as the chance of missing all seeds is small. errors given longer query sequences as the chance of missing all seeds is small.
As is shown above, with non-default settings, BWA-MEM works with PacBio subreads As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
with a sequencing error rate as high as ~15%. reads with a sequencing error rate over 20%.
####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM? ####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?

74
bwa.c
View File

@ -259,12 +259,80 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
void bwa_idx_destroy(bwaidx_t *idx) void bwa_idx_destroy(bwaidx_t *idx)
{ {
if (idx == 0) return; if (idx == 0) return;
if (idx->bwt) bwt_destroy(idx->bwt); if (idx->mem == 0) {
if (idx->bns) bns_destroy(idx->bns); if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->pac) free(idx->pac); if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
} else free((uint8_t*)idx->mem);
free(idx); free(idx);
} }
int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx)
{
int64_t k = 0, x;
int i;
// generate idx->bwt
x = sizeof(bwt_t); idx->bwt = (bwt_t*)(mem + k); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
// generate idx->bns and idx->pac
x = sizeof(bntseq_t); idx->bns = (bntseq_t*)(mem + k); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = (bntann1_t*)(mem + k); k += x;
for (i = 0; i < idx->bns->n_seqs; ++i) {
idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
}
idx->pac = (uint8_t*)(mem + k);
idx->l_mem = k; idx->mem = mem;
return 0;
}
int bwa_idx2mem(bwaidx_t *idx)
{
int i;
int64_t k = 0, x, tmp;
uint8_t *mem = 0;
// copy idx->bwt
mem = malloc(sizeof(bwt_t) + idx->bwt->n_sa * sizeof(bwtint_t));
x = sizeof(bwt_t); memcpy(mem + k, idx->bwt, x); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); memcpy(mem + k, idx->bwt->sa, x); k += x;
free(idx->bwt->sa);
mem = realloc(mem, k + idx->bwt->bwt_size * 4);
x = idx->bwt->bwt_size * 4; memcpy(mem + k, idx->bwt->bwt, x); k += x;
free(idx->bwt->bwt);
free(idx->bwt); idx->bwt = 0;
// copy idx->bns
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
free(idx->bns->ambs);
x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
for (i = 0; i < idx->bns->n_seqs; ++i) {
x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
}
free(idx->bns->anns);
// copy idx->pac
x = idx->bns->l_pac/4+1;
mem = realloc(mem, k + x);
memcpy(mem + k, idx->pac, x); k += x;
free(idx->bns); idx->bns = 0;
free(idx->pac); idx->pac = 0;
return bwa_mem2idx(k, mem, idx);
}
/*********************** /***********************
* SAM header routines * * SAM header routines *
***********************/ ***********************/

5
bwa.h
View File

@ -14,6 +14,9 @@ typedef struct {
bwt_t *bwt; // FM-index bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
int64_t l_mem;
const uint8_t *mem;
} bwaidx_t; } bwaidx_t;
typedef struct { typedef struct {
@ -39,6 +42,8 @@ extern "C" {
bwaidx_t *bwa_idx_load(const char *hint, int which); bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx); void bwa_idx_destroy(bwaidx_t *idx);
int bwa_idx2mem(bwaidx_t *idx);
int bwa_mem2idx(int64_t l_mem, const uint8_t *mem, bwaidx_t *idx);
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
char *bwa_set_rg(const char *s); char *bwa_set_rg(const char *s);

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h" #include "utils.h"
#ifndef PACKAGE_VERSION #ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r896-dirty" #define PACKAGE_VERSION "0.7.10-r898-dirty"
#endif #endif
int bwa_fa2pac(int argc, char *argv[]); int bwa_fa2pac(int argc, char *argv[]);