diff --git a/Makefile b/Makefile
index 9bccbf6..d0d50cf 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ CFLAGS= -g -Wall -Wno-unused-function -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
-LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
+LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwashm.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
@@ -14,6 +14,10 @@ INCLUDES=
LIBS= -lm -lz -lpthread
SUBDIRS= .
+ifeq ($(shell uname -s),Linux)
+ LIBS += -lrt
+endif
+
.SUFFIXES:.c .o .cc
.c.o:
@@ -52,6 +56,7 @@ bwape.o: ksw.h khash.h
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
bwase.o: bwa.h ksw.h
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
+bwashm.o: bwa.h bntseq.h bwt.h
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
bwt_gen.o: QSufSort.h malloc_wrap.h
bwt_lite.o: bwt_lite.h malloc_wrap.h
@@ -66,7 +71,6 @@ bwtsw2_core.o: khash.h ksort.h
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
bwtsw2_pair.o: malloc_wrap.h ksw.h
-cutvar.o: bwa.h bntseq.h bwt.h kvec.h malloc_wrap.h
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
is.o: malloc_wrap.h
diff --git a/README-alt.md b/README-alt.md
new file mode 100644
index 0000000..8dac185
--- /dev/null
+++ b/README-alt.md
@@ -0,0 +1,123 @@
+## Getting Started
+
+Since version 0.7.11, BWA-MEM supports read mapping against a reference genome
+with long alternative haplotypes present in separate ALT contigs. To use the
+ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the
+SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment is
+available from the [BWA resource bundle for GRCh38][res].
+
+#### Option 1: Mapping to the official GRCh38 with ALT contigs
+
+Construct the index:
+```sh
+wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
+gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
+mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa
+bwa index hs38a.fa
+cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt
+```
+
+Perform mapping:
+```sh
+bwa mem hs38a.fa read1.fq read2.fq \
+ | samblaster \
+ | bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \
+ | samtools view -bS - > aln.unsrt.bam
+```
+For short reads, the postprocessing script `bwa-postalt.js` runs at about the
+same speed as BAM compression.
+
+#### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes
+
+Construct the index:
+```sh
+cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa
+bwa index hs38d4.fa
+cp bwa-hs38-res/hs38d4.fa.alt .
+```
+Perform mapping:
+```sh
+bwa mem -g.8 hs38d4.fa read1.fq read2.fq \
+ | samblaster \
+ | bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
+ | samtools view -bS - > aln.unsrt.bam
+```
+This command line generates `postinfo.ctw` which loosely evaluates the presence
+of an ALT contig with an empirical score at the last column.
+
+## Background
+
+GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions.
+However, sequences that are highly diverged from the primary assembly only
+contribute a few million bp. Most subsequences of ALT contigs are highly similar
+or identical to the primary assembly. If we align sequence reads to GRCh38+ALT
+treating ALT equal to the primary assembly, we will get many reads with zero
+mapping quality and lose variants on them. It is crucial to make the mapper
+aware of ALTs.
+
+BWA-MEM is designed to minimize the interference of ALT contigs such that on the
+primary assembly, the ALT-aware alignment is highly similar to the alignment
+without using ALT contigs in the index. This design choice makes it almost
+always safe to map reads to GRCh38+ALT. Although we don't know yet how much
+variations on ALT contigs contribute to phenotypes, we would not get the answer
+without mapping large cohorts to these extra sequences. We hope our current
+implementation encourages researchers to use ALT contigs soon and often.
+
+## Methods
+
+As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
+postprocessing.
+
+#### Step 1: BWA-MEM mapping
+
+At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring
+the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
+depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
+alignments and assigns mapQ following these two rules:
+
+ * The original mapQ of a non-ALT hit is computed across non-ALT hits only.
+ The reported mapQ of an ALT hit is computed across all hits.
+
+ * An ALT hit is only reported if its score is strictly better than all
+ overlapping non-ALT hits. A reported ALT hit is flagged with 0x800
+ (supplementary) unless there are no non-ALT hits.
+
+When option `-g FLOAT` is in use (which is the default), a third rule kicks in:
+
+ * The mapQ of a non-ALT hit is reduced to zero if its score is less than FLOAT
+ times the score of an overlapping ALT hit. In this case, the original mapQ is
+ moved to the `om` tag.
+
+If we don't care about ALT hits, we may actually skip postprocessing (step 2).
+Nonetheless, postprocessing is recommended as it improves mapQ and gives more
+information about ALT hits.
+
+#### Step 2: Postprocessing
+
+Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
+potential hits reported in the XA tag, lifts ALT hits to the chromosomal
+positions using the ALT-to-ref alignment, groups them after lifting and then
+reassigns mapQ based on the best scoring hit in each group with all the hits in
+a group get the same mapQ. Knowing the ALT-to-ref alignment, this script can
+greatly improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits.
+
+The script also measures the presence of each ALT contig. For a group of
+overlapping ALT contigs c_1, ..., c_m, the weight for c_k equals `\frac{\sum_j
+P(c_k|r_j)}{\sum_j\max_i P(c_i|r_j)}`, where `P(c_k|r)=\frac{pow(4,s_k)}{\sum_i
+pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a
+Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the
+option 2 above.
+
+## Problems and Future Development
+
+There are some uncertainties about ALT mappings - we are not sure whether they
+help biological discovery and don't know the best way to analyze them. Without
+clear demand from downstream analyses, it is very difficult to design the
+optimal mapping strategy. The current BWA-MEM method is just a start. If it
+turns out to be useful in research, we will probably rewrite bwa-postalt.js in C
+for performance; if not, we will try new designs. It is also possible that we
+may make breakthrough on the representation of multiple genomes, in which case,
+we can even get rid of ALT contigs once for all.
+
+[res]: https://sourceforge.net/projects/bio-bwa/files/
+[sb]: https://github.com/GregoryFaust/samblaster
diff --git a/README.md b/README.md
index 3472e91..cfc84c6 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,6 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0)
5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref)
-6. [How to map sequences to GRCh38 with ALT contigs?](#h38)
####1. What types of data does BWA work with?
@@ -72,11 +71,11 @@ algorithm and setting may vary. The following list gives the recommended
settings:
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
- contigs up to a few megabases mapped to a close related reference genome:
+ contigs up to a few megabases mapped to a closely related reference genome:
bwa mem ref.fa reads.fq > aln.sam
-* Illumina single-end reads no longer than ~70bp:
+* Illumina single-end reads shorter than ~70bp:
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
@@ -84,20 +83,21 @@ settings:
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
-* Illumina paired-end reads no longer than ~70bp:
+* Illumina paired-end reads shorter than ~70bp:
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
-* PacBio subreads to a reference genome:
+* PacBio subreads or Oxford Nanopore reads to a reference genome:
bwa mem -x pacbio ref.fa reads.fq > aln.sam
+ bwa mem -x ont2d ref.fa reads.fq > aln.sam
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
errors given longer query sequences as the chance of missing all seeds is small.
-As is shown above, with non-default settings, BWA-MEM works with PacBio subreads
-with a sequencing error rate as high as ~15%.
+As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
+reads with a sequencing error rate over 20%.
####2. Why does a read appear multiple times in the output SAM?
diff --git a/bntseq.c b/bntseq.c
index 93cdba1..465e383 100644
--- a/bntseq.c
+++ b/bntseq.c
@@ -179,17 +179,25 @@ bntseq_t *bns_restore(const char *prefix)
if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
char str[1024];
khash_t(str) *h;
- int i, absent;
+ int c, i, absent;
khint_t k;
h = kh_init(str);
for (i = 0; i < bns->n_seqs; ++i) {
k = kh_put(str, h, bns->anns[i].name, &absent);
kh_val(h, k) = i;
}
- while (fscanf(fp, "%s", str) == 1) {
- k = kh_get(str, h, str);
- if (k != kh_end(h))
- bns->anns[kh_val(h, k)].is_alt = 1;
+ i = 0;
+ while ((c = fgetc(fp)) != EOF) {
+ if (c == '\t' || c == '\n' || c == '\r') {
+ str[i] = 0;
+ if (str[0] != '@') {
+ k = kh_get(str, h, str);
+ if (k != kh_end(h))
+ bns->anns[kh_val(h, k)].is_alt = 1;
+ }
+ while (c != '\n' && c != EOF) c = fgetc(fp);
+ i = 0;
+ } else str[i++] = c; // FIXME: potential segfault here
}
kh_destroy(str, h);
fclose(fp);
@@ -226,7 +234,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_
}
p = bns->anns + bns->n_seqs;
p->name = strdup((char*)seq->name.s);
- p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
+ p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
p->gi = 0; p->len = seq->seq.l;
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
p->n_ambs = 0;
@@ -358,8 +366,9 @@ int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
{
int is_rev, rid_b, rid_e;
if (rb < bns->l_pac && re > bns->l_pac) return -2;
+ assert(rb <= re);
rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
- rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1);
+ rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b;
return rid_b == rid_e? rid_b : -1;
}
diff --git a/bwa-postalt.js b/bwa-postalt.js
index 5e6b7c7..411cfdd 100644
--- a/bwa-postalt.js
+++ b/bwa-postalt.js
@@ -52,7 +52,7 @@ var getopt = function(args, ostr) {
return optopt;
}
-// print an object in a format similar to JSON. For debugging.
+// print an object in a format similar to JSON. For debugging only.
function obj2str(o)
{
if (typeof(o) != 'object') {
@@ -203,54 +203,31 @@ function parse_hit(s, opt)
return h;
}
-// read the ALT-to-REF alignment and generate the index
-function read_ALT_sam(fn)
-{
- var intv = {};
- var file = new File(fn);
- var buf = new Bytes();
- while (file.readline(buf) >= 0) {
- var line = buf.toString();
- var t = line.split("\t");
- if (line.charAt(0) == '@') continue;
- var flag = parseInt(t[1]);
- var m, cigar = [], l_qaln = 0, l_qclip = 0;
- while ((m = re_cigar.exec(t[5])) != null) {
- var l = parseInt(m[1]);
- cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
- if (m[2] == 'M' || m[2] == 'I') l_qaln += l;
- else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
- }
- var j = flag&16? cigar.length-1 : 0;
- var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
- if (intv[t[0]] == null) intv[t[0]] = [];
- intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]);
- //print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar);
- }
- buf.destroy();
- file.close();
- // create index for intervals on ALT contigs
- var idx = {};
- for (var ctg in intv)
- idx[ctg] = intv_ovlp(intv[ctg]);
- return idx;
-}
-
function bwa_postalt(args)
{
- var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true };
+ var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false };
- while ((c = getopt(args, 'pqv:')) != null) {
+ while ((c = getopt(args, 'Pqev:p:')) != null) {
if (c == 'v') opt.verbose = parseInt(getopt.arg);
- else if (c == 'p') opt.show_pri = true;
+ else if (c == 'p') opt.pre = getopt.arg;
+ else if (c == 'P') opt.show_pri = true;
else if (c == 'q') opt.recover_maq = false;
+ else if (c == 'e') opt.show_ev = true;
+ }
+
+ if (opt.show_ev && opt.pre == null) {
+ warn("ERROR: option '-p' must be specified if '-e' is applied.");
+ exit(1);
}
if (args.length == getopt.ind) {
print("");
- print("Usage: k8 bwa-postalt.js [-p] [aln.sam]\n");
- print("Options: -p output lifted non-ALT hit in a SAM line (for ALT-unware alignments)");
- print(" -q don't recover mapQ for non-ALTs hit overlapping lifted ALT");
+ print("Usage: k8 bwa-postalt.js [options] [aln.sam]\n");
+ print("Options: -p STR prefix of file(s) for additional information [null]");
+ print(" PREFIX.ctw - weight of each ALT contig");
+ print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)");
+ print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT");
+ print(" -e show reads supporting ALT contigs into file PREFIX.evi");
print("");
print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to");
print(" the primary assembly, groups them and then estimates mapQ across groups. If");
@@ -262,12 +239,56 @@ function bwa_postalt(args)
exit(1);
}
- var file, buf = new Bytes();
+ var fp_evi = opt.show_ev && opt.pre? new File(opt.pre + '.evi', "w") : null;
var aux = new Bytes(); // used for reverse and reverse complement
- var idx = read_ALT_sam(args[getopt.ind]);
- var buf2 = [];
+ var buf = new Bytes();
+
+ // read ALT-to-REF alignment
+ var intv_alt = {}, intv_pri = {}, idx_un = {};
+ var file = new File(args[getopt.ind]);
+ while (file.readline(buf) >= 0) {
+ var line = buf.toString();
+ if (line.charAt(0) == '@') continue;
+ var t = line.split("\t");
+ if (t.length < 11) continue; // incomplete lines
+ var pos = parseInt(t[3]) - 1;
+ var flag = parseInt(t[1]);
+ if ((flag&4) || t[2] == '*') {
+ idx_un[t[0]] = true;
+ continue;
+ }
+ var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
+ while ((m = re_cigar.exec(t[5])) != null) {
+ var l = parseInt(m[1]);
+ cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
+ if (m[2] == 'M') l_qaln += l, l_tlen += l;
+ else if (m[2] == 'I') l_qaln += l;
+ else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
+ else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
+ }
+ var j = flag&16? cigar.length-1 : 0;
+ var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
+ if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
+ intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
+ if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
+ intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
+ }
+ file.close();
+ var idx_alt = {}, idx_pri = {};
+ for (var ctg in intv_alt)
+ idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
+ for (var ctg in intv_pri)
+ idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
+
+ // initialize the list of ALT contigs
+ var weight_alt = [];
+ for (var ctg in idx_alt)
+ weight_alt[ctg] = [0, 0, 0, 0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]];
+ for (var ctg in idx_un)
+ weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0];
// process SAM
+ var buf2 = [];
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
@@ -286,36 +307,49 @@ function bwa_postalt(args)
buf2 = [];
}
- if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) {
+ // skip unmapped lines
+ if (t[1]&4) {
buf2.push(t);
continue;
}
- var XA_strs = m[1].split(";");
// parse the reported hit
- var hits = [];
var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
var flag = t[1];
var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
- if (h.hard) { // don't process lines with hard clips
+ if (h.hard) { // the following does not work with hard clipped alignments
buf2.push(t);
continue;
}
- hits.push(h);
+ var hits = [h];
// parse hits in the XA tag
- for (var i = 0; i < XA_strs.length; ++i)
- if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
- hits.push(parse_hit(XA_strs[i].split(","), opt));
+ if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) {
+ var XA_strs = m[1].split(";");
+ for (var i = 0; i < XA_strs.length; ++i)
+ if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
+ hits.push(parse_hit(XA_strs[i].split(","), opt));
+ }
+
+ // check if there are ALT hits
+ var has_alt = false;
+ for (var i = 0; i < hits.length; ++i)
+ if (weight_alt[hits[i].ctg] != null) {
+ has_alt = true;
+ break;
+ }
+ if (!has_alt) {
+ buf2.push(t);
+ continue;
+ }
// lift mapping positions to the primary assembly
- var n_lifted = 0, n_rpt_lifted = 0;
+ var n_rpt_lifted = 0, rpt_lifted = null;
for (var i = 0; i < hits.length; ++i) {
- var h = hits[i];
+ var a, h = hits[i];
- if (idx[h.ctg] == null) continue;
- var a = idx[h.ctg](h.start, h.end);
- if (a == null || a.length == 0) continue;
+ if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0)
+ continue;
// find the approximate position on the primary assembly
var lifted = [];
@@ -333,36 +367,46 @@ function bwa_postalt(args)
lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
if (i == 0) ++n_rpt_lifted;
}
- if (lifted.length) ++n_lifted, hits[i].lifted = lifted;
- }
- if (n_lifted == 0) {
- buf2.push(t);
- continue;
+ if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0);
+ if (lifted.length) hits[i].lifted = lifted;
}
- // group hits based on the lifted positions on the primary assembly
+ // prepare for hits grouping
for (var i = 0; i < hits.length; ++i) { // set keys for sorting
- if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used
+ if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used
hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
hits[i].i = i; // keep the original index
}
- hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
- var last_chr = null, end = 0, g = -1;
- for (var i = 0; i < hits.length; ++i) {
- if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
- else if (hits[i].pstart >= end) ++g;
- hits[i].g = g;
- end = end > hits[i].pend? end : hits[i].pend;
+
+ // group hits based on the lifted positions on non-ALT sequences
+ if (hits.length > 1) {
+ hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
+ var last_chr = null, end = 0, g = -1;
+ for (var i = 0; i < hits.length; ++i) {
+ if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
+ else if (hits[i].pstart >= end) ++g;
+ hits[i].g = g;
+ end = end > hits[i].pend? end : hits[i].pend;
+ }
+ } else hits[0].g = 0;
+
+ // find the index and group id of the reported hit; find the size of the reported group
+ var reported_g = null, reported_i = null, n_group0 = 0;
+ if (hits.length > 1) {
+ for (var i = 0; i < hits.length; ++i)
+ if (hits[i].i == 0)
+ reported_g = hits[i].g, reported_i = i;
+ for (var i = 0; i < hits.length; ++i)
+ if (hits[i].g == reported_g)
+ ++n_group0;
+ } else {
+ if (weight_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
+ buf2.push(t);
+ continue;
+ }
+ reported_g = reported_i = 0, n_group0 = 1;
}
- var reported_g = null, reported_i = null;
- for (var i = 0; i < hits.length; ++i)
- if (hits[i].i == 0)
- reported_g = hits[i].g, reported_i = i;
- var n_group0 = 0; // #hits overlapping the reported hit
- for (var i = 0; i < hits.length; ++i)
- if (hits[i].g == reported_g)
- ++n_group0;
// re-estimate mapping quality if necessary
var mapQ, ori_mapQ = t[4];
@@ -379,12 +423,65 @@ function bwa_postalt(args)
mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
} else mapQ = 0;
mapQ = mapQ < 60? mapQ : 60;
- mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
+ if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ;
+ else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
} else mapQ = t[4];
+ // ALT genotyping
+ if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) {
+ // collect all overlapping ALT contigs
+ var hits2 = [];
+ for (var i = 0; i < hits.length; ++i) {
+ var h = hits[i];
+ if (h.g == reported_g)
+ hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score, h.NM]);
+ }
+ var start = hits2[0][1], end = hits2[0][2];
+ for (var i = 1; i < hits2.length; ++i)
+ end = end > hits2[i][2]? end : hits2[i][2];
+ var alts = {};
+ for (var i = 0; i < hits2.length; ++i)
+ if (weight_alt[hits2[i][3]] != null)
+ alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]];
+ if (idx_pri[hits2[0][0]] != null) { // add other unreported hits
+ var ovlp = idx_pri[hits2[0][0]](start, end);
+ for (var i = 0; i < ovlp.length; ++i)
+ if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null)
+ alts[ovlp[i][2]] = [0, 0];
+ }
+
+ // add weight to each ALT contig
+ var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30, max_nm = -1;
+ for (var ctg in alts)
+ alt_arr.push([ctg, alts[ctg][0], 0, alts[ctg][1]]);
+ for (var i = 0; i < alt_arr.length; ++i) {
+ if (max_sc < alt_arr[i][1])
+ max_sc = alt_arr[i][1], max_i = i;
+ min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1];
+ var nm = alt_arr[i][1] > 0? alt_arr[i][3] : opt.max_nm_sc;
+ max_nm = max_nm > nm? max_nm : nm;
+ }
+ if (max_nm > opt.max_nm_sc) max_nm = opt.max_nm_sc;
+ if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) {
+ for (var i = 0; i < alt_arr.length; ++i)
+ sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc)));
+ for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum;
+ for (var i = 0; i < alt_arr.length; ++i) {
+ var e = [alt_arr[i][0], 1, alt_arr[max_i][2], alt_arr[i][2], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]];
+ var w = weight_alt[e[0]];
+ for (var j = 0; j < 6; ++j) w[j] += e[j+1];
+ if (fp_evi) {
+ e[2] = e[2].toFixed(3); e[3] = e[3].toFixed(3);
+ e[5] = e[5].toFixed(3); e[6] = e[6].toFixed(3);
+ fp_evi.write(t[0] + '/' + (t[1]>>6&3) + '\t' + e.join("\t") + '\n');
+ }
+ }
+ }
+ }
+
// check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality
if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) {
- var l = lifted[0];
+ var l = rpt_lifted;
for (var i = 0; i < buf2.length; ++i) {
var s = buf2[i];
if (l[0] != s[2]) continue; // different chr
@@ -423,6 +520,7 @@ function bwa_postalt(args)
need_rev = true;
}
if (need_rev) { // reverse and reverse complement
+ aux.length = 0;
aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
aux.set(t[10],0); aux.reverse(); rq = aux.toString();
}
@@ -438,14 +536,16 @@ function bwa_postalt(args)
for (var i = 0; i < hits.length; ++i) {
if (opt.verbose >= 5) print(obj2str(hits[i]));
if (hits[i].g != reported_g || i == reported_i) continue;
- if (!opt.show_pri && idx[hits[i].ctg] == null) continue;
- var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0];
- // update name
- if (flag&0x40) s[0] += "/1";
- if (flag&0x80) s[0] += "/2";
- s[0] += "_" + (++cnt);
- if (hits[i].rev == hits[reported_i].rev) s.push(t[9], t[10]);
- else s.push(rs, rq);
+ if (!opt.show_pri && idx_alt[hits[i].ctg] == null) continue;
+ var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0];
+ // print sequence/quality and set the rev flag
+ if (hits[i].rev == hits[reported_i].rev) {
+ s.push(t[9], t[10]);
+ s[1] = flag | 0x800;
+ } else {
+ s.push(rs, rq);
+ s[1] = (flag ^ 0x10) | 0x800;
+ }
s.push("NM:i:" + hits[i].NM);
if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
buf2.push(s);
@@ -454,9 +554,30 @@ function bwa_postalt(args)
for (var i = 0; i < buf2.length; ++i)
print(buf2[i].join("\t"));
file.close();
+ if (fp_evi != null) fp_evi.close();
- aux.destroy();
buf.destroy();
+ aux.destroy();
+
+ // print weight of each contig
+ if (opt.pre != null) {
+ var fpout = new File(opt.pre + '.ctw', "w");
+ var weight_arr = [];
+ for (var ctg in weight_alt) {
+ var w = weight_alt[ctg];
+ weight_arr.push([ctg, w[6], w[7], w[8],
+ w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000',
+ w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']);
+ }
+ weight_arr.sort(function(a,b) {
+ return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0;
+ });
+ for (var i = 0; i < weight_arr.length; ++i) {
+ if (weight_arr[i][1] == '~') weight_arr[i][1] = '*';
+ fpout.write(weight_arr[i].join("\t") + '\n');
+ }
+ fpout.close();
+ }
}
bwa_postalt(arguments);
diff --git a/bwa.c b/bwa.c
index 43d8a58..30e5284 100644
--- a/bwa.c
+++ b/bwa.c
@@ -227,7 +227,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
return bwt;
}
-bwaidx_t *bwa_idx_load(const char *hint, int which)
+bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
{
bwaidx_t *idx;
char *prefix;
@@ -239,7 +239,12 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
idx = calloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
if (which & BWA_IDX_BNS) {
+ int i, c;
idx->bns = bns_restore(prefix);
+ for (i = c = 0; i < idx->bns->n_seqs; ++i)
+ if (idx->bns->anns[i].is_alt) ++c;
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
if (which & BWA_IDX_PAC) {
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
@@ -251,15 +256,91 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
return idx;
}
+bwaidx_t *bwa_idx_load(const char *hint, int which)
+{
+ return bwa_idx_load_from_disk(hint, which);
+}
+
void bwa_idx_destroy(bwaidx_t *idx)
{
if (idx == 0) return;
- if (idx->bwt) bwt_destroy(idx->bwt);
- if (idx->bns) bns_destroy(idx->bns);
- if (idx->pac) free(idx->pac);
+ if (idx->mem == 0) {
+ if (idx->bwt) bwt_destroy(idx->bwt);
+ if (idx->bns) bns_destroy(idx->bns);
+ if (idx->pac) free(idx->pac);
+ } else {
+ free(idx->bwt); free(idx->bns->anns); free(idx->bns);
+ if (!idx->is_shm) free(idx->mem);
+ }
free(idx);
}
+int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
+{
+ int64_t k = 0, x;
+ int i;
+
+ // generate idx->bwt
+ x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
+ x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
+ x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
+
+ // generate idx->bns and idx->pac
+ x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
+ x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
+ x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x;
+ for (i = 0; i < idx->bns->n_seqs; ++i) {
+ idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
+ idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
+ }
+ idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1;
+ assert(k == l_mem);
+
+ idx->l_mem = k; idx->mem = mem;
+ return 0;
+}
+
+int bwa_idx2mem(bwaidx_t *idx)
+{
+ int i;
+ int64_t k, x, tmp;
+ uint8_t *mem;
+
+ // copy idx->bwt
+ x = idx->bwt->bwt_size * 4;
+ mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
+ memmove(mem + sizeof(bwt_t), mem, x);
+ memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
+ x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
+ free(idx->bwt->sa);
+ free(idx->bwt); idx->bwt = 0;
+
+ // copy idx->bns
+ tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
+ for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
+ tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
+ mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
+ x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
+ x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
+ free(idx->bns->ambs);
+ x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
+ for (i = 0; i < idx->bns->n_seqs; ++i) {
+ x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
+ x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
+ free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
+ }
+ free(idx->bns->anns);
+
+ // copy idx->pac
+ x = idx->bns->l_pac/4+1;
+ mem = realloc(mem, k + x);
+ memcpy(mem + k, idx->pac, x); k += x;
+ free(idx->bns); idx->bns = 0;
+ free(idx->pac); idx->pac = 0;
+
+ return bwa_mem2idx(k, mem, idx);
+}
+
/***********************
* SAM header routines *
***********************/
diff --git a/bwa.h b/bwa.h
index bbc2525..6fcc82e 100644
--- a/bwa.h
+++ b/bwa.h
@@ -10,10 +10,16 @@
#define BWA_IDX_PAC 0x4
#define BWA_IDX_ALL 0x7
+#define BWA_CTL_SIZE 0x10000
+
typedef struct {
bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
+
+ int is_shm;
+ int64_t l_mem;
+ uint8_t *mem;
} bwaidx_t;
typedef struct {
@@ -37,8 +43,12 @@ extern "C" {
char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
+ bwaidx_t *bwa_idx_load_from_shm(const char *hint);
+ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx);
+ int bwa_idx2mem(bwaidx_t *idx);
+ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
char *bwa_set_rg(const char *s);
diff --git a/bwamem.c b/bwamem.c
index 6bbbc6e..79785f4 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -73,7 +73,7 @@ mem_opt_t *mem_opt_init()
o->chunk_size = 10000000;
o->n_threads = 1;
o->max_XA_hits = 5;
- o->max_XA_hits_alt = 50;
+ o->max_XA_hits_alt = 200;
o->max_matesw = 50;
o->mask_level_redun = 0.95;
o->min_chain_weight = 0;
@@ -512,38 +512,38 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *
int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
{
- int i, j, n_pri;
+ int i, n_pri;
int_v z = {0,0,0};
if (n == 0) return 0;
for (i = n_pri = 0; i < n; ++i) {
- a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_alt = -1, a[i].hash = hash_64(id+i);
+ a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i);
if (!a[i].is_alt) ++n_pri;
}
ks_introsort(mem_ars_hash, n, a);
mem_mark_primary_se_core(opt, n, a, &z);
for (i = 0; i < n; ++i) {
mem_alnreg_t *p = &a[i];
- p->secondary_alt = i; // keep the rank in the first round
+ p->secondary_all = i; // keep the rank in the first round
if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
p->alt_sc = a[p->secondary].score;
}
if (n_pri >= 0 && n_pri < n) {
kv_resize(int, z, n);
if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
- for (i = 0; i < n; ++i) z.a[a[i].secondary_alt] = i;
+ for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i;
for (i = 0; i < n; ++i) {
- if (a[i].secondary < 0) {
- a[i].secondary_alt = -1;
- continue;
- }
- j = z.a[a[i].secondary];
- a[i].secondary_alt = a[j].is_alt? j : -1;
- if (a[i].is_alt) a[i].secondary = INT_MAX;
+ if (a[i].secondary >= 0) {
+ a[i].secondary_all = z.a[a[i].secondary];
+ if (a[i].is_alt) a[i].secondary = INT_MAX;
+ } else a[i].secondary_all = -1;
}
if (n_pri > 0) { // mark primary for hits to the primary assembly only
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
mem_mark_primary_se_core(opt, n_pri, a, &z);
}
+ } else {
+ for (i = 0; i < n; ++i)
+ a[i].secondary_all = a[i].secondary;
}
free(z.a);
return n_pri;
diff --git a/bwamem.h b/bwamem.h
index 6079e5a..ae53601 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -69,7 +69,7 @@ typedef struct {
int w; // actual band width used in extension
int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
- int secondary_alt;
+ int secondary_all;
int seedlen0; // length of the starting seed
int n_comp:30, is_alt:2; // number of sub-alignments chained together
float frac_rep;
diff --git a/bwamem_extra.c b/bwamem_extra.c
index eea5ac7..09faaa2 100644
--- a/bwamem_extra.c
+++ b/bwamem_extra.c
@@ -112,34 +112,35 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
s->sam = str.s;
}
-static inline void get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i, int r[2])
+static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
{
- int j = a[i].secondary, k = a[i].secondary_alt;
- r[0] = r[1] = -1;
- if (j >= 0 && j < INT_MAX && !a[j].is_alt && !a[i].is_alt && a[i].score >= a[j].score * XA_drop_ratio) r[0] = j;
- if (k >= 0 && a[k].is_alt && a[i].score >= a[k].score * XA_drop_ratio) r[1] = k;
+ int k = a[i].secondary_all;
+ if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k;
+ return -1;
}
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
{
- int i, k, *cnt, tot, r[2];
+ int i, k, r, *cnt, tot;
kstring_t *aln = 0, str = {0,0,0};
- char **XA = 0;
+ char **XA = 0, *has_alt;
cnt = calloc(a->n, sizeof(int));
+ has_alt = calloc(a->n, 1);
for (i = 0, tot = 0; i < a->n; ++i) {
- get_pri_idx(opt->XA_drop_ratio, a->a, i, r);
- if (r[0] >= 0) ++cnt[r[0]], ++tot;
- if (r[1] >= 0) ++cnt[r[1]], ++tot;
+ r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
+ if (r >= 0) {
+ ++cnt[r], ++tot;
+ if (a->a[i].is_alt) has_alt[r] = 1;
+ }
}
if (tot == 0) goto end_gen_alt;
aln = calloc(a->n, sizeof(kstring_t));
for (i = 0; i < a->n; ++i) {
mem_aln_t t;
- get_pri_idx(opt->XA_drop_ratio, a->a, i, r);
- if (r[0] < 0 && r[1] < 0) continue;
- if ((r[0] >= 0 && cnt[r[0]] > opt->max_XA_hits) || (r[1] >= 0 && cnt[r[1]] > opt->max_XA_hits_alt)) continue;
+ if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
+ if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
str.l = 0;
kputs(bns->anns[t.rid].name, &str);
@@ -152,14 +153,13 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
kputc(',', &str); kputw(t.NM, &str);
kputc(';', &str);
free(t.cigar);
- if (r[0] >= 0 && cnt[r[0]] <= opt->max_XA_hits) kputsn(str.s, str.l, &aln[r[0]]);
- if (r[1] >= 0 && cnt[r[1]] <= opt->max_XA_hits_alt) kputsn(str.s, str.l, &aln[r[1]]);
+ kputsn(str.s, str.l, &aln[r]);
}
XA = calloc(a->n, sizeof(char*));
for (k = 0; k < a->n; ++k)
XA[k] = aln[k].s;
end_gen_alt:
- free(cnt); free(aln); free(str.s);
+ free(has_alt); free(cnt); free(aln); free(str.s);
return XA;
}
diff --git a/bwashm.c b/bwashm.c
new file mode 100644
index 0000000..4d60350
--- /dev/null
+++ b/bwashm.c
@@ -0,0 +1,200 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "bwa.h"
+
+int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
+{
+ const char *name;
+ uint8_t *shm, *shm_idx;
+ uint16_t *cnt, i;
+ int shmid, to_init = 0, l;
+ char path[PATH_MAX + 1], *p, *tmpfn = (char*)_tmpfn;
+
+ if (hint == 0 || hint[0] == 0) return -1;
+ for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+ ++name;
+
+ if ((shmid = shm_open("/bwactl", O_RDWR, 0444)) < 0) {
+ shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
+ to_init = 1;
+ }
+ if (shmid < 0) return -1;
+ ftruncate(shmid, BWA_CTL_SIZE);
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ if (to_init) {
+ memset(shm, 0, BWA_CTL_SIZE);
+ cnt[1] = 4;
+ } else {
+ for (i = 0, p = (char*)shm + 4; i < cnt[0]; ++i) {
+ if (strcmp(p + 8, name) == 0) break;
+ p += 9 + strlen(p + 8);
+ }
+ if (i < cnt[0]) {
+ fprintf(stderr, "[W::%s] index '%s' is already in shared memory\n", __func__, name);
+ return -1;
+ }
+ }
+
+ if (idx->mem == 0) bwa_idx2mem(idx);
+
+ if (tmpfn) {
+ FILE *fp;
+ if ((fp = fopen(tmpfn, "wb")) != 0) {
+ int64_t rest = idx->l_mem;
+ while (rest > 0) {
+ int64_t l = rest < 0x1000000? rest : 0x1000000;
+ rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp);
+ }
+ fclose(fp);
+ free(idx->mem); idx->mem = 0;
+ } else {
+ fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__);
+ tmpfn = 0;
+ }
+ }
+
+ strcat(strcpy(path, "/bwaidx-"), name);
+ l = 8 + strlen(name) + 1;
+ if (cnt[1] + l > BWA_CTL_SIZE) return -1;
+ memcpy(shm + cnt[1], &idx->l_mem, 8);
+ memcpy(shm + cnt[1] + 8, name, l - 8);
+ if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
+ shm_unlink(path);
+ perror("shm_open()");
+ return -1;
+ }
+ cnt[1] += l; ++cnt[0];
+ ftruncate(shmid, idx->l_mem);
+ shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
+ if (tmpfn) {
+ FILE *fp;
+ fp = fopen(tmpfn, "rb");
+ int64_t rest = idx->l_mem;
+ while (rest > 0) {
+ int64_t l = rest < 0x1000000? rest : 0x1000000;
+ rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp);
+ }
+ fclose(fp);
+ unlink(tmpfn);
+ } else {
+ memcpy(shm_idx, idx->mem, idx->l_mem);
+ free(idx->mem);
+ }
+ bwa_mem2idx(idx->l_mem, shm_idx, idx);
+ idx->is_shm = 1;
+ return 0;
+}
+
+bwaidx_t *bwa_idx_load_from_shm(const char *hint)
+{
+ const char *name;
+ uint8_t *shm, *shm_idx;
+ uint16_t *cnt, i;
+ char *p, path[PATH_MAX + 1];
+ int shmid;
+ int64_t l_mem;
+ bwaidx_t *idx;
+
+ if (hint == 0 || hint[0] == 0) return 0;
+ for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+ ++name;
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return 0;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ if (cnt[0] == 0) return 0;
+ for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) {
+ memcpy(&l_mem, p, 8); p += 8;
+ if (strcmp(p, name) == 0) break;
+ p += strlen(p) + 1;
+ }
+ if (i == cnt[0]) return 0;
+
+ strcat(strcpy(path, "/bwaidx-"), name);
+ if ((shmid = shm_open(path, O_RDONLY, 0444)) < 0) return 0;
+ shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0);
+ idx = calloc(1, sizeof(bwaidx_t));
+ bwa_mem2idx(l_mem, shm_idx, idx);
+ idx->is_shm = 1;
+ return idx;
+}
+
+int bwa_shm_list(void)
+{
+ int shmid;
+ uint16_t *cnt, i;
+ char *p, *shm;
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+ int64_t l_mem;
+ memcpy(&l_mem, p, 8); p += 8;
+ printf("%s\t%ld\n", p, (long)l_mem);
+ p += strlen(p) + 1;
+ }
+ return 0;
+}
+
+int bwa_shm_destroy(void)
+{
+ int shmid;
+ uint16_t *cnt, i;
+ char *p, *shm;
+ char path[PATH_MAX + 1];
+
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+ int64_t l_mem;
+ memcpy(&l_mem, p, 8); p += 8;
+ strcat(strcpy(path, "/bwaidx-"), p);
+ shm_unlink(path);
+ p += strlen(p) + 1;
+ }
+ munmap(shm, BWA_CTL_SIZE);
+ shm_unlink("/bwactl");
+ return 0;
+}
+
+int main_shm(int argc, char *argv[])
+{
+ int c, to_list = 0, to_drop = 0, ret = 0;
+ char *tmpfn = 0;
+ while ((c = getopt(argc, argv, "ldf:")) >= 0) {
+ if (c == 'l') to_list = 1;
+ else if (c == 'd') to_drop = 1;
+ else if (c == 'f') tmpfn = optarg;
+ }
+ if (optind == argc && !to_list && !to_drop) {
+ fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
+ fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
+ fprintf(stderr, " -l list names of indices in shared memory\n");
+ fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
+ return 1;
+ }
+ if (optind < argc && (to_list || to_drop)) {
+ fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__);
+ return 1;
+ }
+ if (optind < argc) {
+ bwaidx_t *idx;
+ idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
+ if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
+ fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
+ ret = 1;
+ }
+ bwa_idx_destroy(idx);
+ }
+ if (to_list) bwa_shm_list();
+ if (to_drop) bwa_shm_destroy();
+ return ret;
+}
diff --git a/fastmap.c b/fastmap.c
index 43ddfd3..e949da3 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -230,7 +230,11 @@ int main_mem(int argc, char *argv[])
} else update_a(opt, &opt0);
bwa_fill_scmat(opt->a, opt->b, opt->mat);
- if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
+ idx = bwa_idx_load_from_shm(argv[optind]);
+ if (idx == 0) {
+ if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
+ } else if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
if (ignore_alt)
for (i = 0; i < idx->bns->n_seqs; ++i)
idx->bns->anns[i].is_alt = 0;
diff --git a/main.c b/main.c
index 4b39d6e..313278c 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.10-r876-dirty"
+#define PACKAGE_VERSION "0.7.10-r907-dirty"
#endif
int bwa_fa2pac(int argc, char *argv[]);
@@ -22,6 +22,7 @@ int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]);
int main_mem(int argc, char *argv[]);
+int main_shm(int argc, char *argv[]);
int main_pemerge(int argc, char *argv[]);
@@ -43,6 +44,7 @@ static int usage()
fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries\n");
fprintf(stderr, "\n");
+ fprintf(stderr, " shm manage indices in shared memory\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
@@ -81,6 +83,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
+ else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);