diff --git a/Makefile b/Makefile index 9bccbf6..d0d50cf 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -Wno-unused-function -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o +LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwashm.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ @@ -14,6 +14,10 @@ INCLUDES= LIBS= -lm -lz -lpthread SUBDIRS= . +ifeq ($(shell uname -s),Linux) + LIBS += -lrt +endif + .SUFFIXES:.c .o .cc .c.o: @@ -52,6 +56,7 @@ bwape.o: ksw.h khash.h bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h bwase.o: bwa.h ksw.h bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h +bwashm.o: bwa.h bntseq.h bwt.h bwt.o: utils.h bwt.h kvec.h malloc_wrap.h bwt_gen.o: QSufSort.h malloc_wrap.h bwt_lite.o: bwt_lite.h malloc_wrap.h @@ -66,7 +71,6 @@ bwtsw2_core.o: khash.h ksort.h bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h bwtsw2_pair.o: malloc_wrap.h ksw.h -cutvar.o: bwa.h bntseq.h bwt.h kvec.h malloc_wrap.h example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h is.o: malloc_wrap.h diff --git a/README-alt.md b/README-alt.md new file mode 100644 index 0000000..8dac185 --- /dev/null +++ b/README-alt.md @@ -0,0 +1,123 @@ +## Getting Started + +Since version 0.7.11, BWA-MEM supports read mapping against a reference genome +with long alternative haplotypes present in separate ALT contigs. To use the +ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the +SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment is +available from the [BWA resource bundle for GRCh38][res]. + +#### Option 1: Mapping to the official GRCh38 with ALT contigs + +Construct the index: +```sh +wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz +gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz +mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa +bwa index hs38a.fa +cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt +``` + +Perform mapping: +```sh +bwa mem hs38a.fa read1.fq read2.fq \ + | samblaster \ + | bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \ + | samtools view -bS - > aln.unsrt.bam +``` +For short reads, the postprocessing script `bwa-postalt.js` runs at about the +same speed as BAM compression. + +#### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes + +Construct the index: +```sh +cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa +bwa index hs38d4.fa +cp bwa-hs38-res/hs38d4.fa.alt . +``` +Perform mapping: +```sh +bwa mem -g.8 hs38d4.fa read1.fq read2.fq \ + | samblaster \ + | bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \ + | samtools view -bS - > aln.unsrt.bam +``` +This command line generates `postinfo.ctw` which loosely evaluates the presence +of an ALT contig with an empirical score at the last column. + +## Background + +GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions. +However, sequences that are highly diverged from the primary assembly only +contribute a few million bp. Most subsequences of ALT contigs are highly similar +or identical to the primary assembly. If we align sequence reads to GRCh38+ALT +treating ALT equal to the primary assembly, we will get many reads with zero +mapping quality and lose variants on them. It is crucial to make the mapper +aware of ALTs. + +BWA-MEM is designed to minimize the interference of ALT contigs such that on the +primary assembly, the ALT-aware alignment is highly similar to the alignment +without using ALT contigs in the index. This design choice makes it almost +always safe to map reads to GRCh38+ALT. Although we don't know yet how much +variations on ALT contigs contribute to phenotypes, we would not get the answer +without mapping large cohorts to these extra sequences. We hope our current +implementation encourages researchers to use ALT contigs soon and often. + +## Methods + +As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and +postprocessing. + +#### Step 1: BWA-MEM mapping + +At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring +the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*, +depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports +alignments and assigns mapQ following these two rules: + + * The original mapQ of a non-ALT hit is computed across non-ALT hits only. + The reported mapQ of an ALT hit is computed across all hits. + + * An ALT hit is only reported if its score is strictly better than all + overlapping non-ALT hits. A reported ALT hit is flagged with 0x800 + (supplementary) unless there are no non-ALT hits. + +When option `-g FLOAT` is in use (which is the default), a third rule kicks in: + + * The mapQ of a non-ALT hit is reduced to zero if its score is less than FLOAT + times the score of an overlapping ALT hit. In this case, the original mapQ is + moved to the `om` tag. + +If we don't care about ALT hits, we may actually skip postprocessing (step 2). +Nonetheless, postprocessing is recommended as it improves mapQ and gives more +information about ALT hits. + +#### Step 2: Postprocessing + +Postprocessing is done with a separate script `bwa-postalt.js`. It reads all +potential hits reported in the XA tag, lifts ALT hits to the chromosomal +positions using the ALT-to-ref alignment, groups them after lifting and then +reassigns mapQ based on the best scoring hit in each group with all the hits in +a group get the same mapQ. Knowing the ALT-to-ref alignment, this script can +greatly improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. + +The script also measures the presence of each ALT contig. For a group of +overlapping ALT contigs c_1, ..., c_m, the weight for c_k equals `\frac{\sum_j +P(c_k|r_j)}{\sum_j\max_i P(c_i|r_j)}`, where `P(c_k|r)=\frac{pow(4,s_k)}{\sum_i +pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a +Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the +option 2 above. + +## Problems and Future Development + +There are some uncertainties about ALT mappings - we are not sure whether they +help biological discovery and don't know the best way to analyze them. Without +clear demand from downstream analyses, it is very difficult to design the +optimal mapping strategy. The current BWA-MEM method is just a start. If it +turns out to be useful in research, we will probably rewrite bwa-postalt.js in C +for performance; if not, we will try new designs. It is also possible that we +may make breakthrough on the representation of multiple genomes, in which case, +we can even get rid of ALT contigs once for all. + +[res]: https://sourceforge.net/projects/bio-bwa/files/ +[sb]: https://github.com/GregoryFaust/samblaster diff --git a/README.md b/README.md index 3472e91..cfc84c6 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,6 @@ do not have plan to submit it to a peer-reviewed journal in the near future. 3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb) 4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0) 5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref) -6. [How to map sequences to GRCh38 with ALT contigs?](#h38) ####1. What types of data does BWA work with? @@ -72,11 +71,11 @@ algorithm and setting may vary. The following list gives the recommended settings: * Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly - contigs up to a few megabases mapped to a close related reference genome: + contigs up to a few megabases mapped to a closely related reference genome: bwa mem ref.fa reads.fq > aln.sam -* Illumina single-end reads no longer than ~70bp: +* Illumina single-end reads shorter than ~70bp: bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam @@ -84,20 +83,21 @@ settings: bwa mem ref.fa read1.fq read2.fq > aln-pe.sam -* Illumina paired-end reads no longer than ~70bp: +* Illumina paired-end reads shorter than ~70bp: bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam -* PacBio subreads to a reference genome: +* PacBio subreads or Oxford Nanopore reads to a reference genome: bwa mem -x pacbio ref.fa reads.fq > aln.sam + bwa mem -x ont2d ref.fa reads.fq > aln.sam BWA-MEM is recommended for query sequences longer than ~70bp for a variety of error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with errors given longer query sequences as the chance of missing all seeds is small. -As is shown above, with non-default settings, BWA-MEM works with PacBio subreads -with a sequencing error rate as high as ~15%. +As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore +reads with a sequencing error rate over 20%. ####2. Why does a read appear multiple times in the output SAM? diff --git a/bntseq.c b/bntseq.c index 93cdba1..465e383 100644 --- a/bntseq.c +++ b/bntseq.c @@ -179,17 +179,25 @@ bntseq_t *bns_restore(const char *prefix) if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present char str[1024]; khash_t(str) *h; - int i, absent; + int c, i, absent; khint_t k; h = kh_init(str); for (i = 0; i < bns->n_seqs; ++i) { k = kh_put(str, h, bns->anns[i].name, &absent); kh_val(h, k) = i; } - while (fscanf(fp, "%s", str) == 1) { - k = kh_get(str, h, str); - if (k != kh_end(h)) - bns->anns[kh_val(h, k)].is_alt = 1; + i = 0; + while ((c = fgetc(fp)) != EOF) { + if (c == '\t' || c == '\n' || c == '\r') { + str[i] = 0; + if (str[0] != '@') { + k = kh_get(str, h, str); + if (k != kh_end(h)) + bns->anns[kh_val(h, k)].is_alt = 1; + } + while (c != '\n' && c != EOF) c = fgetc(fp); + i = 0; + } else str[i++] = c; // FIXME: potential segfault here } kh_destroy(str, h); fclose(fp); @@ -226,7 +234,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ } p = bns->anns + bns->n_seqs; p->name = strdup((char*)seq->name.s); - p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); + p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)"); p->gi = 0; p->len = seq->seq.l; p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; p->n_ambs = 0; @@ -358,8 +366,9 @@ int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re) { int is_rev, rid_b, rid_e; if (rb < bns->l_pac && re > bns->l_pac) return -2; + assert(rb <= re); rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev)); - rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1); + rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b; return rid_b == rid_e? rid_b : -1; } diff --git a/bwa-postalt.js b/bwa-postalt.js index 5e6b7c7..411cfdd 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -52,7 +52,7 @@ var getopt = function(args, ostr) { return optopt; } -// print an object in a format similar to JSON. For debugging. +// print an object in a format similar to JSON. For debugging only. function obj2str(o) { if (typeof(o) != 'object') { @@ -203,54 +203,31 @@ function parse_hit(s, opt) return h; } -// read the ALT-to-REF alignment and generate the index -function read_ALT_sam(fn) -{ - var intv = {}; - var file = new File(fn); - var buf = new Bytes(); - while (file.readline(buf) >= 0) { - var line = buf.toString(); - var t = line.split("\t"); - if (line.charAt(0) == '@') continue; - var flag = parseInt(t[1]); - var m, cigar = [], l_qaln = 0, l_qclip = 0; - while ((m = re_cigar.exec(t[5])) != null) { - var l = parseInt(m[1]); - cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip - if (m[2] == 'M' || m[2] == 'I') l_qaln += l; - else if (m[2] == 'S' || m[2] == 'H') l_qclip += l; - } - var j = flag&16? cigar.length-1 : 0; - var start = cigar[j][0] == 'S'? cigar[j][1] : 0; - if (intv[t[0]] == null) intv[t[0]] = []; - intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); - //print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar); - } - buf.destroy(); - file.close(); - // create index for intervals on ALT contigs - var idx = {}; - for (var ctg in intv) - idx[ctg] = intv_ovlp(intv[ctg]); - return idx; -} - function bwa_postalt(args) { - var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true }; + var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false }; - while ((c = getopt(args, 'pqv:')) != null) { + while ((c = getopt(args, 'Pqev:p:')) != null) { if (c == 'v') opt.verbose = parseInt(getopt.arg); - else if (c == 'p') opt.show_pri = true; + else if (c == 'p') opt.pre = getopt.arg; + else if (c == 'P') opt.show_pri = true; else if (c == 'q') opt.recover_maq = false; + else if (c == 'e') opt.show_ev = true; + } + + if (opt.show_ev && opt.pre == null) { + warn("ERROR: option '-p' must be specified if '-e' is applied."); + exit(1); } if (args.length == getopt.ind) { print(""); - print("Usage: k8 bwa-postalt.js [-p] [aln.sam]\n"); - print("Options: -p output lifted non-ALT hit in a SAM line (for ALT-unware alignments)"); - print(" -q don't recover mapQ for non-ALTs hit overlapping lifted ALT"); + print("Usage: k8 bwa-postalt.js [options] [aln.sam]\n"); + print("Options: -p STR prefix of file(s) for additional information [null]"); + print(" PREFIX.ctw - weight of each ALT contig"); + print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)"); + print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT"); + print(" -e show reads supporting ALT contigs into file PREFIX.evi"); print(""); print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to"); print(" the primary assembly, groups them and then estimates mapQ across groups. If"); @@ -262,12 +239,56 @@ function bwa_postalt(args) exit(1); } - var file, buf = new Bytes(); + var fp_evi = opt.show_ev && opt.pre? new File(opt.pre + '.evi', "w") : null; var aux = new Bytes(); // used for reverse and reverse complement - var idx = read_ALT_sam(args[getopt.ind]); - var buf2 = []; + var buf = new Bytes(); + + // read ALT-to-REF alignment + var intv_alt = {}, intv_pri = {}, idx_un = {}; + var file = new File(args[getopt.ind]); + while (file.readline(buf) >= 0) { + var line = buf.toString(); + if (line.charAt(0) == '@') continue; + var t = line.split("\t"); + if (t.length < 11) continue; // incomplete lines + var pos = parseInt(t[3]) - 1; + var flag = parseInt(t[1]); + if ((flag&4) || t[2] == '*') { + idx_un[t[0]] = true; + continue; + } + var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0; + while ((m = re_cigar.exec(t[5])) != null) { + var l = parseInt(m[1]); + cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip + if (m[2] == 'M') l_qaln += l, l_tlen += l; + else if (m[2] == 'I') l_qaln += l; + else if (m[2] == 'S' || m[2] == 'H') l_qclip += l; + else if (m[2] == 'D' || m[2] == 'N') l_tlen += l; + } + var j = flag&16? cigar.length-1 : 0; + var start = cigar[j][0] == 'S'? cigar[j][1] : 0; + if (intv_alt[t[0]] == null) intv_alt[t[0]] = []; + intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]); + if (intv_pri[t[2]] == null) intv_pri[t[2]] = []; + intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]); + } + file.close(); + var idx_alt = {}, idx_pri = {}; + for (var ctg in intv_alt) + idx_alt[ctg] = intv_ovlp(intv_alt[ctg]); + for (var ctg in intv_pri) + idx_pri[ctg] = intv_ovlp(intv_pri[ctg]); + + // initialize the list of ALT contigs + var weight_alt = []; + for (var ctg in idx_alt) + weight_alt[ctg] = [0, 0, 0, 0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]]; + for (var ctg in idx_un) + weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0]; // process SAM + var buf2 = []; file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); while (file.readline(buf) >= 0) { var m, line = buf.toString(); @@ -286,36 +307,49 @@ function bwa_postalt(args) buf2 = []; } - if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { + // skip unmapped lines + if (t[1]&4) { buf2.push(t); continue; } - var XA_strs = m[1].split(";"); // parse the reported hit - var hits = []; var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; var flag = t[1]; var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt); - if (h.hard) { // don't process lines with hard clips + if (h.hard) { // the following does not work with hard clipped alignments buf2.push(t); continue; } - hits.push(h); + var hits = [h]; // parse hits in the XA tag - for (var i = 0; i < XA_strs.length; ++i) - if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string - hits.push(parse_hit(XA_strs[i].split(","), opt)); + if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) { + var XA_strs = m[1].split(";"); + for (var i = 0; i < XA_strs.length; ++i) + if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string + hits.push(parse_hit(XA_strs[i].split(","), opt)); + } + + // check if there are ALT hits + var has_alt = false; + for (var i = 0; i < hits.length; ++i) + if (weight_alt[hits[i].ctg] != null) { + has_alt = true; + break; + } + if (!has_alt) { + buf2.push(t); + continue; + } // lift mapping positions to the primary assembly - var n_lifted = 0, n_rpt_lifted = 0; + var n_rpt_lifted = 0, rpt_lifted = null; for (var i = 0; i < hits.length; ++i) { - var h = hits[i]; + var a, h = hits[i]; - if (idx[h.ctg] == null) continue; - var a = idx[h.ctg](h.start, h.end); - if (a == null || a.length == 0) continue; + if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0) + continue; // find the approximate position on the primary assembly var lifted = []; @@ -333,36 +367,46 @@ function bwa_postalt(args) lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]); if (i == 0) ++n_rpt_lifted; } - if (lifted.length) ++n_lifted, hits[i].lifted = lifted; - } - if (n_lifted == 0) { - buf2.push(t); - continue; + if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0); + if (lifted.length) hits[i].lifted = lifted; } - // group hits based on the lifted positions on the primary assembly + // prepare for hits grouping for (var i = 0; i < hits.length; ++i) { // set keys for sorting - if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used + if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3]; else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end; hits[i].i = i; // keep the original index } - hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart }); - var last_chr = null, end = 0, g = -1; - for (var i = 0; i < hits.length; ++i) { - if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0; - else if (hits[i].pstart >= end) ++g; - hits[i].g = g; - end = end > hits[i].pend? end : hits[i].pend; + + // group hits based on the lifted positions on non-ALT sequences + if (hits.length > 1) { + hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart }); + var last_chr = null, end = 0, g = -1; + for (var i = 0; i < hits.length; ++i) { + if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0; + else if (hits[i].pstart >= end) ++g; + hits[i].g = g; + end = end > hits[i].pend? end : hits[i].pend; + } + } else hits[0].g = 0; + + // find the index and group id of the reported hit; find the size of the reported group + var reported_g = null, reported_i = null, n_group0 = 0; + if (hits.length > 1) { + for (var i = 0; i < hits.length; ++i) + if (hits[i].i == 0) + reported_g = hits[i].g, reported_i = i; + for (var i = 0; i < hits.length; ++i) + if (hits[i].g == reported_g) + ++n_group0; + } else { + if (weight_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT + buf2.push(t); + continue; + } + reported_g = reported_i = 0, n_group0 = 1; } - var reported_g = null, reported_i = null; - for (var i = 0; i < hits.length; ++i) - if (hits[i].i == 0) - reported_g = hits[i].g, reported_i = i; - var n_group0 = 0; // #hits overlapping the reported hit - for (var i = 0; i < hits.length; ++i) - if (hits[i].g == reported_g) - ++n_group0; // re-estimate mapping quality if necessary var mapQ, ori_mapQ = t[4]; @@ -379,12 +423,65 @@ function bwa_postalt(args) mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]); } else mapQ = 0; mapQ = mapQ < 60? mapQ : 60; - mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; + if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ; + else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; } else mapQ = t[4]; + // ALT genotyping + if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) { + // collect all overlapping ALT contigs + var hits2 = []; + for (var i = 0; i < hits.length; ++i) { + var h = hits[i]; + if (h.g == reported_g) + hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score, h.NM]); + } + var start = hits2[0][1], end = hits2[0][2]; + for (var i = 1; i < hits2.length; ++i) + end = end > hits2[i][2]? end : hits2[i][2]; + var alts = {}; + for (var i = 0; i < hits2.length; ++i) + if (weight_alt[hits2[i][3]] != null) + alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]]; + if (idx_pri[hits2[0][0]] != null) { // add other unreported hits + var ovlp = idx_pri[hits2[0][0]](start, end); + for (var i = 0; i < ovlp.length; ++i) + if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null) + alts[ovlp[i][2]] = [0, 0]; + } + + // add weight to each ALT contig + var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30, max_nm = -1; + for (var ctg in alts) + alt_arr.push([ctg, alts[ctg][0], 0, alts[ctg][1]]); + for (var i = 0; i < alt_arr.length; ++i) { + if (max_sc < alt_arr[i][1]) + max_sc = alt_arr[i][1], max_i = i; + min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1]; + var nm = alt_arr[i][1] > 0? alt_arr[i][3] : opt.max_nm_sc; + max_nm = max_nm > nm? max_nm : nm; + } + if (max_nm > opt.max_nm_sc) max_nm = opt.max_nm_sc; + if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) { + for (var i = 0; i < alt_arr.length; ++i) + sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc))); + for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum; + for (var i = 0; i < alt_arr.length; ++i) { + var e = [alt_arr[i][0], 1, alt_arr[max_i][2], alt_arr[i][2], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]]; + var w = weight_alt[e[0]]; + for (var j = 0; j < 6; ++j) w[j] += e[j+1]; + if (fp_evi) { + e[2] = e[2].toFixed(3); e[3] = e[3].toFixed(3); + e[5] = e[5].toFixed(3); e[6] = e[6].toFixed(3); + fp_evi.write(t[0] + '/' + (t[1]>>6&3) + '\t' + e.join("\t") + '\n'); + } + } + } + } + // check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) { - var l = lifted[0]; + var l = rpt_lifted; for (var i = 0; i < buf2.length; ++i) { var s = buf2[i]; if (l[0] != s[2]) continue; // different chr @@ -423,6 +520,7 @@ function bwa_postalt(args) need_rev = true; } if (need_rev) { // reverse and reverse complement + aux.length = 0; aux.set(t[9], 0); aux.revcomp(); rs = aux.toString(); aux.set(t[10],0); aux.reverse(); rq = aux.toString(); } @@ -438,14 +536,16 @@ function bwa_postalt(args) for (var i = 0; i < hits.length; ++i) { if (opt.verbose >= 5) print(obj2str(hits[i])); if (hits[i].g != reported_g || i == reported_i) continue; - if (!opt.show_pri && idx[hits[i].ctg] == null) continue; - var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0]; - // update name - if (flag&0x40) s[0] += "/1"; - if (flag&0x80) s[0] += "/2"; - s[0] += "_" + (++cnt); - if (hits[i].rev == hits[reported_i].rev) s.push(t[9], t[10]); - else s.push(rs, rq); + if (!opt.show_pri && idx_alt[hits[i].ctg] == null) continue; + var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0]; + // print sequence/quality and set the rev flag + if (hits[i].rev == hits[reported_i].rev) { + s.push(t[9], t[10]); + s[1] = flag | 0x800; + } else { + s.push(rs, rq); + s[1] = (flag ^ 0x10) | 0x800; + } s.push("NM:i:" + hits[i].NM); if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str); buf2.push(s); @@ -454,9 +554,30 @@ function bwa_postalt(args) for (var i = 0; i < buf2.length; ++i) print(buf2[i].join("\t")); file.close(); + if (fp_evi != null) fp_evi.close(); - aux.destroy(); buf.destroy(); + aux.destroy(); + + // print weight of each contig + if (opt.pre != null) { + var fpout = new File(opt.pre + '.ctw', "w"); + var weight_arr = []; + for (var ctg in weight_alt) { + var w = weight_alt[ctg]; + weight_arr.push([ctg, w[6], w[7], w[8], + w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000', + w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']); + } + weight_arr.sort(function(a,b) { + return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0; + }); + for (var i = 0; i < weight_arr.length; ++i) { + if (weight_arr[i][1] == '~') weight_arr[i][1] = '*'; + fpout.write(weight_arr[i].join("\t") + '\n'); + } + fpout.close(); + } } bwa_postalt(arguments); diff --git a/bwa.c b/bwa.c index 43d8a58..30e5284 100644 --- a/bwa.c +++ b/bwa.c @@ -227,7 +227,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint) return bwt; } -bwaidx_t *bwa_idx_load(const char *hint, int which) +bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which) { bwaidx_t *idx; char *prefix; @@ -239,7 +239,12 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) idx = calloc(1, sizeof(bwaidx_t)); if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BNS) { + int i, c; idx->bns = bns_restore(prefix); + for (i = c = 0; i < idx->bns->n_seqs; ++i) + if (idx->bns->anns[i].is_alt) ++c; + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c); if (which & BWA_IDX_PAC) { idx->pac = calloc(idx->bns->l_pac/4+1, 1); err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence @@ -251,15 +256,91 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) return idx; } +bwaidx_t *bwa_idx_load(const char *hint, int which) +{ + return bwa_idx_load_from_disk(hint, which); +} + void bwa_idx_destroy(bwaidx_t *idx) { if (idx == 0) return; - if (idx->bwt) bwt_destroy(idx->bwt); - if (idx->bns) bns_destroy(idx->bns); - if (idx->pac) free(idx->pac); + if (idx->mem == 0) { + if (idx->bwt) bwt_destroy(idx->bwt); + if (idx->bns) bns_destroy(idx->bns); + if (idx->pac) free(idx->pac); + } else { + free(idx->bwt); free(idx->bns->anns); free(idx->bns); + if (!idx->is_shm) free(idx->mem); + } free(idx); } +int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx) +{ + int64_t k = 0, x; + int i; + + // generate idx->bwt + x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x; + x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x; + x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; + + // generate idx->bns and idx->pac + x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x; + x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x; + x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x; + for (i = 0; i < idx->bns->n_seqs; ++i) { + idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1; + idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1; + } + idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1; + assert(k == l_mem); + + idx->l_mem = k; idx->mem = mem; + return 0; +} + +int bwa_idx2mem(bwaidx_t *idx) +{ + int i; + int64_t k, x, tmp; + uint8_t *mem; + + // copy idx->bwt + x = idx->bwt->bwt_size * 4; + mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0; + memmove(mem + sizeof(bwt_t), mem, x); + memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x; + x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x; + free(idx->bwt->sa); + free(idx->bwt); idx->bwt = 0; + + // copy idx->bns + tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t); + for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory + tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2; + mem = realloc(mem, k + sizeof(bntseq_t) + tmp); + x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x; + x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x; + free(idx->bns->ambs); + x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x; + for (i = 0; i < idx->bns->n_seqs; ++i) { + x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x; + x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x; + free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno); + } + free(idx->bns->anns); + + // copy idx->pac + x = idx->bns->l_pac/4+1; + mem = realloc(mem, k + x); + memcpy(mem + k, idx->pac, x); k += x; + free(idx->bns); idx->bns = 0; + free(idx->pac); idx->pac = 0; + + return bwa_mem2idx(k, mem, idx); +} + /*********************** * SAM header routines * ***********************/ diff --git a/bwa.h b/bwa.h index bbc2525..6fcc82e 100644 --- a/bwa.h +++ b/bwa.h @@ -10,10 +10,16 @@ #define BWA_IDX_PAC 0x4 #define BWA_IDX_ALL 0x7 +#define BWA_CTL_SIZE 0x10000 + typedef struct { bwt_t *bwt; // FM-index bntseq_t *bns; // information on the reference sequences uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base + + int is_shm; + int64_t l_mem; + uint8_t *mem; } bwaidx_t; typedef struct { @@ -37,8 +43,12 @@ extern "C" { char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); + bwaidx_t *bwa_idx_load_from_shm(const char *hint); + bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which); bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); + int bwa_idx2mem(bwaidx_t *idx); + int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx); void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); char *bwa_set_rg(const char *s); diff --git a/bwamem.c b/bwamem.c index 6bbbc6e..79785f4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -73,7 +73,7 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->max_XA_hits = 5; - o->max_XA_hits_alt = 50; + o->max_XA_hits_alt = 200; o->max_matesw = 50; o->mask_level_redun = 0.95; o->min_chain_weight = 0; @@ -512,38 +512,38 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t * int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) { - int i, j, n_pri; + int i, n_pri; int_v z = {0,0,0}; if (n == 0) return 0; for (i = n_pri = 0; i < n; ++i) { - a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_alt = -1, a[i].hash = hash_64(id+i); + a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i); if (!a[i].is_alt) ++n_pri; } ks_introsort(mem_ars_hash, n, a); mem_mark_primary_se_core(opt, n, a, &z); for (i = 0; i < n; ++i) { mem_alnreg_t *p = &a[i]; - p->secondary_alt = i; // keep the rank in the first round + p->secondary_all = i; // keep the rank in the first round if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt) p->alt_sc = a[p->secondary].score; } if (n_pri >= 0 && n_pri < n) { kv_resize(int, z, n); if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a); - for (i = 0; i < n; ++i) z.a[a[i].secondary_alt] = i; + for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i; for (i = 0; i < n; ++i) { - if (a[i].secondary < 0) { - a[i].secondary_alt = -1; - continue; - } - j = z.a[a[i].secondary]; - a[i].secondary_alt = a[j].is_alt? j : -1; - if (a[i].is_alt) a[i].secondary = INT_MAX; + if (a[i].secondary >= 0) { + a[i].secondary_all = z.a[a[i].secondary]; + if (a[i].is_alt) a[i].secondary = INT_MAX; + } else a[i].secondary_all = -1; } if (n_pri > 0) { // mark primary for hits to the primary assembly only for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; mem_mark_primary_se_core(opt, n_pri, a, &z); } + } else { + for (i = 0; i < n; ++i) + a[i].secondary_all = a[i].secondary; } free(z.a); return n_pri; diff --git a/bwamem.h b/bwamem.h index 6079e5a..ae53601 100644 --- a/bwamem.h +++ b/bwamem.h @@ -69,7 +69,7 @@ typedef struct { int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary - int secondary_alt; + int secondary_all; int seedlen0; // length of the starting seed int n_comp:30, is_alt:2; // number of sub-alignments chained together float frac_rep; diff --git a/bwamem_extra.c b/bwamem_extra.c index eea5ac7..09faaa2 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -112,34 +112,35 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, s->sam = str.s; } -static inline void get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i, int r[2]) +static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i) { - int j = a[i].secondary, k = a[i].secondary_alt; - r[0] = r[1] = -1; - if (j >= 0 && j < INT_MAX && !a[j].is_alt && !a[i].is_alt && a[i].score >= a[j].score * XA_drop_ratio) r[0] = j; - if (k >= 0 && a[k].is_alt && a[i].score >= a[k].score * XA_drop_ratio) r[1] = k; + int k = a[i].secondary_all; + if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k; + return -1; } // Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() { - int i, k, *cnt, tot, r[2]; + int i, k, r, *cnt, tot; kstring_t *aln = 0, str = {0,0,0}; - char **XA = 0; + char **XA = 0, *has_alt; cnt = calloc(a->n, sizeof(int)); + has_alt = calloc(a->n, 1); for (i = 0, tot = 0; i < a->n; ++i) { - get_pri_idx(opt->XA_drop_ratio, a->a, i, r); - if (r[0] >= 0) ++cnt[r[0]], ++tot; - if (r[1] >= 0) ++cnt[r[1]], ++tot; + r = get_pri_idx(opt->XA_drop_ratio, a->a, i); + if (r >= 0) { + ++cnt[r], ++tot; + if (a->a[i].is_alt) has_alt[r] = 1; + } } if (tot == 0) goto end_gen_alt; aln = calloc(a->n, sizeof(kstring_t)); for (i = 0; i < a->n; ++i) { mem_aln_t t; - get_pri_idx(opt->XA_drop_ratio, a->a, i, r); - if (r[0] < 0 && r[1] < 0) continue; - if ((r[0] >= 0 && cnt[r[0]] > opt->max_XA_hits) || (r[1] >= 0 && cnt[r[1]] > opt->max_XA_hits_alt)) continue; + if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue; + if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue; t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); str.l = 0; kputs(bns->anns[t.rid].name, &str); @@ -152,14 +153,13 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac kputc(',', &str); kputw(t.NM, &str); kputc(';', &str); free(t.cigar); - if (r[0] >= 0 && cnt[r[0]] <= opt->max_XA_hits) kputsn(str.s, str.l, &aln[r[0]]); - if (r[1] >= 0 && cnt[r[1]] <= opt->max_XA_hits_alt) kputsn(str.s, str.l, &aln[r[1]]); + kputsn(str.s, str.l, &aln[r]); } XA = calloc(a->n, sizeof(char*)); for (k = 0; k < a->n; ++k) XA[k] = aln[k].s; end_gen_alt: - free(cnt); free(aln); free(str.s); + free(has_alt); free(cnt); free(aln); free(str.s); return XA; } diff --git a/bwashm.c b/bwashm.c new file mode 100644 index 0000000..4d60350 --- /dev/null +++ b/bwashm.c @@ -0,0 +1,200 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bwa.h" + +int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn) +{ + const char *name; + uint8_t *shm, *shm_idx; + uint16_t *cnt, i; + int shmid, to_init = 0, l; + char path[PATH_MAX + 1], *p, *tmpfn = (char*)_tmpfn; + + if (hint == 0 || hint[0] == 0) return -1; + for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); + ++name; + + if ((shmid = shm_open("/bwactl", O_RDWR, 0444)) < 0) { + shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644); + to_init = 1; + } + if (shmid < 0) return -1; + ftruncate(shmid, BWA_CTL_SIZE); + shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + if (to_init) { + memset(shm, 0, BWA_CTL_SIZE); + cnt[1] = 4; + } else { + for (i = 0, p = (char*)shm + 4; i < cnt[0]; ++i) { + if (strcmp(p + 8, name) == 0) break; + p += 9 + strlen(p + 8); + } + if (i < cnt[0]) { + fprintf(stderr, "[W::%s] index '%s' is already in shared memory\n", __func__, name); + return -1; + } + } + + if (idx->mem == 0) bwa_idx2mem(idx); + + if (tmpfn) { + FILE *fp; + if ((fp = fopen(tmpfn, "wb")) != 0) { + int64_t rest = idx->l_mem; + while (rest > 0) { + int64_t l = rest < 0x1000000? rest : 0x1000000; + rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp); + } + fclose(fp); + free(idx->mem); idx->mem = 0; + } else { + fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__); + tmpfn = 0; + } + } + + strcat(strcpy(path, "/bwaidx-"), name); + l = 8 + strlen(name) + 1; + if (cnt[1] + l > BWA_CTL_SIZE) return -1; + memcpy(shm + cnt[1], &idx->l_mem, 8); + memcpy(shm + cnt[1] + 8, name, l - 8); + if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) { + shm_unlink(path); + perror("shm_open()"); + return -1; + } + cnt[1] += l; ++cnt[0]; + ftruncate(shmid, idx->l_mem); + shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0); + if (tmpfn) { + FILE *fp; + fp = fopen(tmpfn, "rb"); + int64_t rest = idx->l_mem; + while (rest > 0) { + int64_t l = rest < 0x1000000? rest : 0x1000000; + rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp); + } + fclose(fp); + unlink(tmpfn); + } else { + memcpy(shm_idx, idx->mem, idx->l_mem); + free(idx->mem); + } + bwa_mem2idx(idx->l_mem, shm_idx, idx); + idx->is_shm = 1; + return 0; +} + +bwaidx_t *bwa_idx_load_from_shm(const char *hint) +{ + const char *name; + uint8_t *shm, *shm_idx; + uint16_t *cnt, i; + char *p, path[PATH_MAX + 1]; + int shmid; + int64_t l_mem; + bwaidx_t *idx; + + if (hint == 0 || hint[0] == 0) return 0; + for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name); + ++name; + if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return 0; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + if (cnt[0] == 0) return 0; + for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) { + memcpy(&l_mem, p, 8); p += 8; + if (strcmp(p, name) == 0) break; + p += strlen(p) + 1; + } + if (i == cnt[0]) return 0; + + strcat(strcpy(path, "/bwaidx-"), name); + if ((shmid = shm_open(path, O_RDONLY, 0444)) < 0) return 0; + shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0); + idx = calloc(1, sizeof(bwaidx_t)); + bwa_mem2idx(l_mem, shm_idx, idx); + idx->is_shm = 1; + return idx; +} + +int bwa_shm_list(void) +{ + int shmid; + uint16_t *cnt, i; + char *p, *shm; + if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + for (i = 0, p = shm + 4; i < cnt[0]; ++i) { + int64_t l_mem; + memcpy(&l_mem, p, 8); p += 8; + printf("%s\t%ld\n", p, (long)l_mem); + p += strlen(p) + 1; + } + return 0; +} + +int bwa_shm_destroy(void) +{ + int shmid; + uint16_t *cnt, i; + char *p, *shm; + char path[PATH_MAX + 1]; + + if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1; + shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0); + cnt = (uint16_t*)shm; + for (i = 0, p = shm + 4; i < cnt[0]; ++i) { + int64_t l_mem; + memcpy(&l_mem, p, 8); p += 8; + strcat(strcpy(path, "/bwaidx-"), p); + shm_unlink(path); + p += strlen(p) + 1; + } + munmap(shm, BWA_CTL_SIZE); + shm_unlink("/bwactl"); + return 0; +} + +int main_shm(int argc, char *argv[]) +{ + int c, to_list = 0, to_drop = 0, ret = 0; + char *tmpfn = 0; + while ((c = getopt(argc, argv, "ldf:")) >= 0) { + if (c == 'l') to_list = 1; + else if (c == 'd') to_drop = 1; + else if (c == 'f') tmpfn = optarg; + } + if (optind == argc && !to_list && !to_drop) { + fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n"); + fprintf(stderr, "Options: -d destroy all indices in shared memory\n"); + fprintf(stderr, " -l list names of indices in shared memory\n"); + fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n"); + return 1; + } + if (optind < argc && (to_list || to_drop)) { + fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__); + return 1; + } + if (optind < argc) { + bwaidx_t *idx; + idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL); + if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) { + fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__); + ret = 1; + } + bwa_idx_destroy(idx); + } + if (to_list) bwa_shm_list(); + if (to_drop) bwa_shm_destroy(); + return ret; +} diff --git a/fastmap.c b/fastmap.c index 43ddfd3..e949da3 100644 --- a/fastmap.c +++ b/fastmap.c @@ -230,7 +230,11 @@ int main_mem(int argc, char *argv[]) } else update_a(opt, &opt0); bwa_fill_scmat(opt->a, opt->b, opt->mat); - if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + idx = bwa_idx_load_from_shm(argv[optind]); + if (idx == 0) { + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + } else if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__); if (ignore_alt) for (i = 0; i < idx->bns->n_seqs; ++i) idx->bns->anns[i].is_alt = 0; diff --git a/main.c b/main.c index 4b39d6e..313278c 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r876-dirty" +#define PACKAGE_VERSION "0.7.10-r907-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -22,6 +22,7 @@ int bwa_bwtsw2(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]); int main_mem(int argc, char *argv[]); +int main_shm(int argc, char *argv[]); int main_pemerge(int argc, char *argv[]); @@ -43,6 +44,7 @@ static int usage() fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, "\n"); + fprintf(stderr, " shm manage indices in shared memory\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); @@ -81,6 +83,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); + else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1); else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);