Merge branch 'dev'

This commit is contained in:
Heng Li 2014-10-17 09:43:52 -04:00
commit 44a185f2c1
13 changed files with 695 additions and 140 deletions

View File

@ -4,7 +4,7 @@ CFLAGS= -g -Wall -Wno-unused-function -O2
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwashm.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
is.o bwtindex.o bwape.o kopen.o pemerge.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
@ -14,6 +14,10 @@ INCLUDES=
LIBS= -lm -lz -lpthread LIBS= -lm -lz -lpthread
SUBDIRS= . SUBDIRS= .
ifeq ($(shell uname -s),Linux)
LIBS += -lrt
endif
.SUFFIXES:.c .o .cc .SUFFIXES:.c .o .cc
.c.o: .c.o:
@ -52,6 +56,7 @@ bwape.o: ksw.h khash.h
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
bwase.o: bwa.h ksw.h bwase.o: bwa.h ksw.h
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
bwashm.o: bwa.h bntseq.h bwt.h
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
bwt_gen.o: QSufSort.h malloc_wrap.h bwt_gen.o: QSufSort.h malloc_wrap.h
bwt_lite.o: bwt_lite.h malloc_wrap.h bwt_lite.o: bwt_lite.h malloc_wrap.h
@ -66,7 +71,6 @@ bwtsw2_core.o: khash.h ksort.h
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
bwtsw2_pair.o: malloc_wrap.h ksw.h bwtsw2_pair.o: malloc_wrap.h ksw.h
cutvar.o: bwa.h bntseq.h bwt.h kvec.h malloc_wrap.h
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
is.o: malloc_wrap.h is.o: malloc_wrap.h

123
README-alt.md 100644
View File

@ -0,0 +1,123 @@
## Getting Started
Since version 0.7.11, BWA-MEM supports read mapping against a reference genome
with long alternative haplotypes present in separate ALT contigs. To use the
ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the
SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment is
available from the [BWA resource bundle for GRCh38][res].
#### Option 1: Mapping to the official GRCh38 with ALT contigs
Construct the index:
```sh
wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa
bwa index hs38a.fa
cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt
```
Perform mapping:
```sh
bwa mem hs38a.fa read1.fq read2.fq \
| samblaster \
| bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \
| samtools view -bS - > aln.unsrt.bam
```
For short reads, the postprocessing script `bwa-postalt.js` runs at about the
same speed as BAM compression.
#### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes
Construct the index:
```sh
cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa
bwa index hs38d4.fa
cp bwa-hs38-res/hs38d4.fa.alt .
```
Perform mapping:
```sh
bwa mem -g.8 hs38d4.fa read1.fq read2.fq \
| samblaster \
| bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
| samtools view -bS - > aln.unsrt.bam
```
This command line generates `postinfo.ctw` which loosely evaluates the presence
of an ALT contig with an empirical score at the last column.
## Background
GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions.
However, sequences that are highly diverged from the primary assembly only
contribute a few million bp. Most subsequences of ALT contigs are highly similar
or identical to the primary assembly. If we align sequence reads to GRCh38+ALT
treating ALT equal to the primary assembly, we will get many reads with zero
mapping quality and lose variants on them. It is crucial to make the mapper
aware of ALTs.
BWA-MEM is designed to minimize the interference of ALT contigs such that on the
primary assembly, the ALT-aware alignment is highly similar to the alignment
without using ALT contigs in the index. This design choice makes it almost
always safe to map reads to GRCh38+ALT. Although we don't know yet how much
variations on ALT contigs contribute to phenotypes, we would not get the answer
without mapping large cohorts to these extra sequences. We hope our current
implementation encourages researchers to use ALT contigs soon and often.
## Methods
As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
postprocessing.
#### Step 1: BWA-MEM mapping
At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring
the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
alignments and assigns mapQ following these two rules:
* The original mapQ of a non-ALT hit is computed across non-ALT hits only.
The reported mapQ of an ALT hit is computed across all hits.
* An ALT hit is only reported if its score is strictly better than all
overlapping non-ALT hits. A reported ALT hit is flagged with 0x800
(supplementary) unless there are no non-ALT hits.
When option `-g FLOAT` is in use (which is the default), a third rule kicks in:
* The mapQ of a non-ALT hit is reduced to zero if its score is less than FLOAT
times the score of an overlapping ALT hit. In this case, the original mapQ is
moved to the `om` tag.
If we don't care about ALT hits, we may actually skip postprocessing (step 2).
Nonetheless, postprocessing is recommended as it improves mapQ and gives more
information about ALT hits.
#### Step 2: Postprocessing
Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
potential hits reported in the XA tag, lifts ALT hits to the chromosomal
positions using the ALT-to-ref alignment, groups them after lifting and then
reassigns mapQ based on the best scoring hit in each group with all the hits in
a group get the same mapQ. Knowing the ALT-to-ref alignment, this script can
greatly improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits.
The script also measures the presence of each ALT contig. For a group of
overlapping ALT contigs c_1, ..., c_m, the weight for c_k equals `\frac{\sum_j
P(c_k|r_j)}{\sum_j\max_i P(c_i|r_j)}`, where `P(c_k|r)=\frac{pow(4,s_k)}{\sum_i
pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a
Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the
option 2 above.
## Problems and Future Development
There are some uncertainties about ALT mappings - we are not sure whether they
help biological discovery and don't know the best way to analyze them. Without
clear demand from downstream analyses, it is very difficult to design the
optimal mapping strategy. The current BWA-MEM method is just a start. If it
turns out to be useful in research, we will probably rewrite bwa-postalt.js in C
for performance; if not, we will try new designs. It is also possible that we
may make breakthrough on the representation of multiple genomes, in which case,
we can even get rid of ALT contigs once for all.
[res]: https://sourceforge.net/projects/bio-bwa/files/
[sb]: https://github.com/GregoryFaust/samblaster

View File

@ -63,7 +63,6 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb) 3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0) 4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0)
5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref) 5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref)
6. [How to map sequences to GRCh38 with ALT contigs?](#h38)
####<a name="type"></a>1. What types of data does BWA work with? ####<a name="type"></a>1. What types of data does BWA work with?
@ -72,11 +71,11 @@ algorithm and setting may vary. The following list gives the recommended
settings: settings:
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly * Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
contigs up to a few megabases mapped to a close related reference genome: contigs up to a few megabases mapped to a closely related reference genome:
bwa mem ref.fa reads.fq > aln.sam bwa mem ref.fa reads.fq > aln.sam
* Illumina single-end reads no longer than ~70bp: * Illumina single-end reads shorter than ~70bp:
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
@ -84,20 +83,21 @@ settings:
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
* Illumina paired-end reads no longer than ~70bp: * Illumina paired-end reads shorter than ~70bp:
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
* PacBio subreads to a reference genome: * PacBio subreads or Oxford Nanopore reads to a reference genome:
bwa mem -x pacbio ref.fa reads.fq > aln.sam bwa mem -x pacbio ref.fa reads.fq > aln.sam
bwa mem -x ont2d ref.fa reads.fq > aln.sam
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
errors given longer query sequences as the chance of missing all seeds is small. errors given longer query sequences as the chance of missing all seeds is small.
As is shown above, with non-default settings, BWA-MEM works with PacBio subreads As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
with a sequencing error rate as high as ~15%. reads with a sequencing error rate over 20%.
####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM? ####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?

View File

@ -179,17 +179,25 @@ bntseq_t *bns_restore(const char *prefix)
if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
char str[1024]; char str[1024];
khash_t(str) *h; khash_t(str) *h;
int i, absent; int c, i, absent;
khint_t k; khint_t k;
h = kh_init(str); h = kh_init(str);
for (i = 0; i < bns->n_seqs; ++i) { for (i = 0; i < bns->n_seqs; ++i) {
k = kh_put(str, h, bns->anns[i].name, &absent); k = kh_put(str, h, bns->anns[i].name, &absent);
kh_val(h, k) = i; kh_val(h, k) = i;
} }
while (fscanf(fp, "%s", str) == 1) { i = 0;
k = kh_get(str, h, str); while ((c = fgetc(fp)) != EOF) {
if (k != kh_end(h)) if (c == '\t' || c == '\n' || c == '\r') {
bns->anns[kh_val(h, k)].is_alt = 1; str[i] = 0;
if (str[0] != '@') {
k = kh_get(str, h, str);
if (k != kh_end(h))
bns->anns[kh_val(h, k)].is_alt = 1;
}
while (c != '\n' && c != EOF) c = fgetc(fp);
i = 0;
} else str[i++] = c; // FIXME: potential segfault here
} }
kh_destroy(str, h); kh_destroy(str, h);
fclose(fp); fclose(fp);
@ -226,7 +234,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_
} }
p = bns->anns + bns->n_seqs; p = bns->anns + bns->n_seqs;
p->name = strdup((char*)seq->name.s); p->name = strdup((char*)seq->name.s);
p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
p->gi = 0; p->len = seq->seq.l; p->gi = 0; p->len = seq->seq.l;
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
p->n_ambs = 0; p->n_ambs = 0;
@ -358,8 +366,9 @@ int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
{ {
int is_rev, rid_b, rid_e; int is_rev, rid_b, rid_e;
if (rb < bns->l_pac && re > bns->l_pac) return -2; if (rb < bns->l_pac && re > bns->l_pac) return -2;
assert(rb <= re);
rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev)); rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1); rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b;
return rid_b == rid_e? rid_b : -1; return rid_b == rid_e? rid_b : -1;
} }

View File

@ -52,7 +52,7 @@ var getopt = function(args, ostr) {
return optopt; return optopt;
} }
// print an object in a format similar to JSON. For debugging. // print an object in a format similar to JSON. For debugging only.
function obj2str(o) function obj2str(o)
{ {
if (typeof(o) != 'object') { if (typeof(o) != 'object') {
@ -203,54 +203,31 @@ function parse_hit(s, opt)
return h; return h;
} }
// read the ALT-to-REF alignment and generate the index
function read_ALT_sam(fn)
{
var intv = {};
var file = new File(fn);
var buf = new Bytes();
while (file.readline(buf) >= 0) {
var line = buf.toString();
var t = line.split("\t");
if (line.charAt(0) == '@') continue;
var flag = parseInt(t[1]);
var m, cigar = [], l_qaln = 0, l_qclip = 0;
while ((m = re_cigar.exec(t[5])) != null) {
var l = parseInt(m[1]);
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
if (m[2] == 'M' || m[2] == 'I') l_qaln += l;
else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
}
var j = flag&16? cigar.length-1 : 0;
var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
if (intv[t[0]] == null) intv[t[0]] = [];
intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]);
//print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar);
}
buf.destroy();
file.close();
// create index for intervals on ALT contigs
var idx = {};
for (var ctg in intv)
idx[ctg] = intv_ovlp(intv[ctg]);
return idx;
}
function bwa_postalt(args) function bwa_postalt(args)
{ {
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true }; var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false };
while ((c = getopt(args, 'pqv:')) != null) { while ((c = getopt(args, 'Pqev:p:')) != null) {
if (c == 'v') opt.verbose = parseInt(getopt.arg); if (c == 'v') opt.verbose = parseInt(getopt.arg);
else if (c == 'p') opt.show_pri = true; else if (c == 'p') opt.pre = getopt.arg;
else if (c == 'P') opt.show_pri = true;
else if (c == 'q') opt.recover_maq = false; else if (c == 'q') opt.recover_maq = false;
else if (c == 'e') opt.show_ev = true;
}
if (opt.show_ev && opt.pre == null) {
warn("ERROR: option '-p' must be specified if '-e' is applied.");
exit(1);
} }
if (args.length == getopt.ind) { if (args.length == getopt.ind) {
print(""); print("");
print("Usage: k8 bwa-postalt.js [-p] <alt.sam> [aln.sam]\n"); print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
print("Options: -p output lifted non-ALT hit in a SAM line (for ALT-unware alignments)"); print("Options: -p STR prefix of file(s) for additional information [null]");
print(" -q don't recover mapQ for non-ALTs hit overlapping lifted ALT"); print(" PREFIX.ctw - weight of each ALT contig");
print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)");
print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT");
print(" -e show reads supporting ALT contigs into file PREFIX.evi");
print(""); print("");
print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to"); print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to");
print(" the primary assembly, groups them and then estimates mapQ across groups. If"); print(" the primary assembly, groups them and then estimates mapQ across groups. If");
@ -262,12 +239,56 @@ function bwa_postalt(args)
exit(1); exit(1);
} }
var file, buf = new Bytes(); var fp_evi = opt.show_ev && opt.pre? new File(opt.pre + '.evi', "w") : null;
var aux = new Bytes(); // used for reverse and reverse complement var aux = new Bytes(); // used for reverse and reverse complement
var idx = read_ALT_sam(args[getopt.ind]); var buf = new Bytes();
var buf2 = [];
// read ALT-to-REF alignment
var intv_alt = {}, intv_pri = {}, idx_un = {};
var file = new File(args[getopt.ind]);
while (file.readline(buf) >= 0) {
var line = buf.toString();
if (line.charAt(0) == '@') continue;
var t = line.split("\t");
if (t.length < 11) continue; // incomplete lines
var pos = parseInt(t[3]) - 1;
var flag = parseInt(t[1]);
if ((flag&4) || t[2] == '*') {
idx_un[t[0]] = true;
continue;
}
var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
while ((m = re_cigar.exec(t[5])) != null) {
var l = parseInt(m[1]);
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
if (m[2] == 'M') l_qaln += l, l_tlen += l;
else if (m[2] == 'I') l_qaln += l;
else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
}
var j = flag&16? cigar.length-1 : 0;
var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
}
file.close();
var idx_alt = {}, idx_pri = {};
for (var ctg in intv_alt)
idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
for (var ctg in intv_pri)
idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
// initialize the list of ALT contigs
var weight_alt = [];
for (var ctg in idx_alt)
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]];
for (var ctg in idx_un)
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0];
// process SAM // process SAM
var buf2 = [];
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File(); file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
while (file.readline(buf) >= 0) { while (file.readline(buf) >= 0) {
var m, line = buf.toString(); var m, line = buf.toString();
@ -286,36 +307,49 @@ function bwa_postalt(args)
buf2 = []; buf2 = [];
} }
if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { // skip unmapped lines
if (t[1]&4) {
buf2.push(t); buf2.push(t);
continue; continue;
} }
var XA_strs = m[1].split(";");
// parse the reported hit // parse the reported hit
var hits = [];
var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
var flag = t[1]; var flag = t[1];
var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt); var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
if (h.hard) { // don't process lines with hard clips if (h.hard) { // the following does not work with hard clipped alignments
buf2.push(t); buf2.push(t);
continue; continue;
} }
hits.push(h); var hits = [h];
// parse hits in the XA tag // parse hits in the XA tag
for (var i = 0; i < XA_strs.length; ++i) if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) {
if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string var XA_strs = m[1].split(";");
hits.push(parse_hit(XA_strs[i].split(","), opt)); for (var i = 0; i < XA_strs.length; ++i)
if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
hits.push(parse_hit(XA_strs[i].split(","), opt));
}
// check if there are ALT hits
var has_alt = false;
for (var i = 0; i < hits.length; ++i)
if (weight_alt[hits[i].ctg] != null) {
has_alt = true;
break;
}
if (!has_alt) {
buf2.push(t);
continue;
}
// lift mapping positions to the primary assembly // lift mapping positions to the primary assembly
var n_lifted = 0, n_rpt_lifted = 0; var n_rpt_lifted = 0, rpt_lifted = null;
for (var i = 0; i < hits.length; ++i) { for (var i = 0; i < hits.length; ++i) {
var h = hits[i]; var a, h = hits[i];
if (idx[h.ctg] == null) continue; if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0)
var a = idx[h.ctg](h.start, h.end); continue;
if (a == null || a.length == 0) continue;
// find the approximate position on the primary assembly // find the approximate position on the primary assembly
var lifted = []; var lifted = [];
@ -333,36 +367,46 @@ function bwa_postalt(args)
lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]); lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
if (i == 0) ++n_rpt_lifted; if (i == 0) ++n_rpt_lifted;
} }
if (lifted.length) ++n_lifted, hits[i].lifted = lifted; if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0);
} if (lifted.length) hits[i].lifted = lifted;
if (n_lifted == 0) {
buf2.push(t);
continue;
} }
// group hits based on the lifted positions on the primary assembly // prepare for hits grouping
for (var i = 0; i < hits.length; ++i) { // set keys for sorting for (var i = 0; i < hits.length; ++i) { // set keys for sorting
if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used
hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3]; hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end; else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
hits[i].i = i; // keep the original index hits[i].i = i; // keep the original index
} }
hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
var last_chr = null, end = 0, g = -1; // group hits based on the lifted positions on non-ALT sequences
for (var i = 0; i < hits.length; ++i) { if (hits.length > 1) {
if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0; hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
else if (hits[i].pstart >= end) ++g; var last_chr = null, end = 0, g = -1;
hits[i].g = g; for (var i = 0; i < hits.length; ++i) {
end = end > hits[i].pend? end : hits[i].pend; if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
else if (hits[i].pstart >= end) ++g;
hits[i].g = g;
end = end > hits[i].pend? end : hits[i].pend;
}
} else hits[0].g = 0;
// find the index and group id of the reported hit; find the size of the reported group
var reported_g = null, reported_i = null, n_group0 = 0;
if (hits.length > 1) {
for (var i = 0; i < hits.length; ++i)
if (hits[i].i == 0)
reported_g = hits[i].g, reported_i = i;
for (var i = 0; i < hits.length; ++i)
if (hits[i].g == reported_g)
++n_group0;
} else {
if (weight_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
buf2.push(t);
continue;
}
reported_g = reported_i = 0, n_group0 = 1;
} }
var reported_g = null, reported_i = null;
for (var i = 0; i < hits.length; ++i)
if (hits[i].i == 0)
reported_g = hits[i].g, reported_i = i;
var n_group0 = 0; // #hits overlapping the reported hit
for (var i = 0; i < hits.length; ++i)
if (hits[i].g == reported_g)
++n_group0;
// re-estimate mapping quality if necessary // re-estimate mapping quality if necessary
var mapQ, ori_mapQ = t[4]; var mapQ, ori_mapQ = t[4];
@ -379,12 +423,65 @@ function bwa_postalt(args)
mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]); mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
} else mapQ = 0; } else mapQ = 0;
mapQ = mapQ < 60? mapQ : 60; mapQ = mapQ < 60? mapQ : 60;
mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ; if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ;
else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
} else mapQ = t[4]; } else mapQ = t[4];
// ALT genotyping
if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) {
// collect all overlapping ALT contigs
var hits2 = [];
for (var i = 0; i < hits.length; ++i) {
var h = hits[i];
if (h.g == reported_g)
hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score, h.NM]);
}
var start = hits2[0][1], end = hits2[0][2];
for (var i = 1; i < hits2.length; ++i)
end = end > hits2[i][2]? end : hits2[i][2];
var alts = {};
for (var i = 0; i < hits2.length; ++i)
if (weight_alt[hits2[i][3]] != null)
alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]];
if (idx_pri[hits2[0][0]] != null) { // add other unreported hits
var ovlp = idx_pri[hits2[0][0]](start, end);
for (var i = 0; i < ovlp.length; ++i)
if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null)
alts[ovlp[i][2]] = [0, 0];
}
// add weight to each ALT contig
var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30, max_nm = -1;
for (var ctg in alts)
alt_arr.push([ctg, alts[ctg][0], 0, alts[ctg][1]]);
for (var i = 0; i < alt_arr.length; ++i) {
if (max_sc < alt_arr[i][1])
max_sc = alt_arr[i][1], max_i = i;
min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1];
var nm = alt_arr[i][1] > 0? alt_arr[i][3] : opt.max_nm_sc;
max_nm = max_nm > nm? max_nm : nm;
}
if (max_nm > opt.max_nm_sc) max_nm = opt.max_nm_sc;
if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) {
for (var i = 0; i < alt_arr.length; ++i)
sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc)));
for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum;
for (var i = 0; i < alt_arr.length; ++i) {
var e = [alt_arr[i][0], 1, alt_arr[max_i][2], alt_arr[i][2], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]];
var w = weight_alt[e[0]];
for (var j = 0; j < 6; ++j) w[j] += e[j+1];
if (fp_evi) {
e[2] = e[2].toFixed(3); e[3] = e[3].toFixed(3);
e[5] = e[5].toFixed(3); e[6] = e[6].toFixed(3);
fp_evi.write(t[0] + '/' + (t[1]>>6&3) + '\t' + e.join("\t") + '\n');
}
}
}
}
// check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality // check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality
if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) { if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) {
var l = lifted[0]; var l = rpt_lifted;
for (var i = 0; i < buf2.length; ++i) { for (var i = 0; i < buf2.length; ++i) {
var s = buf2[i]; var s = buf2[i];
if (l[0] != s[2]) continue; // different chr if (l[0] != s[2]) continue; // different chr
@ -423,6 +520,7 @@ function bwa_postalt(args)
need_rev = true; need_rev = true;
} }
if (need_rev) { // reverse and reverse complement if (need_rev) { // reverse and reverse complement
aux.length = 0;
aux.set(t[9], 0); aux.revcomp(); rs = aux.toString(); aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
aux.set(t[10],0); aux.reverse(); rq = aux.toString(); aux.set(t[10],0); aux.reverse(); rq = aux.toString();
} }
@ -438,14 +536,16 @@ function bwa_postalt(args)
for (var i = 0; i < hits.length; ++i) { for (var i = 0; i < hits.length; ++i) {
if (opt.verbose >= 5) print(obj2str(hits[i])); if (opt.verbose >= 5) print(obj2str(hits[i]));
if (hits[i].g != reported_g || i == reported_i) continue; if (hits[i].g != reported_g || i == reported_i) continue;
if (!opt.show_pri && idx[hits[i].ctg] == null) continue; if (!opt.show_pri && idx_alt[hits[i].ctg] == null) continue;
var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0]; var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0];
// update name // print sequence/quality and set the rev flag
if (flag&0x40) s[0] += "/1"; if (hits[i].rev == hits[reported_i].rev) {
if (flag&0x80) s[0] += "/2"; s.push(t[9], t[10]);
s[0] += "_" + (++cnt); s[1] = flag | 0x800;
if (hits[i].rev == hits[reported_i].rev) s.push(t[9], t[10]); } else {
else s.push(rs, rq); s.push(rs, rq);
s[1] = (flag ^ 0x10) | 0x800;
}
s.push("NM:i:" + hits[i].NM); s.push("NM:i:" + hits[i].NM);
if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str); if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
buf2.push(s); buf2.push(s);
@ -454,9 +554,30 @@ function bwa_postalt(args)
for (var i = 0; i < buf2.length; ++i) for (var i = 0; i < buf2.length; ++i)
print(buf2[i].join("\t")); print(buf2[i].join("\t"));
file.close(); file.close();
if (fp_evi != null) fp_evi.close();
aux.destroy();
buf.destroy(); buf.destroy();
aux.destroy();
// print weight of each contig
if (opt.pre != null) {
var fpout = new File(opt.pre + '.ctw', "w");
var weight_arr = [];
for (var ctg in weight_alt) {
var w = weight_alt[ctg];
weight_arr.push([ctg, w[6], w[7], w[8],
w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000',
w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']);
}
weight_arr.sort(function(a,b) {
return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0;
});
for (var i = 0; i < weight_arr.length; ++i) {
if (weight_arr[i][1] == '~') weight_arr[i][1] = '*';
fpout.write(weight_arr[i].join("\t") + '\n');
}
fpout.close();
}
} }
bwa_postalt(arguments); bwa_postalt(arguments);

89
bwa.c
View File

@ -227,7 +227,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
return bwt; return bwt;
} }
bwaidx_t *bwa_idx_load(const char *hint, int which) bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
{ {
bwaidx_t *idx; bwaidx_t *idx;
char *prefix; char *prefix;
@ -239,7 +239,12 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
idx = calloc(1, sizeof(bwaidx_t)); idx = calloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
if (which & BWA_IDX_BNS) { if (which & BWA_IDX_BNS) {
int i, c;
idx->bns = bns_restore(prefix); idx->bns = bns_restore(prefix);
for (i = c = 0; i < idx->bns->n_seqs; ++i)
if (idx->bns->anns[i].is_alt) ++c;
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
if (which & BWA_IDX_PAC) { if (which & BWA_IDX_PAC) {
idx->pac = calloc(idx->bns->l_pac/4+1, 1); idx->pac = calloc(idx->bns->l_pac/4+1, 1);
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
@ -251,15 +256,91 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
return idx; return idx;
} }
bwaidx_t *bwa_idx_load(const char *hint, int which)
{
return bwa_idx_load_from_disk(hint, which);
}
void bwa_idx_destroy(bwaidx_t *idx) void bwa_idx_destroy(bwaidx_t *idx)
{ {
if (idx == 0) return; if (idx == 0) return;
if (idx->bwt) bwt_destroy(idx->bwt); if (idx->mem == 0) {
if (idx->bns) bns_destroy(idx->bns); if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->pac) free(idx->pac); if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
} else {
free(idx->bwt); free(idx->bns->anns); free(idx->bns);
if (!idx->is_shm) free(idx->mem);
}
free(idx); free(idx);
} }
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
{
int64_t k = 0, x;
int i;
// generate idx->bwt
x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
// generate idx->bns and idx->pac
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x;
for (i = 0; i < idx->bns->n_seqs; ++i) {
idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
}
idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1;
assert(k == l_mem);
idx->l_mem = k; idx->mem = mem;
return 0;
}
int bwa_idx2mem(bwaidx_t *idx)
{
int i;
int64_t k, x, tmp;
uint8_t *mem;
// copy idx->bwt
x = idx->bwt->bwt_size * 4;
mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
memmove(mem + sizeof(bwt_t), mem, x);
memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
free(idx->bwt->sa);
free(idx->bwt); idx->bwt = 0;
// copy idx->bns
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
free(idx->bns->ambs);
x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
for (i = 0; i < idx->bns->n_seqs; ++i) {
x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
}
free(idx->bns->anns);
// copy idx->pac
x = idx->bns->l_pac/4+1;
mem = realloc(mem, k + x);
memcpy(mem + k, idx->pac, x); k += x;
free(idx->bns); idx->bns = 0;
free(idx->pac); idx->pac = 0;
return bwa_mem2idx(k, mem, idx);
}
/*********************** /***********************
* SAM header routines * * SAM header routines *
***********************/ ***********************/

10
bwa.h
View File

@ -10,10 +10,16 @@
#define BWA_IDX_PAC 0x4 #define BWA_IDX_PAC 0x4
#define BWA_IDX_ALL 0x7 #define BWA_IDX_ALL 0x7
#define BWA_CTL_SIZE 0x10000
typedef struct { typedef struct {
bwt_t *bwt; // FM-index bwt_t *bwt; // FM-index
bntseq_t *bns; // information on the reference sequences bntseq_t *bns; // information on the reference sequences
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
int is_shm;
int64_t l_mem;
uint8_t *mem;
} bwaidx_t; } bwaidx_t;
typedef struct { typedef struct {
@ -37,8 +43,12 @@ extern "C" {
char *bwa_idx_infer_prefix(const char *hint); char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint);
bwaidx_t *bwa_idx_load_from_shm(const char *hint);
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
bwaidx_t *bwa_idx_load(const char *hint, int which); bwaidx_t *bwa_idx_load(const char *hint, int which);
void bwa_idx_destroy(bwaidx_t *idx); void bwa_idx_destroy(bwaidx_t *idx);
int bwa_idx2mem(bwaidx_t *idx);
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
char *bwa_set_rg(const char *s); char *bwa_set_rg(const char *s);

View File

@ -73,7 +73,7 @@ mem_opt_t *mem_opt_init()
o->chunk_size = 10000000; o->chunk_size = 10000000;
o->n_threads = 1; o->n_threads = 1;
o->max_XA_hits = 5; o->max_XA_hits = 5;
o->max_XA_hits_alt = 50; o->max_XA_hits_alt = 200;
o->max_matesw = 50; o->max_matesw = 50;
o->mask_level_redun = 0.95; o->mask_level_redun = 0.95;
o->min_chain_weight = 0; o->min_chain_weight = 0;
@ -512,38 +512,38 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *
int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
{ {
int i, j, n_pri; int i, n_pri;
int_v z = {0,0,0}; int_v z = {0,0,0};
if (n == 0) return 0; if (n == 0) return 0;
for (i = n_pri = 0; i < n; ++i) { for (i = n_pri = 0; i < n; ++i) {
a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_alt = -1, a[i].hash = hash_64(id+i); a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i);
if (!a[i].is_alt) ++n_pri; if (!a[i].is_alt) ++n_pri;
} }
ks_introsort(mem_ars_hash, n, a); ks_introsort(mem_ars_hash, n, a);
mem_mark_primary_se_core(opt, n, a, &z); mem_mark_primary_se_core(opt, n, a, &z);
for (i = 0; i < n; ++i) { for (i = 0; i < n; ++i) {
mem_alnreg_t *p = &a[i]; mem_alnreg_t *p = &a[i];
p->secondary_alt = i; // keep the rank in the first round p->secondary_all = i; // keep the rank in the first round
if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt) if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
p->alt_sc = a[p->secondary].score; p->alt_sc = a[p->secondary].score;
} }
if (n_pri >= 0 && n_pri < n) { if (n_pri >= 0 && n_pri < n) {
kv_resize(int, z, n); kv_resize(int, z, n);
if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a); if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
for (i = 0; i < n; ++i) z.a[a[i].secondary_alt] = i; for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i;
for (i = 0; i < n; ++i) { for (i = 0; i < n; ++i) {
if (a[i].secondary < 0) { if (a[i].secondary >= 0) {
a[i].secondary_alt = -1; a[i].secondary_all = z.a[a[i].secondary];
continue; if (a[i].is_alt) a[i].secondary = INT_MAX;
} } else a[i].secondary_all = -1;
j = z.a[a[i].secondary];
a[i].secondary_alt = a[j].is_alt? j : -1;
if (a[i].is_alt) a[i].secondary = INT_MAX;
} }
if (n_pri > 0) { // mark primary for hits to the primary assembly only if (n_pri > 0) { // mark primary for hits to the primary assembly only
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
mem_mark_primary_se_core(opt, n_pri, a, &z); mem_mark_primary_se_core(opt, n_pri, a, &z);
} }
} else {
for (i = 0; i < n; ++i)
a[i].secondary_all = a[i].secondary;
} }
free(z.a); free(z.a);
return n_pri; return n_pri;

View File

@ -69,7 +69,7 @@ typedef struct {
int w; // actual band width used in extension int w; // actual band width used in extension
int seedcov; // length of regions coverged by seeds int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary int secondary; // index of the parent hit shadowing the current hit; <0 if primary
int secondary_alt; int secondary_all;
int seedlen0; // length of the starting seed int seedlen0; // length of the starting seed
int n_comp:30, is_alt:2; // number of sub-alignments chained together int n_comp:30, is_alt:2; // number of sub-alignments chained together
float frac_rep; float frac_rep;

View File

@ -112,34 +112,35 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
s->sam = str.s; s->sam = str.s;
} }
static inline void get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i, int r[2]) static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
{ {
int j = a[i].secondary, k = a[i].secondary_alt; int k = a[i].secondary_all;
r[0] = r[1] = -1; if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k;
if (j >= 0 && j < INT_MAX && !a[j].is_alt && !a[i].is_alt && a[i].score >= a[j].score * XA_drop_ratio) r[0] = j; return -1;
if (k >= 0 && a[k].is_alt && a[i].score >= a[k].score * XA_drop_ratio) r[1] = k;
} }
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. // Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
{ {
int i, k, *cnt, tot, r[2]; int i, k, r, *cnt, tot;
kstring_t *aln = 0, str = {0,0,0}; kstring_t *aln = 0, str = {0,0,0};
char **XA = 0; char **XA = 0, *has_alt;
cnt = calloc(a->n, sizeof(int)); cnt = calloc(a->n, sizeof(int));
has_alt = calloc(a->n, 1);
for (i = 0, tot = 0; i < a->n; ++i) { for (i = 0, tot = 0; i < a->n; ++i) {
get_pri_idx(opt->XA_drop_ratio, a->a, i, r); r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
if (r[0] >= 0) ++cnt[r[0]], ++tot; if (r >= 0) {
if (r[1] >= 0) ++cnt[r[1]], ++tot; ++cnt[r], ++tot;
if (a->a[i].is_alt) has_alt[r] = 1;
}
} }
if (tot == 0) goto end_gen_alt; if (tot == 0) goto end_gen_alt;
aln = calloc(a->n, sizeof(kstring_t)); aln = calloc(a->n, sizeof(kstring_t));
for (i = 0; i < a->n; ++i) { for (i = 0; i < a->n; ++i) {
mem_aln_t t; mem_aln_t t;
get_pri_idx(opt->XA_drop_ratio, a->a, i, r); if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
if (r[0] < 0 && r[1] < 0) continue; if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
if ((r[0] >= 0 && cnt[r[0]] > opt->max_XA_hits) || (r[1] >= 0 && cnt[r[1]] > opt->max_XA_hits_alt)) continue;
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
str.l = 0; str.l = 0;
kputs(bns->anns[t.rid].name, &str); kputs(bns->anns[t.rid].name, &str);
@ -152,14 +153,13 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
kputc(',', &str); kputw(t.NM, &str); kputc(',', &str); kputw(t.NM, &str);
kputc(';', &str); kputc(';', &str);
free(t.cigar); free(t.cigar);
if (r[0] >= 0 && cnt[r[0]] <= opt->max_XA_hits) kputsn(str.s, str.l, &aln[r[0]]); kputsn(str.s, str.l, &aln[r]);
if (r[1] >= 0 && cnt[r[1]] <= opt->max_XA_hits_alt) kputsn(str.s, str.l, &aln[r[1]]);
} }
XA = calloc(a->n, sizeof(char*)); XA = calloc(a->n, sizeof(char*));
for (k = 0; k < a->n; ++k) for (k = 0; k < a->n; ++k)
XA[k] = aln[k].s; XA[k] = aln[k].s;
end_gen_alt: end_gen_alt:
free(cnt); free(aln); free(str.s); free(has_alt); free(cnt); free(aln); free(str.s);
return XA; return XA;
} }

200
bwashm.c 100644
View File

@ -0,0 +1,200 @@
#include <sys/types.h>
#include <sys/mman.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include "bwa.h"
int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
{
const char *name;
uint8_t *shm, *shm_idx;
uint16_t *cnt, i;
int shmid, to_init = 0, l;
char path[PATH_MAX + 1], *p, *tmpfn = (char*)_tmpfn;
if (hint == 0 || hint[0] == 0) return -1;
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
++name;
if ((shmid = shm_open("/bwactl", O_RDWR, 0444)) < 0) {
shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
to_init = 1;
}
if (shmid < 0) return -1;
ftruncate(shmid, BWA_CTL_SIZE);
shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
cnt = (uint16_t*)shm;
if (to_init) {
memset(shm, 0, BWA_CTL_SIZE);
cnt[1] = 4;
} else {
for (i = 0, p = (char*)shm + 4; i < cnt[0]; ++i) {
if (strcmp(p + 8, name) == 0) break;
p += 9 + strlen(p + 8);
}
if (i < cnt[0]) {
fprintf(stderr, "[W::%s] index '%s' is already in shared memory\n", __func__, name);
return -1;
}
}
if (idx->mem == 0) bwa_idx2mem(idx);
if (tmpfn) {
FILE *fp;
if ((fp = fopen(tmpfn, "wb")) != 0) {
int64_t rest = idx->l_mem;
while (rest > 0) {
int64_t l = rest < 0x1000000? rest : 0x1000000;
rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp);
}
fclose(fp);
free(idx->mem); idx->mem = 0;
} else {
fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__);
tmpfn = 0;
}
}
strcat(strcpy(path, "/bwaidx-"), name);
l = 8 + strlen(name) + 1;
if (cnt[1] + l > BWA_CTL_SIZE) return -1;
memcpy(shm + cnt[1], &idx->l_mem, 8);
memcpy(shm + cnt[1] + 8, name, l - 8);
if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
shm_unlink(path);
perror("shm_open()");
return -1;
}
cnt[1] += l; ++cnt[0];
ftruncate(shmid, idx->l_mem);
shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
if (tmpfn) {
FILE *fp;
fp = fopen(tmpfn, "rb");
int64_t rest = idx->l_mem;
while (rest > 0) {
int64_t l = rest < 0x1000000? rest : 0x1000000;
rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp);
}
fclose(fp);
unlink(tmpfn);
} else {
memcpy(shm_idx, idx->mem, idx->l_mem);
free(idx->mem);
}
bwa_mem2idx(idx->l_mem, shm_idx, idx);
idx->is_shm = 1;
return 0;
}
bwaidx_t *bwa_idx_load_from_shm(const char *hint)
{
const char *name;
uint8_t *shm, *shm_idx;
uint16_t *cnt, i;
char *p, path[PATH_MAX + 1];
int shmid;
int64_t l_mem;
bwaidx_t *idx;
if (hint == 0 || hint[0] == 0) return 0;
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
++name;
if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return 0;
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
cnt = (uint16_t*)shm;
if (cnt[0] == 0) return 0;
for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) {
memcpy(&l_mem, p, 8); p += 8;
if (strcmp(p, name) == 0) break;
p += strlen(p) + 1;
}
if (i == cnt[0]) return 0;
strcat(strcpy(path, "/bwaidx-"), name);
if ((shmid = shm_open(path, O_RDONLY, 0444)) < 0) return 0;
shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0);
idx = calloc(1, sizeof(bwaidx_t));
bwa_mem2idx(l_mem, shm_idx, idx);
idx->is_shm = 1;
return idx;
}
int bwa_shm_list(void)
{
int shmid;
uint16_t *cnt, i;
char *p, *shm;
if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1;
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
cnt = (uint16_t*)shm;
for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
int64_t l_mem;
memcpy(&l_mem, p, 8); p += 8;
printf("%s\t%ld\n", p, (long)l_mem);
p += strlen(p) + 1;
}
return 0;
}
int bwa_shm_destroy(void)
{
int shmid;
uint16_t *cnt, i;
char *p, *shm;
char path[PATH_MAX + 1];
if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1;
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
cnt = (uint16_t*)shm;
for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
int64_t l_mem;
memcpy(&l_mem, p, 8); p += 8;
strcat(strcpy(path, "/bwaidx-"), p);
shm_unlink(path);
p += strlen(p) + 1;
}
munmap(shm, BWA_CTL_SIZE);
shm_unlink("/bwactl");
return 0;
}
int main_shm(int argc, char *argv[])
{
int c, to_list = 0, to_drop = 0, ret = 0;
char *tmpfn = 0;
while ((c = getopt(argc, argv, "ldf:")) >= 0) {
if (c == 'l') to_list = 1;
else if (c == 'd') to_drop = 1;
else if (c == 'f') tmpfn = optarg;
}
if (optind == argc && !to_list && !to_drop) {
fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
fprintf(stderr, " -l list names of indices in shared memory\n");
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
return 1;
}
if (optind < argc && (to_list || to_drop)) {
fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__);
return 1;
}
if (optind < argc) {
bwaidx_t *idx;
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
ret = 1;
}
bwa_idx_destroy(idx);
}
if (to_list) bwa_shm_list();
if (to_drop) bwa_shm_destroy();
return ret;
}

View File

@ -230,7 +230,11 @@ int main_mem(int argc, char *argv[])
} else update_a(opt, &opt0); } else update_a(opt, &opt0);
bwa_fill_scmat(opt->a, opt->b, opt->mat); bwa_fill_scmat(opt->a, opt->b, opt->mat);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak idx = bwa_idx_load_from_shm(argv[optind]);
if (idx == 0) {
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
} else if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
if (ignore_alt) if (ignore_alt)
for (i = 0; i < idx->bns->n_seqs; ++i) for (i = 0; i < idx->bns->n_seqs; ++i)
idx->bns->anns[i].is_alt = 0; idx->bns->anns[i].is_alt = 0;

5
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h" #include "utils.h"
#ifndef PACKAGE_VERSION #ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r876-dirty" #define PACKAGE_VERSION "0.7.10-r907-dirty"
#endif #endif
int bwa_fa2pac(int argc, char *argv[]); int bwa_fa2pac(int argc, char *argv[]);
@ -22,6 +22,7 @@ int bwa_bwtsw2(int argc, char *argv[]);
int main_fastmap(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]);
int main_mem(int argc, char *argv[]); int main_mem(int argc, char *argv[]);
int main_shm(int argc, char *argv[]);
int main_pemerge(int argc, char *argv[]); int main_pemerge(int argc, char *argv[]);
@ -43,6 +44,7 @@ static int usage()
fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " sampe generate alignment (paired ended)\n");
fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, " shm manage indices in shared memory\n");
fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
fprintf(stderr, " pac2bwt generate BWT from PAC\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n");
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
@ -81,6 +83,7 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
else { else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);