Merge branch 'dev'
This commit is contained in:
commit
44a185f2c1
8
Makefile
8
Makefile
|
|
@ -4,7 +4,7 @@ CFLAGS= -g -Wall -Wno-unused-function -O2
|
||||||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||||
AR= ar
|
AR= ar
|
||||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
||||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
|
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwashm.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
|
||||||
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||||
is.o bwtindex.o bwape.o kopen.o pemerge.o \
|
is.o bwtindex.o bwape.o kopen.o pemerge.o \
|
||||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||||
|
|
@ -14,6 +14,10 @@ INCLUDES=
|
||||||
LIBS= -lm -lz -lpthread
|
LIBS= -lm -lz -lpthread
|
||||||
SUBDIRS= .
|
SUBDIRS= .
|
||||||
|
|
||||||
|
ifeq ($(shell uname -s),Linux)
|
||||||
|
LIBS += -lrt
|
||||||
|
endif
|
||||||
|
|
||||||
.SUFFIXES:.c .o .cc
|
.SUFFIXES:.c .o .cc
|
||||||
|
|
||||||
.c.o:
|
.c.o:
|
||||||
|
|
@ -52,6 +56,7 @@ bwape.o: ksw.h khash.h
|
||||||
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
|
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
|
||||||
bwase.o: bwa.h ksw.h
|
bwase.o: bwa.h ksw.h
|
||||||
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
|
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
|
||||||
|
bwashm.o: bwa.h bntseq.h bwt.h
|
||||||
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
|
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
|
||||||
bwt_gen.o: QSufSort.h malloc_wrap.h
|
bwt_gen.o: QSufSort.h malloc_wrap.h
|
||||||
bwt_lite.o: bwt_lite.h malloc_wrap.h
|
bwt_lite.o: bwt_lite.h malloc_wrap.h
|
||||||
|
|
@ -66,7 +71,6 @@ bwtsw2_core.o: khash.h ksort.h
|
||||||
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
|
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
|
||||||
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
|
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
|
||||||
bwtsw2_pair.o: malloc_wrap.h ksw.h
|
bwtsw2_pair.o: malloc_wrap.h ksw.h
|
||||||
cutvar.o: bwa.h bntseq.h bwt.h kvec.h malloc_wrap.h
|
|
||||||
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
|
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
|
||||||
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
|
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
|
||||||
is.o: malloc_wrap.h
|
is.o: malloc_wrap.h
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,123 @@
|
||||||
|
## Getting Started
|
||||||
|
|
||||||
|
Since version 0.7.11, BWA-MEM supports read mapping against a reference genome
|
||||||
|
with long alternative haplotypes present in separate ALT contigs. To use the
|
||||||
|
ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the
|
||||||
|
SAM format and rename the file to ""*idxbase*.alt". For GRCh38, this alignment is
|
||||||
|
available from the [BWA resource bundle for GRCh38][res].
|
||||||
|
|
||||||
|
#### Option 1: Mapping to the official GRCh38 with ALT contigs
|
||||||
|
|
||||||
|
Construct the index:
|
||||||
|
```sh
|
||||||
|
wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
|
||||||
|
gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
|
||||||
|
mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa
|
||||||
|
bwa index hs38a.fa
|
||||||
|
cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt
|
||||||
|
```
|
||||||
|
|
||||||
|
Perform mapping:
|
||||||
|
```sh
|
||||||
|
bwa mem hs38a.fa read1.fq read2.fq \
|
||||||
|
| samblaster \
|
||||||
|
| bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \
|
||||||
|
| samtools view -bS - > aln.unsrt.bam
|
||||||
|
```
|
||||||
|
For short reads, the postprocessing script `bwa-postalt.js` runs at about the
|
||||||
|
same speed as BAM compression.
|
||||||
|
|
||||||
|
#### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes
|
||||||
|
|
||||||
|
Construct the index:
|
||||||
|
```sh
|
||||||
|
cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa
|
||||||
|
bwa index hs38d4.fa
|
||||||
|
cp bwa-hs38-res/hs38d4.fa.alt .
|
||||||
|
```
|
||||||
|
Perform mapping:
|
||||||
|
```sh
|
||||||
|
bwa mem -g.8 hs38d4.fa read1.fq read2.fq \
|
||||||
|
| samblaster \
|
||||||
|
| bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
|
||||||
|
| samtools view -bS - > aln.unsrt.bam
|
||||||
|
```
|
||||||
|
This command line generates `postinfo.ctw` which loosely evaluates the presence
|
||||||
|
of an ALT contig with an empirical score at the last column.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions.
|
||||||
|
However, sequences that are highly diverged from the primary assembly only
|
||||||
|
contribute a few million bp. Most subsequences of ALT contigs are highly similar
|
||||||
|
or identical to the primary assembly. If we align sequence reads to GRCh38+ALT
|
||||||
|
treating ALT equal to the primary assembly, we will get many reads with zero
|
||||||
|
mapping quality and lose variants on them. It is crucial to make the mapper
|
||||||
|
aware of ALTs.
|
||||||
|
|
||||||
|
BWA-MEM is designed to minimize the interference of ALT contigs such that on the
|
||||||
|
primary assembly, the ALT-aware alignment is highly similar to the alignment
|
||||||
|
without using ALT contigs in the index. This design choice makes it almost
|
||||||
|
always safe to map reads to GRCh38+ALT. Although we don't know yet how much
|
||||||
|
variations on ALT contigs contribute to phenotypes, we would not get the answer
|
||||||
|
without mapping large cohorts to these extra sequences. We hope our current
|
||||||
|
implementation encourages researchers to use ALT contigs soon and often.
|
||||||
|
|
||||||
|
## Methods
|
||||||
|
|
||||||
|
As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
|
||||||
|
postprocessing.
|
||||||
|
|
||||||
|
#### Step 1: BWA-MEM mapping
|
||||||
|
|
||||||
|
At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring
|
||||||
|
the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
|
||||||
|
depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
|
||||||
|
alignments and assigns mapQ following these two rules:
|
||||||
|
|
||||||
|
* The original mapQ of a non-ALT hit is computed across non-ALT hits only.
|
||||||
|
The reported mapQ of an ALT hit is computed across all hits.
|
||||||
|
|
||||||
|
* An ALT hit is only reported if its score is strictly better than all
|
||||||
|
overlapping non-ALT hits. A reported ALT hit is flagged with 0x800
|
||||||
|
(supplementary) unless there are no non-ALT hits.
|
||||||
|
|
||||||
|
When option `-g FLOAT` is in use (which is the default), a third rule kicks in:
|
||||||
|
|
||||||
|
* The mapQ of a non-ALT hit is reduced to zero if its score is less than FLOAT
|
||||||
|
times the score of an overlapping ALT hit. In this case, the original mapQ is
|
||||||
|
moved to the `om` tag.
|
||||||
|
|
||||||
|
If we don't care about ALT hits, we may actually skip postprocessing (step 2).
|
||||||
|
Nonetheless, postprocessing is recommended as it improves mapQ and gives more
|
||||||
|
information about ALT hits.
|
||||||
|
|
||||||
|
#### Step 2: Postprocessing
|
||||||
|
|
||||||
|
Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
|
||||||
|
potential hits reported in the XA tag, lifts ALT hits to the chromosomal
|
||||||
|
positions using the ALT-to-ref alignment, groups them after lifting and then
|
||||||
|
reassigns mapQ based on the best scoring hit in each group with all the hits in
|
||||||
|
a group get the same mapQ. Knowing the ALT-to-ref alignment, this script can
|
||||||
|
greatly improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits.
|
||||||
|
|
||||||
|
The script also measures the presence of each ALT contig. For a group of
|
||||||
|
overlapping ALT contigs c_1, ..., c_m, the weight for c_k equals `\frac{\sum_j
|
||||||
|
P(c_k|r_j)}{\sum_j\max_i P(c_i|r_j)}`, where `P(c_k|r)=\frac{pow(4,s_k)}{\sum_i
|
||||||
|
pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a
|
||||||
|
Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the
|
||||||
|
option 2 above.
|
||||||
|
|
||||||
|
## Problems and Future Development
|
||||||
|
|
||||||
|
There are some uncertainties about ALT mappings - we are not sure whether they
|
||||||
|
help biological discovery and don't know the best way to analyze them. Without
|
||||||
|
clear demand from downstream analyses, it is very difficult to design the
|
||||||
|
optimal mapping strategy. The current BWA-MEM method is just a start. If it
|
||||||
|
turns out to be useful in research, we will probably rewrite bwa-postalt.js in C
|
||||||
|
for performance; if not, we will try new designs. It is also possible that we
|
||||||
|
may make breakthrough on the representation of multiple genomes, in which case,
|
||||||
|
we can even get rid of ALT contigs once for all.
|
||||||
|
|
||||||
|
[res]: https://sourceforge.net/projects/bio-bwa/files/
|
||||||
|
[sb]: https://github.com/GregoryFaust/samblaster
|
||||||
14
README.md
14
README.md
|
|
@ -63,7 +63,6 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
|
||||||
3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
|
3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
|
||||||
4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0)
|
4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0)
|
||||||
5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref)
|
5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref)
|
||||||
6. [How to map sequences to GRCh38 with ALT contigs?](#h38)
|
|
||||||
|
|
||||||
####<a name="type"></a>1. What types of data does BWA work with?
|
####<a name="type"></a>1. What types of data does BWA work with?
|
||||||
|
|
||||||
|
|
@ -72,11 +71,11 @@ algorithm and setting may vary. The following list gives the recommended
|
||||||
settings:
|
settings:
|
||||||
|
|
||||||
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
|
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
|
||||||
contigs up to a few megabases mapped to a close related reference genome:
|
contigs up to a few megabases mapped to a closely related reference genome:
|
||||||
|
|
||||||
bwa mem ref.fa reads.fq > aln.sam
|
bwa mem ref.fa reads.fq > aln.sam
|
||||||
|
|
||||||
* Illumina single-end reads no longer than ~70bp:
|
* Illumina single-end reads shorter than ~70bp:
|
||||||
|
|
||||||
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
|
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
|
||||||
|
|
||||||
|
|
@ -84,20 +83,21 @@ settings:
|
||||||
|
|
||||||
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
|
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
|
||||||
|
|
||||||
* Illumina paired-end reads no longer than ~70bp:
|
* Illumina paired-end reads shorter than ~70bp:
|
||||||
|
|
||||||
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
|
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
|
||||||
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
|
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
|
||||||
|
|
||||||
* PacBio subreads to a reference genome:
|
* PacBio subreads or Oxford Nanopore reads to a reference genome:
|
||||||
|
|
||||||
bwa mem -x pacbio ref.fa reads.fq > aln.sam
|
bwa mem -x pacbio ref.fa reads.fq > aln.sam
|
||||||
|
bwa mem -x ont2d ref.fa reads.fq > aln.sam
|
||||||
|
|
||||||
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
|
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
|
||||||
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
|
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
|
||||||
errors given longer query sequences as the chance of missing all seeds is small.
|
errors given longer query sequences as the chance of missing all seeds is small.
|
||||||
As is shown above, with non-default settings, BWA-MEM works with PacBio subreads
|
As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
|
||||||
with a sequencing error rate as high as ~15%.
|
reads with a sequencing error rate over 20%.
|
||||||
|
|
||||||
####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
|
####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
|
||||||
|
|
||||||
|
|
|
||||||
23
bntseq.c
23
bntseq.c
|
|
@ -179,17 +179,25 @@ bntseq_t *bns_restore(const char *prefix)
|
||||||
if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
|
if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
|
||||||
char str[1024];
|
char str[1024];
|
||||||
khash_t(str) *h;
|
khash_t(str) *h;
|
||||||
int i, absent;
|
int c, i, absent;
|
||||||
khint_t k;
|
khint_t k;
|
||||||
h = kh_init(str);
|
h = kh_init(str);
|
||||||
for (i = 0; i < bns->n_seqs; ++i) {
|
for (i = 0; i < bns->n_seqs; ++i) {
|
||||||
k = kh_put(str, h, bns->anns[i].name, &absent);
|
k = kh_put(str, h, bns->anns[i].name, &absent);
|
||||||
kh_val(h, k) = i;
|
kh_val(h, k) = i;
|
||||||
}
|
}
|
||||||
while (fscanf(fp, "%s", str) == 1) {
|
i = 0;
|
||||||
k = kh_get(str, h, str);
|
while ((c = fgetc(fp)) != EOF) {
|
||||||
if (k != kh_end(h))
|
if (c == '\t' || c == '\n' || c == '\r') {
|
||||||
bns->anns[kh_val(h, k)].is_alt = 1;
|
str[i] = 0;
|
||||||
|
if (str[0] != '@') {
|
||||||
|
k = kh_get(str, h, str);
|
||||||
|
if (k != kh_end(h))
|
||||||
|
bns->anns[kh_val(h, k)].is_alt = 1;
|
||||||
|
}
|
||||||
|
while (c != '\n' && c != EOF) c = fgetc(fp);
|
||||||
|
i = 0;
|
||||||
|
} else str[i++] = c; // FIXME: potential segfault here
|
||||||
}
|
}
|
||||||
kh_destroy(str, h);
|
kh_destroy(str, h);
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
|
|
@ -226,7 +234,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_
|
||||||
}
|
}
|
||||||
p = bns->anns + bns->n_seqs;
|
p = bns->anns + bns->n_seqs;
|
||||||
p->name = strdup((char*)seq->name.s);
|
p->name = strdup((char*)seq->name.s);
|
||||||
p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
|
p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
|
||||||
p->gi = 0; p->len = seq->seq.l;
|
p->gi = 0; p->len = seq->seq.l;
|
||||||
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
|
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
|
||||||
p->n_ambs = 0;
|
p->n_ambs = 0;
|
||||||
|
|
@ -358,8 +366,9 @@ int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
|
||||||
{
|
{
|
||||||
int is_rev, rid_b, rid_e;
|
int is_rev, rid_b, rid_e;
|
||||||
if (rb < bns->l_pac && re > bns->l_pac) return -2;
|
if (rb < bns->l_pac && re > bns->l_pac) return -2;
|
||||||
|
assert(rb <= re);
|
||||||
rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
|
rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
|
||||||
rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1);
|
rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b;
|
||||||
return rid_b == rid_e? rid_b : -1;
|
return rid_b == rid_e? rid_b : -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
299
bwa-postalt.js
299
bwa-postalt.js
|
|
@ -52,7 +52,7 @@ var getopt = function(args, ostr) {
|
||||||
return optopt;
|
return optopt;
|
||||||
}
|
}
|
||||||
|
|
||||||
// print an object in a format similar to JSON. For debugging.
|
// print an object in a format similar to JSON. For debugging only.
|
||||||
function obj2str(o)
|
function obj2str(o)
|
||||||
{
|
{
|
||||||
if (typeof(o) != 'object') {
|
if (typeof(o) != 'object') {
|
||||||
|
|
@ -203,54 +203,31 @@ function parse_hit(s, opt)
|
||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
|
|
||||||
// read the ALT-to-REF alignment and generate the index
|
|
||||||
function read_ALT_sam(fn)
|
|
||||||
{
|
|
||||||
var intv = {};
|
|
||||||
var file = new File(fn);
|
|
||||||
var buf = new Bytes();
|
|
||||||
while (file.readline(buf) >= 0) {
|
|
||||||
var line = buf.toString();
|
|
||||||
var t = line.split("\t");
|
|
||||||
if (line.charAt(0) == '@') continue;
|
|
||||||
var flag = parseInt(t[1]);
|
|
||||||
var m, cigar = [], l_qaln = 0, l_qclip = 0;
|
|
||||||
while ((m = re_cigar.exec(t[5])) != null) {
|
|
||||||
var l = parseInt(m[1]);
|
|
||||||
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
|
|
||||||
if (m[2] == 'M' || m[2] == 'I') l_qaln += l;
|
|
||||||
else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
|
|
||||||
}
|
|
||||||
var j = flag&16? cigar.length-1 : 0;
|
|
||||||
var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
|
|
||||||
if (intv[t[0]] == null) intv[t[0]] = [];
|
|
||||||
intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]);
|
|
||||||
//print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar);
|
|
||||||
}
|
|
||||||
buf.destroy();
|
|
||||||
file.close();
|
|
||||||
// create index for intervals on ALT contigs
|
|
||||||
var idx = {};
|
|
||||||
for (var ctg in intv)
|
|
||||||
idx[ctg] = intv_ovlp(intv[ctg]);
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
function bwa_postalt(args)
|
function bwa_postalt(args)
|
||||||
{
|
{
|
||||||
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true };
|
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false };
|
||||||
|
|
||||||
while ((c = getopt(args, 'pqv:')) != null) {
|
while ((c = getopt(args, 'Pqev:p:')) != null) {
|
||||||
if (c == 'v') opt.verbose = parseInt(getopt.arg);
|
if (c == 'v') opt.verbose = parseInt(getopt.arg);
|
||||||
else if (c == 'p') opt.show_pri = true;
|
else if (c == 'p') opt.pre = getopt.arg;
|
||||||
|
else if (c == 'P') opt.show_pri = true;
|
||||||
else if (c == 'q') opt.recover_maq = false;
|
else if (c == 'q') opt.recover_maq = false;
|
||||||
|
else if (c == 'e') opt.show_ev = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (opt.show_ev && opt.pre == null) {
|
||||||
|
warn("ERROR: option '-p' must be specified if '-e' is applied.");
|
||||||
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (args.length == getopt.ind) {
|
if (args.length == getopt.ind) {
|
||||||
print("");
|
print("");
|
||||||
print("Usage: k8 bwa-postalt.js [-p] <alt.sam> [aln.sam]\n");
|
print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
|
||||||
print("Options: -p output lifted non-ALT hit in a SAM line (for ALT-unware alignments)");
|
print("Options: -p STR prefix of file(s) for additional information [null]");
|
||||||
print(" -q don't recover mapQ for non-ALTs hit overlapping lifted ALT");
|
print(" PREFIX.ctw - weight of each ALT contig");
|
||||||
|
print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)");
|
||||||
|
print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT");
|
||||||
|
print(" -e show reads supporting ALT contigs into file PREFIX.evi");
|
||||||
print("");
|
print("");
|
||||||
print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to");
|
print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to");
|
||||||
print(" the primary assembly, groups them and then estimates mapQ across groups. If");
|
print(" the primary assembly, groups them and then estimates mapQ across groups. If");
|
||||||
|
|
@ -262,12 +239,56 @@ function bwa_postalt(args)
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
var file, buf = new Bytes();
|
var fp_evi = opt.show_ev && opt.pre? new File(opt.pre + '.evi', "w") : null;
|
||||||
var aux = new Bytes(); // used for reverse and reverse complement
|
var aux = new Bytes(); // used for reverse and reverse complement
|
||||||
var idx = read_ALT_sam(args[getopt.ind]);
|
var buf = new Bytes();
|
||||||
var buf2 = [];
|
|
||||||
|
// read ALT-to-REF alignment
|
||||||
|
var intv_alt = {}, intv_pri = {}, idx_un = {};
|
||||||
|
var file = new File(args[getopt.ind]);
|
||||||
|
while (file.readline(buf) >= 0) {
|
||||||
|
var line = buf.toString();
|
||||||
|
if (line.charAt(0) == '@') continue;
|
||||||
|
var t = line.split("\t");
|
||||||
|
if (t.length < 11) continue; // incomplete lines
|
||||||
|
var pos = parseInt(t[3]) - 1;
|
||||||
|
var flag = parseInt(t[1]);
|
||||||
|
if ((flag&4) || t[2] == '*') {
|
||||||
|
idx_un[t[0]] = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
|
||||||
|
while ((m = re_cigar.exec(t[5])) != null) {
|
||||||
|
var l = parseInt(m[1]);
|
||||||
|
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
|
||||||
|
if (m[2] == 'M') l_qaln += l, l_tlen += l;
|
||||||
|
else if (m[2] == 'I') l_qaln += l;
|
||||||
|
else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
|
||||||
|
else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
|
||||||
|
}
|
||||||
|
var j = flag&16? cigar.length-1 : 0;
|
||||||
|
var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
|
||||||
|
if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
|
||||||
|
intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
|
||||||
|
if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
|
||||||
|
intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
|
||||||
|
}
|
||||||
|
file.close();
|
||||||
|
var idx_alt = {}, idx_pri = {};
|
||||||
|
for (var ctg in intv_alt)
|
||||||
|
idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
|
||||||
|
for (var ctg in intv_pri)
|
||||||
|
idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
|
||||||
|
|
||||||
|
// initialize the list of ALT contigs
|
||||||
|
var weight_alt = [];
|
||||||
|
for (var ctg in idx_alt)
|
||||||
|
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]];
|
||||||
|
for (var ctg in idx_un)
|
||||||
|
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0];
|
||||||
|
|
||||||
// process SAM
|
// process SAM
|
||||||
|
var buf2 = [];
|
||||||
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
|
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
|
||||||
while (file.readline(buf) >= 0) {
|
while (file.readline(buf) >= 0) {
|
||||||
var m, line = buf.toString();
|
var m, line = buf.toString();
|
||||||
|
|
@ -286,36 +307,49 @@ function bwa_postalt(args)
|
||||||
buf2 = [];
|
buf2 = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) {
|
// skip unmapped lines
|
||||||
|
if (t[1]&4) {
|
||||||
buf2.push(t);
|
buf2.push(t);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
var XA_strs = m[1].split(";");
|
|
||||||
|
|
||||||
// parse the reported hit
|
// parse the reported hit
|
||||||
var hits = [];
|
|
||||||
var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
|
var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
|
||||||
var flag = t[1];
|
var flag = t[1];
|
||||||
var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
|
var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
|
||||||
if (h.hard) { // don't process lines with hard clips
|
if (h.hard) { // the following does not work with hard clipped alignments
|
||||||
buf2.push(t);
|
buf2.push(t);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
hits.push(h);
|
var hits = [h];
|
||||||
|
|
||||||
// parse hits in the XA tag
|
// parse hits in the XA tag
|
||||||
for (var i = 0; i < XA_strs.length; ++i)
|
if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) {
|
||||||
if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
|
var XA_strs = m[1].split(";");
|
||||||
hits.push(parse_hit(XA_strs[i].split(","), opt));
|
for (var i = 0; i < XA_strs.length; ++i)
|
||||||
|
if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
|
||||||
|
hits.push(parse_hit(XA_strs[i].split(","), opt));
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if there are ALT hits
|
||||||
|
var has_alt = false;
|
||||||
|
for (var i = 0; i < hits.length; ++i)
|
||||||
|
if (weight_alt[hits[i].ctg] != null) {
|
||||||
|
has_alt = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!has_alt) {
|
||||||
|
buf2.push(t);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// lift mapping positions to the primary assembly
|
// lift mapping positions to the primary assembly
|
||||||
var n_lifted = 0, n_rpt_lifted = 0;
|
var n_rpt_lifted = 0, rpt_lifted = null;
|
||||||
for (var i = 0; i < hits.length; ++i) {
|
for (var i = 0; i < hits.length; ++i) {
|
||||||
var h = hits[i];
|
var a, h = hits[i];
|
||||||
|
|
||||||
if (idx[h.ctg] == null) continue;
|
if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0)
|
||||||
var a = idx[h.ctg](h.start, h.end);
|
continue;
|
||||||
if (a == null || a.length == 0) continue;
|
|
||||||
|
|
||||||
// find the approximate position on the primary assembly
|
// find the approximate position on the primary assembly
|
||||||
var lifted = [];
|
var lifted = [];
|
||||||
|
|
@ -333,36 +367,46 @@ function bwa_postalt(args)
|
||||||
lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
|
lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
|
||||||
if (i == 0) ++n_rpt_lifted;
|
if (i == 0) ++n_rpt_lifted;
|
||||||
}
|
}
|
||||||
if (lifted.length) ++n_lifted, hits[i].lifted = lifted;
|
if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0);
|
||||||
}
|
if (lifted.length) hits[i].lifted = lifted;
|
||||||
if (n_lifted == 0) {
|
|
||||||
buf2.push(t);
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// group hits based on the lifted positions on the primary assembly
|
// prepare for hits grouping
|
||||||
for (var i = 0; i < hits.length; ++i) { // set keys for sorting
|
for (var i = 0; i < hits.length; ++i) { // set keys for sorting
|
||||||
if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used
|
if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used
|
||||||
hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
|
hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
|
||||||
else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
|
else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
|
||||||
hits[i].i = i; // keep the original index
|
hits[i].i = i; // keep the original index
|
||||||
}
|
}
|
||||||
hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
|
|
||||||
var last_chr = null, end = 0, g = -1;
|
// group hits based on the lifted positions on non-ALT sequences
|
||||||
for (var i = 0; i < hits.length; ++i) {
|
if (hits.length > 1) {
|
||||||
if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
|
hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
|
||||||
else if (hits[i].pstart >= end) ++g;
|
var last_chr = null, end = 0, g = -1;
|
||||||
hits[i].g = g;
|
for (var i = 0; i < hits.length; ++i) {
|
||||||
end = end > hits[i].pend? end : hits[i].pend;
|
if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
|
||||||
|
else if (hits[i].pstart >= end) ++g;
|
||||||
|
hits[i].g = g;
|
||||||
|
end = end > hits[i].pend? end : hits[i].pend;
|
||||||
|
}
|
||||||
|
} else hits[0].g = 0;
|
||||||
|
|
||||||
|
// find the index and group id of the reported hit; find the size of the reported group
|
||||||
|
var reported_g = null, reported_i = null, n_group0 = 0;
|
||||||
|
if (hits.length > 1) {
|
||||||
|
for (var i = 0; i < hits.length; ++i)
|
||||||
|
if (hits[i].i == 0)
|
||||||
|
reported_g = hits[i].g, reported_i = i;
|
||||||
|
for (var i = 0; i < hits.length; ++i)
|
||||||
|
if (hits[i].g == reported_g)
|
||||||
|
++n_group0;
|
||||||
|
} else {
|
||||||
|
if (weight_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
|
||||||
|
buf2.push(t);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
reported_g = reported_i = 0, n_group0 = 1;
|
||||||
}
|
}
|
||||||
var reported_g = null, reported_i = null;
|
|
||||||
for (var i = 0; i < hits.length; ++i)
|
|
||||||
if (hits[i].i == 0)
|
|
||||||
reported_g = hits[i].g, reported_i = i;
|
|
||||||
var n_group0 = 0; // #hits overlapping the reported hit
|
|
||||||
for (var i = 0; i < hits.length; ++i)
|
|
||||||
if (hits[i].g == reported_g)
|
|
||||||
++n_group0;
|
|
||||||
|
|
||||||
// re-estimate mapping quality if necessary
|
// re-estimate mapping quality if necessary
|
||||||
var mapQ, ori_mapQ = t[4];
|
var mapQ, ori_mapQ = t[4];
|
||||||
|
|
@ -379,12 +423,65 @@ function bwa_postalt(args)
|
||||||
mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
|
mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
|
||||||
} else mapQ = 0;
|
} else mapQ = 0;
|
||||||
mapQ = mapQ < 60? mapQ : 60;
|
mapQ = mapQ < 60? mapQ : 60;
|
||||||
mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
|
if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ;
|
||||||
|
else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
|
||||||
} else mapQ = t[4];
|
} else mapQ = t[4];
|
||||||
|
|
||||||
|
// ALT genotyping
|
||||||
|
if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) {
|
||||||
|
// collect all overlapping ALT contigs
|
||||||
|
var hits2 = [];
|
||||||
|
for (var i = 0; i < hits.length; ++i) {
|
||||||
|
var h = hits[i];
|
||||||
|
if (h.g == reported_g)
|
||||||
|
hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score, h.NM]);
|
||||||
|
}
|
||||||
|
var start = hits2[0][1], end = hits2[0][2];
|
||||||
|
for (var i = 1; i < hits2.length; ++i)
|
||||||
|
end = end > hits2[i][2]? end : hits2[i][2];
|
||||||
|
var alts = {};
|
||||||
|
for (var i = 0; i < hits2.length; ++i)
|
||||||
|
if (weight_alt[hits2[i][3]] != null)
|
||||||
|
alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]];
|
||||||
|
if (idx_pri[hits2[0][0]] != null) { // add other unreported hits
|
||||||
|
var ovlp = idx_pri[hits2[0][0]](start, end);
|
||||||
|
for (var i = 0; i < ovlp.length; ++i)
|
||||||
|
if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null)
|
||||||
|
alts[ovlp[i][2]] = [0, 0];
|
||||||
|
}
|
||||||
|
|
||||||
|
// add weight to each ALT contig
|
||||||
|
var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30, max_nm = -1;
|
||||||
|
for (var ctg in alts)
|
||||||
|
alt_arr.push([ctg, alts[ctg][0], 0, alts[ctg][1]]);
|
||||||
|
for (var i = 0; i < alt_arr.length; ++i) {
|
||||||
|
if (max_sc < alt_arr[i][1])
|
||||||
|
max_sc = alt_arr[i][1], max_i = i;
|
||||||
|
min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1];
|
||||||
|
var nm = alt_arr[i][1] > 0? alt_arr[i][3] : opt.max_nm_sc;
|
||||||
|
max_nm = max_nm > nm? max_nm : nm;
|
||||||
|
}
|
||||||
|
if (max_nm > opt.max_nm_sc) max_nm = opt.max_nm_sc;
|
||||||
|
if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) {
|
||||||
|
for (var i = 0; i < alt_arr.length; ++i)
|
||||||
|
sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc)));
|
||||||
|
for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum;
|
||||||
|
for (var i = 0; i < alt_arr.length; ++i) {
|
||||||
|
var e = [alt_arr[i][0], 1, alt_arr[max_i][2], alt_arr[i][2], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]];
|
||||||
|
var w = weight_alt[e[0]];
|
||||||
|
for (var j = 0; j < 6; ++j) w[j] += e[j+1];
|
||||||
|
if (fp_evi) {
|
||||||
|
e[2] = e[2].toFixed(3); e[3] = e[3].toFixed(3);
|
||||||
|
e[5] = e[5].toFixed(3); e[6] = e[6].toFixed(3);
|
||||||
|
fp_evi.write(t[0] + '/' + (t[1]>>6&3) + '\t' + e.join("\t") + '\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality
|
// check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality
|
||||||
if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) {
|
if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) {
|
||||||
var l = lifted[0];
|
var l = rpt_lifted;
|
||||||
for (var i = 0; i < buf2.length; ++i) {
|
for (var i = 0; i < buf2.length; ++i) {
|
||||||
var s = buf2[i];
|
var s = buf2[i];
|
||||||
if (l[0] != s[2]) continue; // different chr
|
if (l[0] != s[2]) continue; // different chr
|
||||||
|
|
@ -423,6 +520,7 @@ function bwa_postalt(args)
|
||||||
need_rev = true;
|
need_rev = true;
|
||||||
}
|
}
|
||||||
if (need_rev) { // reverse and reverse complement
|
if (need_rev) { // reverse and reverse complement
|
||||||
|
aux.length = 0;
|
||||||
aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
|
aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
|
||||||
aux.set(t[10],0); aux.reverse(); rq = aux.toString();
|
aux.set(t[10],0); aux.reverse(); rq = aux.toString();
|
||||||
}
|
}
|
||||||
|
|
@ -438,14 +536,16 @@ function bwa_postalt(args)
|
||||||
for (var i = 0; i < hits.length; ++i) {
|
for (var i = 0; i < hits.length; ++i) {
|
||||||
if (opt.verbose >= 5) print(obj2str(hits[i]));
|
if (opt.verbose >= 5) print(obj2str(hits[i]));
|
||||||
if (hits[i].g != reported_g || i == reported_i) continue;
|
if (hits[i].g != reported_g || i == reported_i) continue;
|
||||||
if (!opt.show_pri && idx[hits[i].ctg] == null) continue;
|
if (!opt.show_pri && idx_alt[hits[i].ctg] == null) continue;
|
||||||
var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0];
|
var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0];
|
||||||
// update name
|
// print sequence/quality and set the rev flag
|
||||||
if (flag&0x40) s[0] += "/1";
|
if (hits[i].rev == hits[reported_i].rev) {
|
||||||
if (flag&0x80) s[0] += "/2";
|
s.push(t[9], t[10]);
|
||||||
s[0] += "_" + (++cnt);
|
s[1] = flag | 0x800;
|
||||||
if (hits[i].rev == hits[reported_i].rev) s.push(t[9], t[10]);
|
} else {
|
||||||
else s.push(rs, rq);
|
s.push(rs, rq);
|
||||||
|
s[1] = (flag ^ 0x10) | 0x800;
|
||||||
|
}
|
||||||
s.push("NM:i:" + hits[i].NM);
|
s.push("NM:i:" + hits[i].NM);
|
||||||
if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
|
if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
|
||||||
buf2.push(s);
|
buf2.push(s);
|
||||||
|
|
@ -454,9 +554,30 @@ function bwa_postalt(args)
|
||||||
for (var i = 0; i < buf2.length; ++i)
|
for (var i = 0; i < buf2.length; ++i)
|
||||||
print(buf2[i].join("\t"));
|
print(buf2[i].join("\t"));
|
||||||
file.close();
|
file.close();
|
||||||
|
if (fp_evi != null) fp_evi.close();
|
||||||
|
|
||||||
aux.destroy();
|
|
||||||
buf.destroy();
|
buf.destroy();
|
||||||
|
aux.destroy();
|
||||||
|
|
||||||
|
// print weight of each contig
|
||||||
|
if (opt.pre != null) {
|
||||||
|
var fpout = new File(opt.pre + '.ctw', "w");
|
||||||
|
var weight_arr = [];
|
||||||
|
for (var ctg in weight_alt) {
|
||||||
|
var w = weight_alt[ctg];
|
||||||
|
weight_arr.push([ctg, w[6], w[7], w[8],
|
||||||
|
w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000',
|
||||||
|
w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']);
|
||||||
|
}
|
||||||
|
weight_arr.sort(function(a,b) {
|
||||||
|
return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0;
|
||||||
|
});
|
||||||
|
for (var i = 0; i < weight_arr.length; ++i) {
|
||||||
|
if (weight_arr[i][1] == '~') weight_arr[i][1] = '*';
|
||||||
|
fpout.write(weight_arr[i].join("\t") + '\n');
|
||||||
|
}
|
||||||
|
fpout.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bwa_postalt(arguments);
|
bwa_postalt(arguments);
|
||||||
|
|
|
||||||
89
bwa.c
89
bwa.c
|
|
@ -227,7 +227,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
|
||||||
return bwt;
|
return bwt;
|
||||||
}
|
}
|
||||||
|
|
||||||
bwaidx_t *bwa_idx_load(const char *hint, int which)
|
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
||||||
{
|
{
|
||||||
bwaidx_t *idx;
|
bwaidx_t *idx;
|
||||||
char *prefix;
|
char *prefix;
|
||||||
|
|
@ -239,7 +239,12 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
|
||||||
idx = calloc(1, sizeof(bwaidx_t));
|
idx = calloc(1, sizeof(bwaidx_t));
|
||||||
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
||||||
if (which & BWA_IDX_BNS) {
|
if (which & BWA_IDX_BNS) {
|
||||||
|
int i, c;
|
||||||
idx->bns = bns_restore(prefix);
|
idx->bns = bns_restore(prefix);
|
||||||
|
for (i = c = 0; i < idx->bns->n_seqs; ++i)
|
||||||
|
if (idx->bns->anns[i].is_alt) ++c;
|
||||||
|
if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
|
||||||
if (which & BWA_IDX_PAC) {
|
if (which & BWA_IDX_PAC) {
|
||||||
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
|
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
|
||||||
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
||||||
|
|
@ -251,15 +256,91 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
|
||||||
return idx;
|
return idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bwaidx_t *bwa_idx_load(const char *hint, int which)
|
||||||
|
{
|
||||||
|
return bwa_idx_load_from_disk(hint, which);
|
||||||
|
}
|
||||||
|
|
||||||
void bwa_idx_destroy(bwaidx_t *idx)
|
void bwa_idx_destroy(bwaidx_t *idx)
|
||||||
{
|
{
|
||||||
if (idx == 0) return;
|
if (idx == 0) return;
|
||||||
if (idx->bwt) bwt_destroy(idx->bwt);
|
if (idx->mem == 0) {
|
||||||
if (idx->bns) bns_destroy(idx->bns);
|
if (idx->bwt) bwt_destroy(idx->bwt);
|
||||||
if (idx->pac) free(idx->pac);
|
if (idx->bns) bns_destroy(idx->bns);
|
||||||
|
if (idx->pac) free(idx->pac);
|
||||||
|
} else {
|
||||||
|
free(idx->bwt); free(idx->bns->anns); free(idx->bns);
|
||||||
|
if (!idx->is_shm) free(idx->mem);
|
||||||
|
}
|
||||||
free(idx);
|
free(idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
|
||||||
|
{
|
||||||
|
int64_t k = 0, x;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
// generate idx->bwt
|
||||||
|
x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
|
||||||
|
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
|
||||||
|
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
|
||||||
|
|
||||||
|
// generate idx->bns and idx->pac
|
||||||
|
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
|
||||||
|
x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
|
||||||
|
x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x;
|
||||||
|
for (i = 0; i < idx->bns->n_seqs; ++i) {
|
||||||
|
idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
|
||||||
|
idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
|
||||||
|
}
|
||||||
|
idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1;
|
||||||
|
assert(k == l_mem);
|
||||||
|
|
||||||
|
idx->l_mem = k; idx->mem = mem;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bwa_idx2mem(bwaidx_t *idx)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
int64_t k, x, tmp;
|
||||||
|
uint8_t *mem;
|
||||||
|
|
||||||
|
// copy idx->bwt
|
||||||
|
x = idx->bwt->bwt_size * 4;
|
||||||
|
mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
|
||||||
|
memmove(mem + sizeof(bwt_t), mem, x);
|
||||||
|
memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
|
||||||
|
x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
|
||||||
|
free(idx->bwt->sa);
|
||||||
|
free(idx->bwt); idx->bwt = 0;
|
||||||
|
|
||||||
|
// copy idx->bns
|
||||||
|
tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
|
||||||
|
for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
|
||||||
|
tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
|
||||||
|
mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
|
||||||
|
x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
|
||||||
|
x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
|
||||||
|
free(idx->bns->ambs);
|
||||||
|
x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
|
||||||
|
for (i = 0; i < idx->bns->n_seqs; ++i) {
|
||||||
|
x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
|
||||||
|
x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
|
||||||
|
free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
|
||||||
|
}
|
||||||
|
free(idx->bns->anns);
|
||||||
|
|
||||||
|
// copy idx->pac
|
||||||
|
x = idx->bns->l_pac/4+1;
|
||||||
|
mem = realloc(mem, k + x);
|
||||||
|
memcpy(mem + k, idx->pac, x); k += x;
|
||||||
|
free(idx->bns); idx->bns = 0;
|
||||||
|
free(idx->pac); idx->pac = 0;
|
||||||
|
|
||||||
|
return bwa_mem2idx(k, mem, idx);
|
||||||
|
}
|
||||||
|
|
||||||
/***********************
|
/***********************
|
||||||
* SAM header routines *
|
* SAM header routines *
|
||||||
***********************/
|
***********************/
|
||||||
|
|
|
||||||
10
bwa.h
10
bwa.h
|
|
@ -10,10 +10,16 @@
|
||||||
#define BWA_IDX_PAC 0x4
|
#define BWA_IDX_PAC 0x4
|
||||||
#define BWA_IDX_ALL 0x7
|
#define BWA_IDX_ALL 0x7
|
||||||
|
|
||||||
|
#define BWA_CTL_SIZE 0x10000
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
bwt_t *bwt; // FM-index
|
bwt_t *bwt; // FM-index
|
||||||
bntseq_t *bns; // information on the reference sequences
|
bntseq_t *bns; // information on the reference sequences
|
||||||
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
|
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
|
||||||
|
|
||||||
|
int is_shm;
|
||||||
|
int64_t l_mem;
|
||||||
|
uint8_t *mem;
|
||||||
} bwaidx_t;
|
} bwaidx_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
|
@ -37,8 +43,12 @@ extern "C" {
|
||||||
char *bwa_idx_infer_prefix(const char *hint);
|
char *bwa_idx_infer_prefix(const char *hint);
|
||||||
bwt_t *bwa_idx_load_bwt(const char *hint);
|
bwt_t *bwa_idx_load_bwt(const char *hint);
|
||||||
|
|
||||||
|
bwaidx_t *bwa_idx_load_from_shm(const char *hint);
|
||||||
|
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
|
||||||
bwaidx_t *bwa_idx_load(const char *hint, int which);
|
bwaidx_t *bwa_idx_load(const char *hint, int which);
|
||||||
void bwa_idx_destroy(bwaidx_t *idx);
|
void bwa_idx_destroy(bwaidx_t *idx);
|
||||||
|
int bwa_idx2mem(bwaidx_t *idx);
|
||||||
|
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
|
||||||
|
|
||||||
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
|
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
|
||||||
char *bwa_set_rg(const char *s);
|
char *bwa_set_rg(const char *s);
|
||||||
|
|
|
||||||
24
bwamem.c
24
bwamem.c
|
|
@ -73,7 +73,7 @@ mem_opt_t *mem_opt_init()
|
||||||
o->chunk_size = 10000000;
|
o->chunk_size = 10000000;
|
||||||
o->n_threads = 1;
|
o->n_threads = 1;
|
||||||
o->max_XA_hits = 5;
|
o->max_XA_hits = 5;
|
||||||
o->max_XA_hits_alt = 50;
|
o->max_XA_hits_alt = 200;
|
||||||
o->max_matesw = 50;
|
o->max_matesw = 50;
|
||||||
o->mask_level_redun = 0.95;
|
o->mask_level_redun = 0.95;
|
||||||
o->min_chain_weight = 0;
|
o->min_chain_weight = 0;
|
||||||
|
|
@ -512,38 +512,38 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *
|
||||||
|
|
||||||
int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
|
int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
|
||||||
{
|
{
|
||||||
int i, j, n_pri;
|
int i, n_pri;
|
||||||
int_v z = {0,0,0};
|
int_v z = {0,0,0};
|
||||||
if (n == 0) return 0;
|
if (n == 0) return 0;
|
||||||
for (i = n_pri = 0; i < n; ++i) {
|
for (i = n_pri = 0; i < n; ++i) {
|
||||||
a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_alt = -1, a[i].hash = hash_64(id+i);
|
a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i);
|
||||||
if (!a[i].is_alt) ++n_pri;
|
if (!a[i].is_alt) ++n_pri;
|
||||||
}
|
}
|
||||||
ks_introsort(mem_ars_hash, n, a);
|
ks_introsort(mem_ars_hash, n, a);
|
||||||
mem_mark_primary_se_core(opt, n, a, &z);
|
mem_mark_primary_se_core(opt, n, a, &z);
|
||||||
for (i = 0; i < n; ++i) {
|
for (i = 0; i < n; ++i) {
|
||||||
mem_alnreg_t *p = &a[i];
|
mem_alnreg_t *p = &a[i];
|
||||||
p->secondary_alt = i; // keep the rank in the first round
|
p->secondary_all = i; // keep the rank in the first round
|
||||||
if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
|
if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
|
||||||
p->alt_sc = a[p->secondary].score;
|
p->alt_sc = a[p->secondary].score;
|
||||||
}
|
}
|
||||||
if (n_pri >= 0 && n_pri < n) {
|
if (n_pri >= 0 && n_pri < n) {
|
||||||
kv_resize(int, z, n);
|
kv_resize(int, z, n);
|
||||||
if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
|
if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
|
||||||
for (i = 0; i < n; ++i) z.a[a[i].secondary_alt] = i;
|
for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i;
|
||||||
for (i = 0; i < n; ++i) {
|
for (i = 0; i < n; ++i) {
|
||||||
if (a[i].secondary < 0) {
|
if (a[i].secondary >= 0) {
|
||||||
a[i].secondary_alt = -1;
|
a[i].secondary_all = z.a[a[i].secondary];
|
||||||
continue;
|
if (a[i].is_alt) a[i].secondary = INT_MAX;
|
||||||
}
|
} else a[i].secondary_all = -1;
|
||||||
j = z.a[a[i].secondary];
|
|
||||||
a[i].secondary_alt = a[j].is_alt? j : -1;
|
|
||||||
if (a[i].is_alt) a[i].secondary = INT_MAX;
|
|
||||||
}
|
}
|
||||||
if (n_pri > 0) { // mark primary for hits to the primary assembly only
|
if (n_pri > 0) { // mark primary for hits to the primary assembly only
|
||||||
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
|
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
|
||||||
mem_mark_primary_se_core(opt, n_pri, a, &z);
|
mem_mark_primary_se_core(opt, n_pri, a, &z);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
for (i = 0; i < n; ++i)
|
||||||
|
a[i].secondary_all = a[i].secondary;
|
||||||
}
|
}
|
||||||
free(z.a);
|
free(z.a);
|
||||||
return n_pri;
|
return n_pri;
|
||||||
|
|
|
||||||
2
bwamem.h
2
bwamem.h
|
|
@ -69,7 +69,7 @@ typedef struct {
|
||||||
int w; // actual band width used in extension
|
int w; // actual band width used in extension
|
||||||
int seedcov; // length of regions coverged by seeds
|
int seedcov; // length of regions coverged by seeds
|
||||||
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
||||||
int secondary_alt;
|
int secondary_all;
|
||||||
int seedlen0; // length of the starting seed
|
int seedlen0; // length of the starting seed
|
||||||
int n_comp:30, is_alt:2; // number of sub-alignments chained together
|
int n_comp:30, is_alt:2; // number of sub-alignments chained together
|
||||||
float frac_rep;
|
float frac_rep;
|
||||||
|
|
|
||||||
|
|
@ -112,34 +112,35 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
|
||||||
s->sam = str.s;
|
s->sam = str.s;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i, int r[2])
|
static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
|
||||||
{
|
{
|
||||||
int j = a[i].secondary, k = a[i].secondary_alt;
|
int k = a[i].secondary_all;
|
||||||
r[0] = r[1] = -1;
|
if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k;
|
||||||
if (j >= 0 && j < INT_MAX && !a[j].is_alt && !a[i].is_alt && a[i].score >= a[j].score * XA_drop_ratio) r[0] = j;
|
return -1;
|
||||||
if (k >= 0 && a[k].is_alt && a[i].score >= a[k].score * XA_drop_ratio) r[1] = k;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
|
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
|
||||||
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
|
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
|
||||||
{
|
{
|
||||||
int i, k, *cnt, tot, r[2];
|
int i, k, r, *cnt, tot;
|
||||||
kstring_t *aln = 0, str = {0,0,0};
|
kstring_t *aln = 0, str = {0,0,0};
|
||||||
char **XA = 0;
|
char **XA = 0, *has_alt;
|
||||||
|
|
||||||
cnt = calloc(a->n, sizeof(int));
|
cnt = calloc(a->n, sizeof(int));
|
||||||
|
has_alt = calloc(a->n, 1);
|
||||||
for (i = 0, tot = 0; i < a->n; ++i) {
|
for (i = 0, tot = 0; i < a->n; ++i) {
|
||||||
get_pri_idx(opt->XA_drop_ratio, a->a, i, r);
|
r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
|
||||||
if (r[0] >= 0) ++cnt[r[0]], ++tot;
|
if (r >= 0) {
|
||||||
if (r[1] >= 0) ++cnt[r[1]], ++tot;
|
++cnt[r], ++tot;
|
||||||
|
if (a->a[i].is_alt) has_alt[r] = 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (tot == 0) goto end_gen_alt;
|
if (tot == 0) goto end_gen_alt;
|
||||||
aln = calloc(a->n, sizeof(kstring_t));
|
aln = calloc(a->n, sizeof(kstring_t));
|
||||||
for (i = 0; i < a->n; ++i) {
|
for (i = 0; i < a->n; ++i) {
|
||||||
mem_aln_t t;
|
mem_aln_t t;
|
||||||
get_pri_idx(opt->XA_drop_ratio, a->a, i, r);
|
if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
|
||||||
if (r[0] < 0 && r[1] < 0) continue;
|
if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
|
||||||
if ((r[0] >= 0 && cnt[r[0]] > opt->max_XA_hits) || (r[1] >= 0 && cnt[r[1]] > opt->max_XA_hits_alt)) continue;
|
|
||||||
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
|
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
|
||||||
str.l = 0;
|
str.l = 0;
|
||||||
kputs(bns->anns[t.rid].name, &str);
|
kputs(bns->anns[t.rid].name, &str);
|
||||||
|
|
@ -152,14 +153,13 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
||||||
kputc(',', &str); kputw(t.NM, &str);
|
kputc(',', &str); kputw(t.NM, &str);
|
||||||
kputc(';', &str);
|
kputc(';', &str);
|
||||||
free(t.cigar);
|
free(t.cigar);
|
||||||
if (r[0] >= 0 && cnt[r[0]] <= opt->max_XA_hits) kputsn(str.s, str.l, &aln[r[0]]);
|
kputsn(str.s, str.l, &aln[r]);
|
||||||
if (r[1] >= 0 && cnt[r[1]] <= opt->max_XA_hits_alt) kputsn(str.s, str.l, &aln[r[1]]);
|
|
||||||
}
|
}
|
||||||
XA = calloc(a->n, sizeof(char*));
|
XA = calloc(a->n, sizeof(char*));
|
||||||
for (k = 0; k < a->n; ++k)
|
for (k = 0; k < a->n; ++k)
|
||||||
XA[k] = aln[k].s;
|
XA[k] = aln[k].s;
|
||||||
|
|
||||||
end_gen_alt:
|
end_gen_alt:
|
||||||
free(cnt); free(aln); free(str.s);
|
free(has_alt); free(cnt); free(aln); free(str.s);
|
||||||
return XA;
|
return XA;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,200 @@
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "bwa.h"
|
||||||
|
|
||||||
|
int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
|
||||||
|
{
|
||||||
|
const char *name;
|
||||||
|
uint8_t *shm, *shm_idx;
|
||||||
|
uint16_t *cnt, i;
|
||||||
|
int shmid, to_init = 0, l;
|
||||||
|
char path[PATH_MAX + 1], *p, *tmpfn = (char*)_tmpfn;
|
||||||
|
|
||||||
|
if (hint == 0 || hint[0] == 0) return -1;
|
||||||
|
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
|
||||||
|
++name;
|
||||||
|
|
||||||
|
if ((shmid = shm_open("/bwactl", O_RDWR, 0444)) < 0) {
|
||||||
|
shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
|
||||||
|
to_init = 1;
|
||||||
|
}
|
||||||
|
if (shmid < 0) return -1;
|
||||||
|
ftruncate(shmid, BWA_CTL_SIZE);
|
||||||
|
shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
|
||||||
|
cnt = (uint16_t*)shm;
|
||||||
|
if (to_init) {
|
||||||
|
memset(shm, 0, BWA_CTL_SIZE);
|
||||||
|
cnt[1] = 4;
|
||||||
|
} else {
|
||||||
|
for (i = 0, p = (char*)shm + 4; i < cnt[0]; ++i) {
|
||||||
|
if (strcmp(p + 8, name) == 0) break;
|
||||||
|
p += 9 + strlen(p + 8);
|
||||||
|
}
|
||||||
|
if (i < cnt[0]) {
|
||||||
|
fprintf(stderr, "[W::%s] index '%s' is already in shared memory\n", __func__, name);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (idx->mem == 0) bwa_idx2mem(idx);
|
||||||
|
|
||||||
|
if (tmpfn) {
|
||||||
|
FILE *fp;
|
||||||
|
if ((fp = fopen(tmpfn, "wb")) != 0) {
|
||||||
|
int64_t rest = idx->l_mem;
|
||||||
|
while (rest > 0) {
|
||||||
|
int64_t l = rest < 0x1000000? rest : 0x1000000;
|
||||||
|
rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp);
|
||||||
|
}
|
||||||
|
fclose(fp);
|
||||||
|
free(idx->mem); idx->mem = 0;
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__);
|
||||||
|
tmpfn = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
strcat(strcpy(path, "/bwaidx-"), name);
|
||||||
|
l = 8 + strlen(name) + 1;
|
||||||
|
if (cnt[1] + l > BWA_CTL_SIZE) return -1;
|
||||||
|
memcpy(shm + cnt[1], &idx->l_mem, 8);
|
||||||
|
memcpy(shm + cnt[1] + 8, name, l - 8);
|
||||||
|
if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
|
||||||
|
shm_unlink(path);
|
||||||
|
perror("shm_open()");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
cnt[1] += l; ++cnt[0];
|
||||||
|
ftruncate(shmid, idx->l_mem);
|
||||||
|
shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
|
||||||
|
if (tmpfn) {
|
||||||
|
FILE *fp;
|
||||||
|
fp = fopen(tmpfn, "rb");
|
||||||
|
int64_t rest = idx->l_mem;
|
||||||
|
while (rest > 0) {
|
||||||
|
int64_t l = rest < 0x1000000? rest : 0x1000000;
|
||||||
|
rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp);
|
||||||
|
}
|
||||||
|
fclose(fp);
|
||||||
|
unlink(tmpfn);
|
||||||
|
} else {
|
||||||
|
memcpy(shm_idx, idx->mem, idx->l_mem);
|
||||||
|
free(idx->mem);
|
||||||
|
}
|
||||||
|
bwa_mem2idx(idx->l_mem, shm_idx, idx);
|
||||||
|
idx->is_shm = 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bwaidx_t *bwa_idx_load_from_shm(const char *hint)
|
||||||
|
{
|
||||||
|
const char *name;
|
||||||
|
uint8_t *shm, *shm_idx;
|
||||||
|
uint16_t *cnt, i;
|
||||||
|
char *p, path[PATH_MAX + 1];
|
||||||
|
int shmid;
|
||||||
|
int64_t l_mem;
|
||||||
|
bwaidx_t *idx;
|
||||||
|
|
||||||
|
if (hint == 0 || hint[0] == 0) return 0;
|
||||||
|
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
|
||||||
|
++name;
|
||||||
|
if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return 0;
|
||||||
|
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||||
|
cnt = (uint16_t*)shm;
|
||||||
|
if (cnt[0] == 0) return 0;
|
||||||
|
for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) {
|
||||||
|
memcpy(&l_mem, p, 8); p += 8;
|
||||||
|
if (strcmp(p, name) == 0) break;
|
||||||
|
p += strlen(p) + 1;
|
||||||
|
}
|
||||||
|
if (i == cnt[0]) return 0;
|
||||||
|
|
||||||
|
strcat(strcpy(path, "/bwaidx-"), name);
|
||||||
|
if ((shmid = shm_open(path, O_RDONLY, 0444)) < 0) return 0;
|
||||||
|
shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0);
|
||||||
|
idx = calloc(1, sizeof(bwaidx_t));
|
||||||
|
bwa_mem2idx(l_mem, shm_idx, idx);
|
||||||
|
idx->is_shm = 1;
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bwa_shm_list(void)
|
||||||
|
{
|
||||||
|
int shmid;
|
||||||
|
uint16_t *cnt, i;
|
||||||
|
char *p, *shm;
|
||||||
|
if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1;
|
||||||
|
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||||
|
cnt = (uint16_t*)shm;
|
||||||
|
for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
|
||||||
|
int64_t l_mem;
|
||||||
|
memcpy(&l_mem, p, 8); p += 8;
|
||||||
|
printf("%s\t%ld\n", p, (long)l_mem);
|
||||||
|
p += strlen(p) + 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bwa_shm_destroy(void)
|
||||||
|
{
|
||||||
|
int shmid;
|
||||||
|
uint16_t *cnt, i;
|
||||||
|
char *p, *shm;
|
||||||
|
char path[PATH_MAX + 1];
|
||||||
|
|
||||||
|
if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1;
|
||||||
|
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||||
|
cnt = (uint16_t*)shm;
|
||||||
|
for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
|
||||||
|
int64_t l_mem;
|
||||||
|
memcpy(&l_mem, p, 8); p += 8;
|
||||||
|
strcat(strcpy(path, "/bwaidx-"), p);
|
||||||
|
shm_unlink(path);
|
||||||
|
p += strlen(p) + 1;
|
||||||
|
}
|
||||||
|
munmap(shm, BWA_CTL_SIZE);
|
||||||
|
shm_unlink("/bwactl");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main_shm(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
int c, to_list = 0, to_drop = 0, ret = 0;
|
||||||
|
char *tmpfn = 0;
|
||||||
|
while ((c = getopt(argc, argv, "ldf:")) >= 0) {
|
||||||
|
if (c == 'l') to_list = 1;
|
||||||
|
else if (c == 'd') to_drop = 1;
|
||||||
|
else if (c == 'f') tmpfn = optarg;
|
||||||
|
}
|
||||||
|
if (optind == argc && !to_list && !to_drop) {
|
||||||
|
fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
|
||||||
|
fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
|
||||||
|
fprintf(stderr, " -l list names of indices in shared memory\n");
|
||||||
|
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (optind < argc && (to_list || to_drop)) {
|
||||||
|
fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (optind < argc) {
|
||||||
|
bwaidx_t *idx;
|
||||||
|
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
|
||||||
|
if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
|
||||||
|
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
|
||||||
|
ret = 1;
|
||||||
|
}
|
||||||
|
bwa_idx_destroy(idx);
|
||||||
|
}
|
||||||
|
if (to_list) bwa_shm_list();
|
||||||
|
if (to_drop) bwa_shm_destroy();
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
@ -230,7 +230,11 @@ int main_mem(int argc, char *argv[])
|
||||||
} else update_a(opt, &opt0);
|
} else update_a(opt, &opt0);
|
||||||
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
||||||
|
|
||||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
idx = bwa_idx_load_from_shm(argv[optind]);
|
||||||
|
if (idx == 0) {
|
||||||
|
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
||||||
|
} else if (bwa_verbose >= 3)
|
||||||
|
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
|
||||||
if (ignore_alt)
|
if (ignore_alt)
|
||||||
for (i = 0; i < idx->bns->n_seqs; ++i)
|
for (i = 0; i < idx->bns->n_seqs; ++i)
|
||||||
idx->bns->anns[i].is_alt = 0;
|
idx->bns->anns[i].is_alt = 0;
|
||||||
|
|
|
||||||
5
main.c
5
main.c
|
|
@ -4,7 +4,7 @@
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
#ifndef PACKAGE_VERSION
|
#ifndef PACKAGE_VERSION
|
||||||
#define PACKAGE_VERSION "0.7.10-r876-dirty"
|
#define PACKAGE_VERSION "0.7.10-r907-dirty"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int bwa_fa2pac(int argc, char *argv[]);
|
int bwa_fa2pac(int argc, char *argv[]);
|
||||||
|
|
@ -22,6 +22,7 @@ int bwa_bwtsw2(int argc, char *argv[]);
|
||||||
|
|
||||||
int main_fastmap(int argc, char *argv[]);
|
int main_fastmap(int argc, char *argv[]);
|
||||||
int main_mem(int argc, char *argv[]);
|
int main_mem(int argc, char *argv[]);
|
||||||
|
int main_shm(int argc, char *argv[]);
|
||||||
|
|
||||||
int main_pemerge(int argc, char *argv[]);
|
int main_pemerge(int argc, char *argv[]);
|
||||||
|
|
||||||
|
|
@ -43,6 +44,7 @@ static int usage()
|
||||||
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
||||||
fprintf(stderr, " bwasw BWA-SW for long queries\n");
|
fprintf(stderr, " bwasw BWA-SW for long queries\n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, " shm manage indices in shared memory\n");
|
||||||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||||
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
|
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
|
||||||
|
|
@ -81,6 +83,7 @@ int main(int argc, char *argv[])
|
||||||
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||||
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
|
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
|
||||||
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
|
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
|
||||||
|
else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
|
||||||
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
|
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
|
||||||
else {
|
else {
|
||||||
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
|
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue