Merge branch 'dev'
This commit is contained in:
commit
44a185f2c1
8
Makefile
8
Makefile
|
|
@ -4,7 +4,7 @@ CFLAGS= -g -Wall -Wno-unused-function -O2
|
|||
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
|
||||
AR= ar
|
||||
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
|
||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
|
||||
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwashm.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
|
||||
AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||
is.o bwtindex.o bwape.o kopen.o pemerge.o \
|
||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||
|
|
@ -14,6 +14,10 @@ INCLUDES=
|
|||
LIBS= -lm -lz -lpthread
|
||||
SUBDIRS= .
|
||||
|
||||
ifeq ($(shell uname -s),Linux)
|
||||
LIBS += -lrt
|
||||
endif
|
||||
|
||||
.SUFFIXES:.c .o .cc
|
||||
|
||||
.c.o:
|
||||
|
|
@ -52,6 +56,7 @@ bwape.o: ksw.h khash.h
|
|||
bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
|
||||
bwase.o: bwa.h ksw.h
|
||||
bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
|
||||
bwashm.o: bwa.h bntseq.h bwt.h
|
||||
bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
|
||||
bwt_gen.o: QSufSort.h malloc_wrap.h
|
||||
bwt_lite.o: bwt_lite.h malloc_wrap.h
|
||||
|
|
@ -66,7 +71,6 @@ bwtsw2_core.o: khash.h ksort.h
|
|||
bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
|
||||
bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
|
||||
bwtsw2_pair.o: malloc_wrap.h ksw.h
|
||||
cutvar.o: bwa.h bntseq.h bwt.h kvec.h malloc_wrap.h
|
||||
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
|
||||
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
|
||||
is.o: malloc_wrap.h
|
||||
|
|
|
|||
|
|
@ -0,0 +1,123 @@
|
|||
## Getting Started
|
||||
|
||||
Since version 0.7.11, BWA-MEM supports read mapping against a reference genome
|
||||
with long alternative haplotypes present in separate ALT contigs. To use the
|
||||
ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the
|
||||
SAM format and rename the file to "*idxbase*.alt". For GRCh38, this alignment is
|
||||
available from the [BWA resource bundle for GRCh38][res].
|
||||
|
||||
#### Option 1: Mapping to the official GRCh38 with ALT contigs
|
||||
|
||||
Construct the index:
|
||||
```sh
|
||||
wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
|
||||
gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
|
||||
mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa
|
||||
bwa index hs38a.fa
|
||||
cp bwa-hs38-res/hs38d4.fa.alt hs38a.fa.alt
|
||||
```
|
||||
|
||||
Perform mapping:
|
||||
```sh
|
||||
bwa mem hs38a.fa read1.fq read2.fq \
|
||||
| samblaster \
|
||||
| bwa-hs38-res/k8-linux bwa-postalt.js hs38a.fa.alt \
|
||||
| samtools view -bS - > aln.unsrt.bam
|
||||
```
|
||||
For short reads, the postprocessing script `bwa-postalt.js` runs at about the
|
||||
same speed as BAM compression.
|
||||
|
||||
#### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes
|
||||
|
||||
Construct the index:
|
||||
```sh
|
||||
cat hs38a.fa bwa-hs38-res/hs38d4-extra.fa > hs38d4.fa
|
||||
bwa index hs38d4.fa
|
||||
cp bwa-hs38-res/hs38d4.fa.alt .
|
||||
```
|
||||
Perform mapping:
|
||||
```sh
|
||||
bwa mem -g.8 hs38d4.fa read1.fq read2.fq \
|
||||
| samblaster \
|
||||
| bwa-hs38-res/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
|
||||
| samtools view -bS - > aln.unsrt.bam
|
||||
```
|
||||
This command line generates `postinfo.ctw` which loosely evaluates the presence
|
||||
of an ALT contig with an empirical score in the last column.
|
||||
|
||||
## Background
|
||||
|
||||
GRCh38 ALT contigs total 109Mbp in length, spanning 60Mbp of genomic regions.
|
||||
However, sequences that are highly diverged from the primary assembly only
|
||||
contribute a few million bp. Most subsequences of ALT contigs are highly similar
|
||||
or identical to the primary assembly. If we align sequence reads to GRCh38+ALT
|
||||
treating ALT contigs the same as the primary assembly, we will get many reads with zero
|
||||
mapping quality and lose variants on them. It is crucial to make the mapper
|
||||
aware of ALTs.
|
||||
|
||||
BWA-MEM is designed to minimize the interference of ALT contigs such that on the
|
||||
primary assembly, the ALT-aware alignment is highly similar to the alignment
|
||||
without using ALT contigs in the index. This design choice makes it almost
|
||||
always safe to map reads to GRCh38+ALT. Although we don't know yet how much
|
||||
variation on ALT contigs contributes to phenotypes, we would not get the answer
|
||||
without mapping large cohorts to these extra sequences. We hope our current
|
||||
implementation encourages researchers to use ALT contigs soon and often.
|
||||
|
||||
## Methods
|
||||
|
||||
As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
|
||||
postprocessing.
|
||||
|
||||
#### Step 1: BWA-MEM mapping
|
||||
|
||||
At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring
|
||||
the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
|
||||
depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
|
||||
alignments and assigns mapQ following these two rules:
|
||||
|
||||
* The original mapQ of a non-ALT hit is computed across non-ALT hits only.
|
||||
The reported mapQ of an ALT hit is computed across all hits.
|
||||
|
||||
* An ALT hit is only reported if its score is strictly better than all
|
||||
overlapping non-ALT hits. A reported ALT hit is flagged with 0x800
|
||||
(supplementary) unless there are no non-ALT hits.
|
||||
|
||||
When option `-g FLOAT` is in use (which is the default), a third rule kicks in:
|
||||
|
||||
* The mapQ of a non-ALT hit is reduced to zero if its score is less than FLOAT
|
||||
times the score of an overlapping ALT hit. In this case, the original mapQ is
|
||||
moved to the `om` tag.
|
||||
|
||||
If we don't care about ALT hits, we may actually skip postprocessing (step 2).
|
||||
Nonetheless, postprocessing is recommended as it improves mapQ and gives more
|
||||
information about ALT hits.
|
||||
|
||||
#### Step 2: Postprocessing
|
||||
|
||||
Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
|
||||
potential hits reported in the XA tag, lifts ALT hits to the chromosomal
|
||||
positions using the ALT-to-ref alignment, groups them after lifting and then
|
||||
reassigns mapQ based on the best scoring hit in each group with all the hits in
|
||||
a group getting the same mapQ. Knowing the ALT-to-ref alignment, this script can
|
||||
greatly improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits.
|
||||
|
||||
The script also measures the presence of each ALT contig. For a group of
|
||||
overlapping ALT contigs c_1, ..., c_m, the weight for c_k equals `\frac{\sum_j
|
||||
P(c_k|r_j)}{\sum_j\max_i P(c_i|r_j)}`, where `P(c_k|r)=\frac{pow(4,s_k)}{\sum_i
|
||||
pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a
|
||||
Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in
|
||||
option 2 above.
|
||||
|
||||
## Problems and Future Development
|
||||
|
||||
There are some uncertainties about ALT mappings - we are not sure whether they
|
||||
help biological discovery and don't know the best way to analyze them. Without
|
||||
clear demand from downstream analyses, it is very difficult to design the
|
||||
optimal mapping strategy. The current BWA-MEM method is just a start. If it
|
||||
turns out to be useful in research, we will probably rewrite bwa-postalt.js in C
|
||||
for performance; if not, we will try new designs. It is also possible that we
|
||||
may make a breakthrough in the representation of multiple genomes, in which case,
|
||||
we can even get rid of ALT contigs once and for all.
|
||||
|
||||
[res]: https://sourceforge.net/projects/bio-bwa/files/
|
||||
[sb]: https://github.com/GregoryFaust/samblaster
|
||||
14
README.md
14
README.md
|
|
@ -63,7 +63,6 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
|
|||
3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
|
||||
4. [Why can one read in a pair have high mapping quality but the other have zero?](#pe0)
|
||||
5. [How can a BWA-backtrack alignment stand out of the end of a chromosome?](#endref)
|
||||
6. [How to map sequences to GRCh38 with ALT contigs?](#h38)
|
||||
|
||||
####<a name="type"></a>1. What types of data does BWA work with?
|
||||
|
||||
|
|
@ -72,11 +71,11 @@ algorithm and setting may vary. The following list gives the recommended
|
|||
settings:
|
||||
|
||||
* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
|
||||
contigs up to a few megabases mapped to a close related reference genome:
|
||||
contigs up to a few megabases mapped to a closely related reference genome:
|
||||
|
||||
bwa mem ref.fa reads.fq > aln.sam
|
||||
|
||||
* Illumina single-end reads no longer than ~70bp:
|
||||
* Illumina single-end reads shorter than ~70bp:
|
||||
|
||||
bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
|
||||
|
||||
|
|
@ -84,20 +83,21 @@ settings:
|
|||
|
||||
bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
|
||||
|
||||
* Illumina paired-end reads no longer than ~70bp:
|
||||
* Illumina paired-end reads shorter than ~70bp:
|
||||
|
||||
bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
|
||||
bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
|
||||
|
||||
* PacBio subreads to a reference genome:
|
||||
* PacBio subreads or Oxford Nanopore reads to a reference genome:
|
||||
|
||||
bwa mem -x pacbio ref.fa reads.fq > aln.sam
|
||||
bwa mem -x ont2d ref.fa reads.fq > aln.sam
|
||||
|
||||
BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
|
||||
error rates (or sequence divergence). Generally, BWA-MEM is more tolerant of
|
||||
errors given longer query sequences as the chance of missing all seeds is small.
|
||||
As is shown above, with non-default settings, BWA-MEM works with PacBio subreads
|
||||
with a sequencing error rate as high as ~15%.
|
||||
As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
|
||||
reads with a sequencing error rate over 20%.
|
||||
|
||||
####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
|
||||
|
||||
|
|
|
|||
23
bntseq.c
23
bntseq.c
|
|
@ -179,17 +179,25 @@ bntseq_t *bns_restore(const char *prefix)
|
|||
if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
|
||||
char str[1024];
|
||||
khash_t(str) *h;
|
||||
int i, absent;
|
||||
int c, i, absent;
|
||||
khint_t k;
|
||||
h = kh_init(str);
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
k = kh_put(str, h, bns->anns[i].name, &absent);
|
||||
kh_val(h, k) = i;
|
||||
}
|
||||
while (fscanf(fp, "%s", str) == 1) {
|
||||
k = kh_get(str, h, str);
|
||||
if (k != kh_end(h))
|
||||
bns->anns[kh_val(h, k)].is_alt = 1;
|
||||
i = 0;
|
||||
while ((c = fgetc(fp)) != EOF) {
|
||||
if (c == '\t' || c == '\n' || c == '\r') {
|
||||
str[i] = 0;
|
||||
if (str[0] != '@') {
|
||||
k = kh_get(str, h, str);
|
||||
if (k != kh_end(h))
|
||||
bns->anns[kh_val(h, k)].is_alt = 1;
|
||||
}
|
||||
while (c != '\n' && c != EOF) c = fgetc(fp);
|
||||
i = 0;
|
||||
} else str[i++] = c; // FIXME: potential segfault here
|
||||
}
|
||||
kh_destroy(str, h);
|
||||
fclose(fp);
|
||||
|
|
@ -226,7 +234,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_
|
|||
}
|
||||
p = bns->anns + bns->n_seqs;
|
||||
p->name = strdup((char*)seq->name.s);
|
||||
p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
|
||||
p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
|
||||
p->gi = 0; p->len = seq->seq.l;
|
||||
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
|
||||
p->n_ambs = 0;
|
||||
|
|
@ -358,8 +366,9 @@ int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
|
|||
{
|
||||
int is_rev, rid_b, rid_e;
|
||||
if (rb < bns->l_pac && re > bns->l_pac) return -2;
|
||||
assert(rb <= re);
|
||||
rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
|
||||
rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1);
|
||||
rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b;
|
||||
return rid_b == rid_e? rid_b : -1;
|
||||
}
|
||||
|
||||
|
|
|
|||
299
bwa-postalt.js
299
bwa-postalt.js
|
|
@ -52,7 +52,7 @@ var getopt = function(args, ostr) {
|
|||
return optopt;
|
||||
}
|
||||
|
||||
// print an object in a format similar to JSON. For debugging.
|
||||
// print an object in a format similar to JSON. For debugging only.
|
||||
function obj2str(o)
|
||||
{
|
||||
if (typeof(o) != 'object') {
|
||||
|
|
@ -203,54 +203,31 @@ function parse_hit(s, opt)
|
|||
return h;
|
||||
}
|
||||
|
||||
// Read the ALT-to-REF alignment (a SAM file) and generate a per-contig interval index.
//   fn      - path of the SAM file holding the ALT-to-reference alignment
//   returns - an object keyed by query name (SAM column 1); each value is whatever
//             intv_ovlp() builds from that contig's interval list. Callers invoke it
//             as idx[ctg](start, end) to fetch the overlapping intervals.
// Each interval pushed below is the 7-element array
//   [query start, query end, query length incl. clips, reference name,
//    reverse-strand flag, 0-based reference position, parsed CIGAR].
|
||||
function read_ALT_sam(fn)
|
||||
{
|
||||
var intv = {}; // query name -> list of alignment intervals
|
||||
var file = new File(fn);
|
||||
var buf = new Bytes();
|
||||
while (file.readline(buf) >= 0) {
|
||||
var line = buf.toString();
|
||||
var t = line.split("\t");
|
||||
if (line.charAt(0) == '@') continue; // skip SAM header lines
|
||||
var flag = parseInt(t[1]);
|
||||
var m, cigar = [], l_qaln = 0, l_qclip = 0; // l_qaln: aligned query length (M+I); l_qclip: clipped query length (S+H)
|
||||
while ((m = re_cigar.exec(t[5])) != null) { // m[1] is the operation length, m[2] the operation letter
|
||||
var l = parseInt(m[1]);
|
||||
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
|
||||
if (m[2] == 'M' || m[2] == 'I') l_qaln += l;
|
||||
else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
|
||||
}
|
||||
var j = flag&16? cigar.length-1 : 0; // inspect the last CIGAR op when flag bit 16 is set, otherwise the first
|
||||
var start = cigar[j][0] == 'S'? cigar[j][1] : 0; // clip length at that end, or 0 if that op is not a clip
|
||||
if (intv[t[0]] == null) intv[t[0]] = [];
|
||||
intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); // POS (column 4) is stored minus one
|
||||
//print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar);
|
||||
}
|
||||
buf.destroy();
|
||||
file.close();
|
||||
// create index for intervals on ALT contigs
|
||||
var idx = {};
|
||||
for (var ctg in intv)
|
||||
idx[ctg] = intv_ovlp(intv[ctg]);
|
||||
return idx;
|
||||
}
|
||||
|
||||
function bwa_postalt(args)
|
||||
{
|
||||
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true };
|
||||
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, recover_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false };
|
||||
|
||||
while ((c = getopt(args, 'pqv:')) != null) {
|
||||
while ((c = getopt(args, 'Pqev:p:')) != null) {
|
||||
if (c == 'v') opt.verbose = parseInt(getopt.arg);
|
||||
else if (c == 'p') opt.show_pri = true;
|
||||
else if (c == 'p') opt.pre = getopt.arg;
|
||||
else if (c == 'P') opt.show_pri = true;
|
||||
else if (c == 'q') opt.recover_maq = false;
|
||||
else if (c == 'e') opt.show_ev = true;
|
||||
}
|
||||
|
||||
if (opt.show_ev && opt.pre == null) {
|
||||
warn("ERROR: option '-p' must be specified if '-e' is applied.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (args.length == getopt.ind) {
|
||||
print("");
|
||||
print("Usage: k8 bwa-postalt.js [-p] <alt.sam> [aln.sam]\n");
|
||||
print("Options: -p output lifted non-ALT hit in a SAM line (for ALT-unware alignments)");
|
||||
print(" -q don't recover mapQ for non-ALTs hit overlapping lifted ALT");
|
||||
print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
|
||||
print("Options: -p STR prefix of file(s) for additional information [null]");
|
||||
print(" PREFIX.ctw - weight of each ALT contig");
|
||||
print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)");
|
||||
print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT");
|
||||
print(" -e show reads supporting ALT contigs into file PREFIX.evi");
|
||||
print("");
|
||||
print("Note: This script inspects the XA tag, lifts the mapping positions of ALT hits to");
|
||||
print(" the primary assembly, groups them and then estimates mapQ across groups. If");
|
||||
|
|
@ -262,12 +239,56 @@ function bwa_postalt(args)
|
|||
exit(1);
|
||||
}
|
||||
|
||||
var file, buf = new Bytes();
|
||||
var fp_evi = opt.show_ev && opt.pre? new File(opt.pre + '.evi', "w") : null;
|
||||
var aux = new Bytes(); // used for reverse and reverse complement
|
||||
var idx = read_ALT_sam(args[getopt.ind]);
|
||||
var buf2 = [];
|
||||
var buf = new Bytes();
|
||||
|
||||
// read ALT-to-REF alignment
|
||||
var intv_alt = {}, intv_pri = {}, idx_un = {};
|
||||
var file = new File(args[getopt.ind]);
|
||||
while (file.readline(buf) >= 0) {
|
||||
var line = buf.toString();
|
||||
if (line.charAt(0) == '@') continue;
|
||||
var t = line.split("\t");
|
||||
if (t.length < 11) continue; // incomplete lines
|
||||
var pos = parseInt(t[3]) - 1;
|
||||
var flag = parseInt(t[1]);
|
||||
if ((flag&4) || t[2] == '*') {
|
||||
idx_un[t[0]] = true;
|
||||
continue;
|
||||
}
|
||||
var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
|
||||
while ((m = re_cigar.exec(t[5])) != null) {
|
||||
var l = parseInt(m[1]);
|
||||
cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
|
||||
if (m[2] == 'M') l_qaln += l, l_tlen += l;
|
||||
else if (m[2] == 'I') l_qaln += l;
|
||||
else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
|
||||
else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
|
||||
}
|
||||
var j = flag&16? cigar.length-1 : 0;
|
||||
var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
|
||||
if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
|
||||
intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
|
||||
if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
|
||||
intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
|
||||
}
|
||||
file.close();
|
||||
var idx_alt = {}, idx_pri = {};
|
||||
for (var ctg in intv_alt)
|
||||
idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
|
||||
for (var ctg in intv_pri)
|
||||
idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
|
||||
|
||||
// initialize the list of ALT contigs
|
||||
var weight_alt = [];
|
||||
for (var ctg in idx_alt)
|
||||
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]];
|
||||
for (var ctg in idx_un)
|
||||
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0];
|
||||
|
||||
// process SAM
|
||||
var buf2 = [];
|
||||
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, line = buf.toString();
|
||||
|
|
@ -286,36 +307,49 @@ function bwa_postalt(args)
|
|||
buf2 = [];
|
||||
}
|
||||
|
||||
if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) {
|
||||
// skip unmapped lines
|
||||
if (t[1]&4) {
|
||||
buf2.push(t);
|
||||
continue;
|
||||
}
|
||||
var XA_strs = m[1].split(";");
|
||||
|
||||
// parse the reported hit
|
||||
var hits = [];
|
||||
var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
|
||||
var flag = t[1];
|
||||
var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
|
||||
if (h.hard) { // don't process lines with hard clips
|
||||
if (h.hard) { // the following does not work with hard clipped alignments
|
||||
buf2.push(t);
|
||||
continue;
|
||||
}
|
||||
hits.push(h);
|
||||
var hits = [h];
|
||||
|
||||
// parse hits in the XA tag
|
||||
for (var i = 0; i < XA_strs.length; ++i)
|
||||
if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
|
||||
hits.push(parse_hit(XA_strs[i].split(","), opt));
|
||||
if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) {
|
||||
var XA_strs = m[1].split(";");
|
||||
for (var i = 0; i < XA_strs.length; ++i)
|
||||
if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
|
||||
hits.push(parse_hit(XA_strs[i].split(","), opt));
|
||||
}
|
||||
|
||||
// check if there are ALT hits
|
||||
var has_alt = false;
|
||||
for (var i = 0; i < hits.length; ++i)
|
||||
if (weight_alt[hits[i].ctg] != null) {
|
||||
has_alt = true;
|
||||
break;
|
||||
}
|
||||
if (!has_alt) {
|
||||
buf2.push(t);
|
||||
continue;
|
||||
}
|
||||
|
||||
// lift mapping positions to the primary assembly
|
||||
var n_lifted = 0, n_rpt_lifted = 0;
|
||||
var n_rpt_lifted = 0, rpt_lifted = null;
|
||||
for (var i = 0; i < hits.length; ++i) {
|
||||
var h = hits[i];
|
||||
var a, h = hits[i];
|
||||
|
||||
if (idx[h.ctg] == null) continue;
|
||||
var a = idx[h.ctg](h.start, h.end);
|
||||
if (a == null || a.length == 0) continue;
|
||||
if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0)
|
||||
continue;
|
||||
|
||||
// find the approximate position on the primary assembly
|
||||
var lifted = [];
|
||||
|
|
@ -333,36 +367,46 @@ function bwa_postalt(args)
|
|||
lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
|
||||
if (i == 0) ++n_rpt_lifted;
|
||||
}
|
||||
if (lifted.length) ++n_lifted, hits[i].lifted = lifted;
|
||||
}
|
||||
if (n_lifted == 0) {
|
||||
buf2.push(t);
|
||||
continue;
|
||||
if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0);
|
||||
if (lifted.length) hits[i].lifted = lifted;
|
||||
}
|
||||
|
||||
// group hits based on the lifted positions on the primary assembly
|
||||
// prepare for hits grouping
|
||||
for (var i = 0; i < hits.length; ++i) { // set keys for sorting
|
||||
if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used
|
||||
if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used
|
||||
hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
|
||||
else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
|
||||
hits[i].i = i; // keep the original index
|
||||
}
|
||||
hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
|
||||
var last_chr = null, end = 0, g = -1;
|
||||
for (var i = 0; i < hits.length; ++i) {
|
||||
if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
|
||||
else if (hits[i].pstart >= end) ++g;
|
||||
hits[i].g = g;
|
||||
end = end > hits[i].pend? end : hits[i].pend;
|
||||
|
||||
// group hits based on the lifted positions on non-ALT sequences
|
||||
if (hits.length > 1) {
|
||||
hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
|
||||
var last_chr = null, end = 0, g = -1;
|
||||
for (var i = 0; i < hits.length; ++i) {
|
||||
if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
|
||||
else if (hits[i].pstart >= end) ++g;
|
||||
hits[i].g = g;
|
||||
end = end > hits[i].pend? end : hits[i].pend;
|
||||
}
|
||||
} else hits[0].g = 0;
|
||||
|
||||
// find the index and group id of the reported hit; find the size of the reported group
|
||||
var reported_g = null, reported_i = null, n_group0 = 0;
|
||||
if (hits.length > 1) {
|
||||
for (var i = 0; i < hits.length; ++i)
|
||||
if (hits[i].i == 0)
|
||||
reported_g = hits[i].g, reported_i = i;
|
||||
for (var i = 0; i < hits.length; ++i)
|
||||
if (hits[i].g == reported_g)
|
||||
++n_group0;
|
||||
} else {
|
||||
if (weight_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
|
||||
buf2.push(t);
|
||||
continue;
|
||||
}
|
||||
reported_g = reported_i = 0, n_group0 = 1;
|
||||
}
|
||||
var reported_g = null, reported_i = null;
|
||||
for (var i = 0; i < hits.length; ++i)
|
||||
if (hits[i].i == 0)
|
||||
reported_g = hits[i].g, reported_i = i;
|
||||
var n_group0 = 0; // #hits overlapping the reported hit
|
||||
for (var i = 0; i < hits.length; ++i)
|
||||
if (hits[i].g == reported_g)
|
||||
++n_group0;
|
||||
|
||||
// re-estimate mapping quality if necessary
|
||||
var mapQ, ori_mapQ = t[4];
|
||||
|
|
@ -379,12 +423,65 @@ function bwa_postalt(args)
|
|||
mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
|
||||
} else mapQ = 0;
|
||||
mapQ = mapQ < 60? mapQ : 60;
|
||||
mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
|
||||
if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ;
|
||||
else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
|
||||
} else mapQ = t[4];
|
||||
|
||||
// ALT genotyping
|
||||
if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) {
|
||||
// collect all overlapping ALT contigs
|
||||
var hits2 = [];
|
||||
for (var i = 0; i < hits.length; ++i) {
|
||||
var h = hits[i];
|
||||
if (h.g == reported_g)
|
||||
hits2.push([h.pctg, h.pstart, h.pend, h.ctg, h.score, h.NM]);
|
||||
}
|
||||
var start = hits2[0][1], end = hits2[0][2];
|
||||
for (var i = 1; i < hits2.length; ++i)
|
||||
end = end > hits2[i][2]? end : hits2[i][2];
|
||||
var alts = {};
|
||||
for (var i = 0; i < hits2.length; ++i)
|
||||
if (weight_alt[hits2[i][3]] != null)
|
||||
alts[hits2[i][3]] = [hits2[i][4], hits2[i][5]];
|
||||
if (idx_pri[hits2[0][0]] != null) { // add other unreported hits
|
||||
var ovlp = idx_pri[hits2[0][0]](start, end);
|
||||
for (var i = 0; i < ovlp.length; ++i)
|
||||
if (ovlp[i][0] <= start && end <= ovlp[i][1] && alts[ovlp[i][2]] == null)
|
||||
alts[ovlp[i][2]] = [0, 0];
|
||||
}
|
||||
|
||||
// add weight to each ALT contig
|
||||
var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30, max_nm = -1;
|
||||
for (var ctg in alts)
|
||||
alt_arr.push([ctg, alts[ctg][0], 0, alts[ctg][1]]);
|
||||
for (var i = 0; i < alt_arr.length; ++i) {
|
||||
if (max_sc < alt_arr[i][1])
|
||||
max_sc = alt_arr[i][1], max_i = i;
|
||||
min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1];
|
||||
var nm = alt_arr[i][1] > 0? alt_arr[i][3] : opt.max_nm_sc;
|
||||
max_nm = max_nm > nm? max_nm : nm;
|
||||
}
|
||||
if (max_nm > opt.max_nm_sc) max_nm = opt.max_nm_sc;
|
||||
if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) {
|
||||
for (var i = 0; i < alt_arr.length; ++i)
|
||||
sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc)));
|
||||
for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum;
|
||||
for (var i = 0; i < alt_arr.length; ++i) {
|
||||
var e = [alt_arr[i][0], 1, alt_arr[max_i][2], alt_arr[i][2], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]];
|
||||
var w = weight_alt[e[0]];
|
||||
for (var j = 0; j < 6; ++j) w[j] += e[j+1];
|
||||
if (fp_evi) {
|
||||
e[2] = e[2].toFixed(3); e[3] = e[3].toFixed(3);
|
||||
e[5] = e[5].toFixed(3); e[6] = e[6].toFixed(3);
|
||||
fp_evi.write(t[0] + '/' + (t[1]>>6&3) + '\t' + e.join("\t") + '\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality
|
||||
if (opt.recover_mapq && n_rpt_lifted == 1 && mapQ > 0) {
|
||||
var l = lifted[0];
|
||||
var l = rpt_lifted;
|
||||
for (var i = 0; i < buf2.length; ++i) {
|
||||
var s = buf2[i];
|
||||
if (l[0] != s[2]) continue; // different chr
|
||||
|
|
@ -423,6 +520,7 @@ function bwa_postalt(args)
|
|||
need_rev = true;
|
||||
}
|
||||
if (need_rev) { // reverse and reverse complement
|
||||
aux.length = 0;
|
||||
aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
|
||||
aux.set(t[10],0); aux.reverse(); rq = aux.toString();
|
||||
}
|
||||
|
|
@ -438,14 +536,16 @@ function bwa_postalt(args)
|
|||
for (var i = 0; i < hits.length; ++i) {
|
||||
if (opt.verbose >= 5) print(obj2str(hits[i]));
|
||||
if (hits[i].g != reported_g || i == reported_i) continue;
|
||||
if (!opt.show_pri && idx[hits[i].ctg] == null) continue;
|
||||
var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0];
|
||||
// update name
|
||||
if (flag&0x40) s[0] += "/1";
|
||||
if (flag&0x80) s[0] += "/2";
|
||||
s[0] += "_" + (++cnt);
|
||||
if (hits[i].rev == hits[reported_i].rev) s.push(t[9], t[10]);
|
||||
else s.push(rs, rq);
|
||||
if (!opt.show_pri && idx_alt[hits[i].ctg] == null) continue;
|
||||
var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0];
|
||||
// print sequence/quality and set the rev flag
|
||||
if (hits[i].rev == hits[reported_i].rev) {
|
||||
s.push(t[9], t[10]);
|
||||
s[1] = flag | 0x800;
|
||||
} else {
|
||||
s.push(rs, rq);
|
||||
s[1] = (flag ^ 0x10) | 0x800;
|
||||
}
|
||||
s.push("NM:i:" + hits[i].NM);
|
||||
if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
|
||||
buf2.push(s);
|
||||
|
|
@ -454,9 +554,30 @@ function bwa_postalt(args)
|
|||
for (var i = 0; i < buf2.length; ++i)
|
||||
print(buf2[i].join("\t"));
|
||||
file.close();
|
||||
if (fp_evi != null) fp_evi.close();
|
||||
|
||||
aux.destroy();
|
||||
buf.destroy();
|
||||
aux.destroy();
|
||||
|
||||
// print weight of each contig
|
||||
if (opt.pre != null) {
|
||||
var fpout = new File(opt.pre + '.ctw', "w");
|
||||
var weight_arr = [];
|
||||
for (var ctg in weight_alt) {
|
||||
var w = weight_alt[ctg];
|
||||
weight_arr.push([ctg, w[6], w[7], w[8],
|
||||
w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000',
|
||||
w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']);
|
||||
}
|
||||
weight_arr.sort(function(a,b) {
|
||||
return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0;
|
||||
});
|
||||
for (var i = 0; i < weight_arr.length; ++i) {
|
||||
if (weight_arr[i][1] == '~') weight_arr[i][1] = '*';
|
||||
fpout.write(weight_arr[i].join("\t") + '\n');
|
||||
}
|
||||
fpout.close();
|
||||
}
|
||||
}
|
||||
|
||||
bwa_postalt(arguments);
|
||||
|
|
|
|||
89
bwa.c
89
bwa.c
|
|
@ -227,7 +227,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
|
|||
return bwt;
|
||||
}
|
||||
|
||||
bwaidx_t *bwa_idx_load(const char *hint, int which)
|
||||
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
|
||||
{
|
||||
bwaidx_t *idx;
|
||||
char *prefix;
|
||||
|
|
@ -239,7 +239,12 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
|
|||
idx = calloc(1, sizeof(bwaidx_t));
|
||||
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
||||
if (which & BWA_IDX_BNS) {
|
||||
int i, c;
|
||||
idx->bns = bns_restore(prefix);
|
||||
for (i = c = 0; i < idx->bns->n_seqs; ++i)
|
||||
if (idx->bns->anns[i].is_alt) ++c;
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
|
||||
if (which & BWA_IDX_PAC) {
|
||||
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
|
||||
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
||||
|
|
@ -251,15 +256,91 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
|
|||
return idx;
|
||||
}
|
||||
|
||||
// Load a BWA index. This entry point passes `hint` and `which` unchanged to
// bwa_idx_load_from_disk() and returns whatever that call returns.
bwaidx_t *bwa_idx_load(const char *hint, int which)
|
||||
{
|
||||
return bwa_idx_load_from_disk(hint, which);
|
||||
}
|
||||
|
||||
void bwa_idx_destroy(bwaidx_t *idx)
|
||||
{
|
||||
if (idx == 0) return;
|
||||
if (idx->bwt) bwt_destroy(idx->bwt);
|
||||
if (idx->bns) bns_destroy(idx->bns);
|
||||
if (idx->pac) free(idx->pac);
|
||||
if (idx->mem == 0) {
|
||||
if (idx->bwt) bwt_destroy(idx->bwt);
|
||||
if (idx->bns) bns_destroy(idx->bns);
|
||||
if (idx->pac) free(idx->pac);
|
||||
} else {
|
||||
free(idx->bwt); free(idx->bns->anns); free(idx->bns);
|
||||
if (!idx->is_shm) free(idx->mem);
|
||||
}
|
||||
free(idx);
|
||||
}
|
||||
|
||||
// Populate `idx` from a single contiguous memory block `mem` of `l_mem` bytes.
// The block is walked front to back with byte offset `k`, in this order:
//   bwt_t header, BWT array, SA array, bntseq_t header, ambs array, anns array,
//   per-sequence name/anno strings, packed sequence (pac).
// The bwt_t header, the bntseq_t header and the anns array are copied into fresh
// malloc'ed storage; every other field is set to point directly into `mem`, so
// `mem` has to stay valid for as long as those pointers are used.
// Asserts that exactly `l_mem` bytes were consumed. Always returns 0.
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
|
||||
{
|
||||
int64_t k = 0, x; // k: current byte offset into mem; x: size of the piece being consumed
|
||||
int i;
|
||||
|
||||
// generate idx->bwt
|
||||
x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x; // private copy of the header, so its pointer fields can be patched
|
||||
x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x; // BWT array is referenced in place (bwt_size 4-byte words)
|
||||
x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x; // SA array is referenced in place
|
||||
|
||||
// generate idx->bns and idx->pac
|
||||
x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x; // private copy of the header
|
||||
x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x; // ambs array is referenced in place
|
||||
x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x; // anns is copied because name/anno are re-pointed below
|
||||
for (i = 0; i < idx->bns->n_seqs; ++i) {
|
||||
idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1; // NUL-terminated string stored in mem; +1 skips the terminator
|
||||
idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1; // anno string follows immediately after name
|
||||
}
|
||||
idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1; // packed sequence is referenced in place; l_pac/4+1 bytes
|
||||
assert(k == l_mem); // the walk must consume the block exactly
|
||||
|
||||
idx->l_mem = k; idx->mem = mem; // record the backing block and its size on idx
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Serialize a fully loaded index into one contiguous heap block.
 *
 * The pieces are appended in the order bwa_mem2idx() expects, and each
 * original allocation is freed once its bytes have been copied. The BWT
 * array's own buffer is grown and reused as the start of the block. The
 * function finishes by handing the block to bwa_mem2idx(), which rebuilds the
 * pointers of `idx`; its return value is passed through.
 */
int bwa_idx2mem(bwaidx_t *idx)
{
	int s;
	int64_t off, len, heap_len;
	uint8_t *buf;
	bwt_t *bwt = idx->bwt;
	bntseq_t *bns = idx->bns;

	// FM-index: grow the BWT buffer, shift the array up and put the header in front
	len = bwt->bwt_size * 4;
	buf = realloc(bwt->bwt, sizeof(bwt_t) + len);
	bwt->bwt = 0;
	memmove(buf + sizeof(bwt_t), buf, len);
	memcpy(buf, bwt, sizeof(bwt_t));
	off = sizeof(bwt_t) + len;

	len = bwt->n_sa * sizeof(bwtint_t);
	buf = realloc(buf, off + len);
	memcpy(buf + off, bwt->sa, len);
	off += len;
	free(bwt->sa);
	free(bwt);
	idx->bwt = 0;

	// sequence dictionary: first size everything that lives on the heap
	heap_len = bns->n_seqs * sizeof(bntann1_t) + bns->n_holes * sizeof(bntamb1_t);
	for (s = 0; s < bns->n_seqs; ++s)
		heap_len += strlen(bns->anns[s].name) + strlen(bns->anns[s].anno) + 2;
	buf = realloc(buf, off + sizeof(bntseq_t) + heap_len);

	len = sizeof(bntseq_t);
	memcpy(buf + off, bns, len);
	off += len;

	len = bns->n_holes * sizeof(bntamb1_t);
	memcpy(buf + off, bns->ambs, len);
	off += len;
	free(bns->ambs);

	len = bns->n_seqs * sizeof(bntann1_t);
	memcpy(buf + off, bns->anns, len);
	off += len;

	for (s = 0; s < bns->n_seqs; ++s) {
		bntann1_t *ann = &bns->anns[s];
		len = strlen(ann->name) + 1;
		memcpy(buf + off, ann->name, len);
		off += len;
		len = strlen(ann->anno) + 1;
		memcpy(buf + off, ann->anno, len);
		off += len;
		free(ann->name);
		free(ann->anno);
	}
	free(bns->anns);

	// packed reference sequence
	len = bns->l_pac/4+1;
	buf = realloc(buf, off + len);
	memcpy(buf + off, idx->pac, len);
	off += len;
	free(bns);
	idx->bns = 0;
	free(idx->pac);
	idx->pac = 0;

	return bwa_mem2idx(off, buf, idx);
}
|
||||
|
||||
/***********************
|
||||
* SAM header routines *
|
||||
***********************/
|
||||
|
|
|
|||
10
bwa.h
10
bwa.h
|
|
@ -10,10 +10,16 @@
|
|||
#define BWA_IDX_PAC 0x4
|
||||
#define BWA_IDX_ALL 0x7
|
||||
|
||||
#define BWA_CTL_SIZE 0x10000
|
||||
|
||||
typedef struct {
|
||||
bwt_t *bwt; // FM-index
|
||||
bntseq_t *bns; // information on the reference sequences
|
||||
uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
|
||||
|
||||
int is_shm; // set to 1 when `mem` is a shared-memory mapping; bwa_idx_destroy() then does not free() it
|
||||
int64_t l_mem; // size of `mem` in bytes
|
||||
uint8_t *mem; // contiguous image of the whole index filled in by bwa_mem2idx(); 0 when bwt/bns/pac are separate allocations
|
||||
} bwaidx_t;
|
||||
|
||||
typedef struct {
|
||||
|
|
@ -37,8 +43,12 @@ extern "C" {
|
|||
char *bwa_idx_infer_prefix(const char *hint);
|
||||
bwt_t *bwa_idx_load_bwt(const char *hint);
|
||||
|
||||
bwaidx_t *bwa_idx_load_from_shm(const char *hint);
|
||||
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
|
||||
bwaidx_t *bwa_idx_load(const char *hint, int which);
|
||||
void bwa_idx_destroy(bwaidx_t *idx);
|
||||
int bwa_idx2mem(bwaidx_t *idx);
|
||||
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
|
||||
|
||||
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
|
||||
char *bwa_set_rg(const char *s);
|
||||
|
|
|
|||
24
bwamem.c
24
bwamem.c
|
|
@ -73,7 +73,7 @@ mem_opt_t *mem_opt_init()
|
|||
o->chunk_size = 10000000;
|
||||
o->n_threads = 1;
|
||||
o->max_XA_hits = 5;
|
||||
o->max_XA_hits_alt = 50;
|
||||
o->max_XA_hits_alt = 200;
|
||||
o->max_matesw = 50;
|
||||
o->mask_level_redun = 0.95;
|
||||
o->min_chain_weight = 0;
|
||||
|
|
@ -512,38 +512,38 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *
|
|||
|
||||
int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
|
||||
{
|
||||
int i, j, n_pri;
|
||||
int i, n_pri;
|
||||
int_v z = {0,0,0};
|
||||
if (n == 0) return 0;
|
||||
for (i = n_pri = 0; i < n; ++i) {
|
||||
a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_alt = -1, a[i].hash = hash_64(id+i);
|
||||
a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i);
|
||||
if (!a[i].is_alt) ++n_pri;
|
||||
}
|
||||
ks_introsort(mem_ars_hash, n, a);
|
||||
mem_mark_primary_se_core(opt, n, a, &z);
|
||||
for (i = 0; i < n; ++i) {
|
||||
mem_alnreg_t *p = &a[i];
|
||||
p->secondary_alt = i; // keep the rank in the first round
|
||||
p->secondary_all = i; // keep the rank in the first round
|
||||
if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
|
||||
p->alt_sc = a[p->secondary].score;
|
||||
}
|
||||
if (n_pri >= 0 && n_pri < n) {
|
||||
kv_resize(int, z, n);
|
||||
if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
|
||||
for (i = 0; i < n; ++i) z.a[a[i].secondary_alt] = i;
|
||||
for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i;
|
||||
for (i = 0; i < n; ++i) {
|
||||
if (a[i].secondary < 0) {
|
||||
a[i].secondary_alt = -1;
|
||||
continue;
|
||||
}
|
||||
j = z.a[a[i].secondary];
|
||||
a[i].secondary_alt = a[j].is_alt? j : -1;
|
||||
if (a[i].is_alt) a[i].secondary = INT_MAX;
|
||||
if (a[i].secondary >= 0) {
|
||||
a[i].secondary_all = z.a[a[i].secondary];
|
||||
if (a[i].is_alt) a[i].secondary = INT_MAX;
|
||||
} else a[i].secondary_all = -1;
|
||||
}
|
||||
if (n_pri > 0) { // mark primary for hits to the primary assembly only
|
||||
for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
|
||||
mem_mark_primary_se_core(opt, n_pri, a, &z);
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; ++i)
|
||||
a[i].secondary_all = a[i].secondary;
|
||||
}
|
||||
free(z.a);
|
||||
return n_pri;
|
||||
|
|
|
|||
2
bwamem.h
2
bwamem.h
|
|
@ -69,7 +69,7 @@ typedef struct {
|
|||
int w; // actual band width used in extension
|
||||
int seedcov; // length of regions covered by seeds
|
||||
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
|
||||
int secondary_alt;
|
||||
int secondary_all;
|
||||
int seedlen0; // length of the starting seed
|
||||
int n_comp:30, is_alt:2; // number of sub-alignments chained together
|
||||
float frac_rep;
|
||||
|
|
|
|||
|
|
@ -112,34 +112,35 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
|
|||
s->sam = str.s;
|
||||
}
|
||||
|
||||
static inline void get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i, int r[2])
|
||||
static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
|
||||
{
|
||||
int j = a[i].secondary, k = a[i].secondary_alt;
|
||||
r[0] = r[1] = -1;
|
||||
if (j >= 0 && j < INT_MAX && !a[j].is_alt && !a[i].is_alt && a[i].score >= a[j].score * XA_drop_ratio) r[0] = j;
|
||||
if (k >= 0 && a[k].is_alt && a[i].score >= a[k].score * XA_drop_ratio) r[1] = k;
|
||||
int k = a[i].secondary_all;
|
||||
if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
|
||||
char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
|
||||
{
|
||||
int i, k, *cnt, tot, r[2];
|
||||
int i, k, r, *cnt, tot;
|
||||
kstring_t *aln = 0, str = {0,0,0};
|
||||
char **XA = 0;
|
||||
char **XA = 0, *has_alt;
|
||||
|
||||
cnt = calloc(a->n, sizeof(int));
|
||||
has_alt = calloc(a->n, 1);
|
||||
for (i = 0, tot = 0; i < a->n; ++i) {
|
||||
get_pri_idx(opt->XA_drop_ratio, a->a, i, r);
|
||||
if (r[0] >= 0) ++cnt[r[0]], ++tot;
|
||||
if (r[1] >= 0) ++cnt[r[1]], ++tot;
|
||||
r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
|
||||
if (r >= 0) {
|
||||
++cnt[r], ++tot;
|
||||
if (a->a[i].is_alt) has_alt[r] = 1;
|
||||
}
|
||||
}
|
||||
if (tot == 0) goto end_gen_alt;
|
||||
aln = calloc(a->n, sizeof(kstring_t));
|
||||
for (i = 0; i < a->n; ++i) {
|
||||
mem_aln_t t;
|
||||
get_pri_idx(opt->XA_drop_ratio, a->a, i, r);
|
||||
if (r[0] < 0 && r[1] < 0) continue;
|
||||
if ((r[0] >= 0 && cnt[r[0]] > opt->max_XA_hits) || (r[1] >= 0 && cnt[r[1]] > opt->max_XA_hits_alt)) continue;
|
||||
if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
|
||||
if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
|
||||
t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
|
||||
str.l = 0;
|
||||
kputs(bns->anns[t.rid].name, &str);
|
||||
|
|
@ -152,14 +153,13 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
|
|||
kputc(',', &str); kputw(t.NM, &str);
|
||||
kputc(';', &str);
|
||||
free(t.cigar);
|
||||
if (r[0] >= 0 && cnt[r[0]] <= opt->max_XA_hits) kputsn(str.s, str.l, &aln[r[0]]);
|
||||
if (r[1] >= 0 && cnt[r[1]] <= opt->max_XA_hits_alt) kputsn(str.s, str.l, &aln[r[1]]);
|
||||
kputsn(str.s, str.l, &aln[r]);
|
||||
}
|
||||
XA = calloc(a->n, sizeof(char*));
|
||||
for (k = 0; k < a->n; ++k)
|
||||
XA[k] = aln[k].s;
|
||||
|
||||
end_gen_alt:
|
||||
free(cnt); free(aln); free(str.s);
|
||||
free(has_alt); free(cnt); free(aln); free(str.s);
|
||||
return XA;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,200 @@
|
|||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include "bwa.h"
|
||||
|
||||
/**
 * Copy a fully loaded index into POSIX shared memory and register it.
 *
 * A control object "/bwactl" (BWA_CTL_SIZE bytes) holds two 16-bit counters,
 * the number of registered indices and the offset of the next free byte,
 * followed by one record per index: an 8-byte image length and the
 * NUL-terminated index name. The image itself is stored in "/bwaidx-<name>",
 * where <name> is the part of `hint` after the last '/'.
 *
 * @param idx     index to stage; it is serialized with bwa_idx2mem() if it is
 *                not already one block. On success its pointers are rebuilt
 *                on top of the shared mapping and idx->is_shm is set.
 * @param hint    index path; must be non-empty.
 * @param _tmpfn  optional temporary file. When given, the image is written to
 *                this file and the heap copy is freed before the shared
 *                segment is filled, so both copies never coexist in memory.
 *                The file is removed afterwards.
 * @return 0 on success, -1 on failure. If a failure happens after the heap
 *         image has been released through the temporary file, `idx` is left
 *         empty (bwt/bns/pac/mem all 0) so that bwa_idx_destroy() stays safe.
 */
int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
{
	const char *name, *tmpfn = _tmpfn;
	uint8_t *shm, *shm_idx;
	uint16_t *cnt, i;
	int shmid, to_init = 0, l, n, ret = -1, mem_released = 0;
	char path[PATH_MAX + 1], *p;

	if (idx == 0 || hint == 0 || hint[0] == 0) return -1;
	name = strrchr(hint, '/');
	name = name? name + 1 : hint;
	n = snprintf(path, sizeof(path), "/bwaidx-%s", name);
	if (n < 0 || (size_t)n >= sizeof(path)) return -1; // segment name would not fit in `path`
	l = 8 + strlen(name) + 1; // record size: 8-byte length + NUL-terminated name

	if ((shmid = shm_open("/bwactl", O_RDWR, 0444)) < 0) {
		shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
		to_init = 1;
	}
	if (shmid < 0) return -1;
	// Some systems may refuse to resize an object that already has a size, so
	// a failure is only treated as fatal for a freshly created control block.
	if (ftruncate(shmid, BWA_CTL_SIZE) < 0 && to_init) {
		close(shmid);
		shm_unlink("/bwactl");
		return -1;
	}
	shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
	close(shmid); // the mapping does not need the descriptor any more
	if (shm == MAP_FAILED) return -1;
	cnt = (uint16_t*)shm;
	if (to_init) {
		memset(shm, 0, BWA_CTL_SIZE);
		cnt[1] = 4; // records start right after the two counters
	} else {
		for (i = 0, p = (char*)shm + 4; i < cnt[0]; ++i) {
			if (strcmp(p + 8, name) == 0) break;
			p += 9 + strlen(p + 8);
		}
		if (i < cnt[0]) {
			fprintf(stderr, "[W::%s] index '%s' is already in shared memory\n", __func__, name);
			goto done;
		}
	}
	// cnt[1] is 16-bit: reaching BWA_CTL_SIZE exactly would wrap it to 0
	if ((int)cnt[1] + l >= BWA_CTL_SIZE) goto done;

	if (idx->mem == 0) bwa_idx2mem(idx);

	if (tmpfn) {
		FILE *fp;
		if ((fp = fopen(tmpfn, "wb")) != 0) {
			int64_t rest = idx->l_mem;
			int ok;
			while (rest > 0) {
				int64_t chunk = rest < 0x1000000? rest : 0x1000000;
				size_t n_out = fwrite(&idx->mem[idx->l_mem - rest], 1, chunk, fp);
				if (n_out == 0) break; // write error; do not spin forever
				rest -= n_out;
			}
			ok = (rest == 0);
			if (fclose(fp) != 0) ok = 0;
			if (ok) {
				free(idx->mem); idx->mem = 0;
				mem_released = 1;
			} else { // keep the heap image and stage from memory instead
				fprintf(stderr, "[W::%s] fail to write the temporary file. Option '-f' is ignored.\n", __func__);
				unlink(tmpfn);
				tmpfn = 0;
			}
		} else {
			fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__);
			tmpfn = 0;
		}
	}

	if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
		perror("shm_open()"); // report first: shm_unlink() may overwrite errno
		shm_unlink(path);
		goto done;
	}
	if (ftruncate(shmid, idx->l_mem) < 0) {
		perror("ftruncate()");
		close(shmid);
		shm_unlink(path);
		goto done;
	}
	shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
	close(shmid);
	if (shm_idx == MAP_FAILED) {
		perror("mmap()");
		shm_unlink(path);
		goto done;
	}

	if (mem_released) { // bring the image back from the temporary file
		FILE *fp = fopen(tmpfn, "rb");
		int64_t rest = idx->l_mem;
		if (fp != 0) {
			while (rest > 0) {
				int64_t chunk = rest < 0x1000000? rest : 0x1000000;
				size_t n_in = fread(&shm_idx[idx->l_mem - rest], 1, chunk, fp);
				if (n_in == 0) break; // read error or short file
				rest -= n_in;
			}
			fclose(fp);
		}
		if (rest > 0) {
			fprintf(stderr, "[E::%s] fail to read back the temporary file\n", __func__);
			munmap(shm_idx, idx->l_mem);
			shm_unlink(path);
			goto done;
		}
	} else {
		memcpy(shm_idx, idx->mem, idx->l_mem);
		free(idx->mem); idx->mem = 0;
	}

	// publish the record only once the segment is complete
	memcpy(shm + cnt[1], &idx->l_mem, 8);
	memcpy(shm + cnt[1] + 8, name, l - 8);
	cnt[1] += l; ++cnt[0];

	// the headers built by the previous bwa_mem2idx() are heap copies; release
	// them before bwa_mem2idx() allocates fresh ones on top of the mapping
	free(idx->bwt); free(idx->bns->anns); free(idx->bns);
	bwa_mem2idx(idx->l_mem, shm_idx, idx);
	idx->is_shm = 1;
	ret = 0;

done:
	if (mem_released) unlink(tmpfn);
	if (ret < 0 && mem_released) { // heap image is gone: leave `idx` empty rather than dangling
		free(idx->bwt); free(idx->bns->anns); free(idx->bns);
		idx->bwt = 0; idx->bns = 0; idx->pac = 0; idx->l_mem = 0;
	}
	munmap(shm, BWA_CTL_SIZE);
	return ret;
}
|
||||
|
||||
bwaidx_t *bwa_idx_load_from_shm(const char *hint)
|
||||
{
|
||||
const char *name;
|
||||
uint8_t *shm, *shm_idx;
|
||||
uint16_t *cnt, i;
|
||||
char *p, path[PATH_MAX + 1];
|
||||
int shmid;
|
||||
int64_t l_mem;
|
||||
bwaidx_t *idx;
|
||||
|
||||
if (hint == 0 || hint[0] == 0) return 0;
|
||||
for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
|
||||
++name;
|
||||
if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return 0;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
if (cnt[0] == 0) return 0;
|
||||
for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) {
|
||||
memcpy(&l_mem, p, 8); p += 8;
|
||||
if (strcmp(p, name) == 0) break;
|
||||
p += strlen(p) + 1;
|
||||
}
|
||||
if (i == cnt[0]) return 0;
|
||||
|
||||
strcat(strcpy(path, "/bwaidx-"), name);
|
||||
if ((shmid = shm_open(path, O_RDONLY, 0444)) < 0) return 0;
|
||||
shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0);
|
||||
idx = calloc(1, sizeof(bwaidx_t));
|
||||
bwa_mem2idx(l_mem, shm_idx, idx);
|
||||
idx->is_shm = 1;
|
||||
return idx;
|
||||
}
|
||||
|
||||
int bwa_shm_list(void)
|
||||
{
|
||||
int shmid;
|
||||
uint16_t *cnt, i;
|
||||
char *p, *shm;
|
||||
if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
|
||||
int64_t l_mem;
|
||||
memcpy(&l_mem, p, 8); p += 8;
|
||||
printf("%s\t%ld\n", p, (long)l_mem);
|
||||
p += strlen(p) + 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bwa_shm_destroy(void)
|
||||
{
|
||||
int shmid;
|
||||
uint16_t *cnt, i;
|
||||
char *p, *shm;
|
||||
char path[PATH_MAX + 1];
|
||||
|
||||
if ((shmid = shm_open("/bwactl", O_RDONLY, 0444)) < 0) return -1;
|
||||
shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
|
||||
cnt = (uint16_t*)shm;
|
||||
for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
|
||||
int64_t l_mem;
|
||||
memcpy(&l_mem, p, 8); p += 8;
|
||||
strcat(strcpy(path, "/bwaidx-"), p);
|
||||
shm_unlink(path);
|
||||
p += strlen(p) + 1;
|
||||
}
|
||||
munmap(shm, BWA_CTL_SIZE);
|
||||
shm_unlink("/bwactl");
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main_shm(int argc, char *argv[])
|
||||
{
|
||||
int c, to_list = 0, to_drop = 0, ret = 0;
|
||||
char *tmpfn = 0;
|
||||
while ((c = getopt(argc, argv, "ldf:")) >= 0) {
|
||||
if (c == 'l') to_list = 1;
|
||||
else if (c == 'd') to_drop = 1;
|
||||
else if (c == 'f') tmpfn = optarg;
|
||||
}
|
||||
if (optind == argc && !to_list && !to_drop) {
|
||||
fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
|
||||
fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
|
||||
fprintf(stderr, " -l list names of indices in shared memory\n");
|
||||
fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
|
||||
return 1;
|
||||
}
|
||||
if (optind < argc && (to_list || to_drop)) {
|
||||
fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if (optind < argc) {
|
||||
bwaidx_t *idx;
|
||||
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
|
||||
if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
|
||||
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
|
||||
ret = 1;
|
||||
}
|
||||
bwa_idx_destroy(idx);
|
||||
}
|
||||
if (to_list) bwa_shm_list();
|
||||
if (to_drop) bwa_shm_destroy();
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -230,7 +230,11 @@ int main_mem(int argc, char *argv[])
|
|||
} else update_a(opt, &opt0);
|
||||
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
|
||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
||||
idx = bwa_idx_load_from_shm(argv[optind]);
|
||||
if (idx == 0) {
|
||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
||||
} else if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
|
||||
if (ignore_alt)
|
||||
for (i = 0; i < idx->bns->n_seqs; ++i)
|
||||
idx->bns->anns[i].is_alt = 0;
|
||||
|
|
|
|||
5
main.c
5
main.c
|
|
@ -4,7 +4,7 @@
|
|||
#include "utils.h"
|
||||
|
||||
#ifndef PACKAGE_VERSION
|
||||
#define PACKAGE_VERSION "0.7.10-r876-dirty"
|
||||
#define PACKAGE_VERSION "0.7.10-r907-dirty"
|
||||
#endif
|
||||
|
||||
int bwa_fa2pac(int argc, char *argv[]);
|
||||
|
|
@ -22,6 +22,7 @@ int bwa_bwtsw2(int argc, char *argv[]);
|
|||
|
||||
int main_fastmap(int argc, char *argv[]);
|
||||
int main_mem(int argc, char *argv[]);
|
||||
int main_shm(int argc, char *argv[]);
|
||||
|
||||
int main_pemerge(int argc, char *argv[]);
|
||||
|
||||
|
|
@ -43,6 +44,7 @@ static int usage()
|
|||
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
||||
fprintf(stderr, " bwasw BWA-SW for long queries\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, " shm manage indices in shared memory\n");
|
||||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||
fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n");
|
||||
|
|
@ -81,6 +83,7 @@ int main(int argc, char *argv[])
|
|||
else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
|
||||
else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
|
||||
else {
|
||||
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
|
||||
|
|
|
|||
Loading…
Reference in New Issue