Merge branch 'dev'
This commit is contained in:
commit
82ef0500e6
38
NEWS.md
38
NEWS.md
|
|
@ -1,29 +1,17 @@
|
|||
Release 0.7.11 (XX September, 2014)
|
||||
Release 0.7.11 (XX November, 2014)
|
||||
-----------------------------------
|
||||
|
||||
A major change to BWA-MEM is the support of mapping to ALT contigs. To use this
|
||||
feature, users need to manually create a file `indexbase.alt` with each line
|
||||
giving the name of an ALT contig. During alignment, BWA-MEM will be able to
|
||||
classify potential hits to ALT and non-ALT hits. It reports alignments and
|
||||
assigns mapping quality (mapQ) loosely following these rules:
|
||||
A major change to BWA-MEM is the support of mapping to ALT contigs in addition
|
||||
to the primary assembly. Part of the ALT mapping strategy is implemented in
|
||||
BWA-MEM and the rest in a postprocessing script for now. Due to the extra
|
||||
layer of complexity on generating the reference genome and on the two-step
|
||||
mapping, we start to provide a wrapper script and precompiled binaries since
|
||||
this release. Please check README-alt.md for details.
|
||||
|
||||
1. The original mapQ of a non-ALT hit is computed across non-ALT hits only.
|
||||
The reported mapQ of an ALT hit is always computed across all hits.
|
||||
|
||||
2. An ALT hit is only reported if its score is strictly better than all
|
||||
overlapping non-ALT hits. A reported ALT hit is flagged with 0x800
|
||||
(supplementary) unless there are no non-ALT hits.
|
||||
|
||||
3. The mapQ of a non-ALT hit is reduced to zero if its score is less than 80%
|
||||
(controlled by option `-g`) of the score of an overlapping ALT hit. In this
|
||||
case, the original mapQ is moved to the `om` tag.
|
||||
|
||||
This way, non-ALT alignments are only affected by ALT contigs if there are
|
||||
significantly better ALT alignments. BWA-MEM is carefully engineered such that
|
||||
ALT contigs do not interfere with the alignments to the primary assembly.
|
||||
|
||||
Users may consider to use ALT contigs from GRCh38. I am also constructing a
|
||||
non-redundant and more complete set of sequences missing from GRCh38.
|
||||
Another major addition to BWA-MEM is HLA typing, which made possible with the
|
||||
new ALT mapping strategy. Necessary data and programs are included in the
|
||||
binary release. The wrapper script also performs HLA typing when HLA genes are
|
||||
also included in the reference genome as additional ALT contigs.
|
||||
|
||||
Other notable changes to BWA-MEM:
|
||||
|
||||
|
|
@ -42,7 +30,9 @@ Other notable changes to BWA-MEM:
|
|||
* Added new pre-setting for Oxford Nanopore 2D reads. For small genomes,
|
||||
though, LAST is still more sensitive.
|
||||
|
||||
(0.7.11: XX September 2014, rXXX)
|
||||
* Added LAST-like seeding. This improves the accuracy for longer reads.
|
||||
|
||||
(0.7.11: XX November 2014, rXXX)
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
204
README-alt.md
204
README-alt.md
|
|
@ -1,85 +1,63 @@
|
|||
## Getting Started
|
||||
|
||||
Since version 0.7.11, BWA-MEM supports read mapping against a reference genome
|
||||
with long alternative haplotypes present in separate ALT contigs. To use the
|
||||
ALT-aware mode, users need to provide pairwise ALT-to-reference alignment in the
|
||||
SAM format and rename the file to "*idxbase*.alt". For GRCh38, this alignment
|
||||
is available from the [BWA resource bundle for GRCh38][res].
|
||||
|
||||
#### Option 1: Mapping to the official GRCh38 with ALT contigs
|
||||
|
||||
Construct the index:
|
||||
```sh
|
||||
wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
|
||||
gzip -d GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
|
||||
mv GCA_000001405.15_GRCh38_full_analysis_set.fna hs38a.fa
|
||||
bwa index hs38a.fa
|
||||
cp bwa-hs38-bundle/hs38d4.fa.alt hs38a.fa.alt
|
||||
# Download the bwa-0.7.11 binary package
|
||||
wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit-0.7.11_x64-linux.tar.bz2/download \
|
||||
| gzip -dc | tar xf -
|
||||
# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
|
||||
bwa.kit/run-gen-ref hs38d6 # download GRCh38 and write hs38d6.fa
|
||||
bwa.kit/bwa index hs38d6.fa # create BWA index
|
||||
# mapping
|
||||
bwa.kit/run-bwamem -o out hs38d6.fa read1.fq read2.fq | sh # skip "|sh" to show command lines
|
||||
```
|
||||
|
||||
Perform mapping:
|
||||
```sh
|
||||
bwa mem hs38a.fa read1.fq read2.fq \
|
||||
| bwa-hs38-bundle/k8-linux bwa-postalt.js hs38a.fa.alt \
|
||||
| samtools view -bS - > aln.unsrt.bam
|
||||
```
|
||||
This will generate the following files:
|
||||
|
||||
In the final alignment, a read may be placed on the [primary assembly][grcdef]
|
||||
and multiple overlapping ALT contigs at the same time (on multiple SAM lines).
|
||||
Mapping quality (mapQ) is properly adjusted by the postprocessing script
|
||||
`bwa-postalt.js` using the ALT-to-reference alignment `hs38a.fa.alt`. For
|
||||
details, see the [Methods section](#methods).
|
||||
* `out.aln.bam`: unsorted alignments with ALT-aware mapping quality. In this
|
||||
file, one read may be placed on multiple overlapping ALT contigs at the same
|
||||
time even if the read is mapped better to some contigs than others. This makes
|
||||
it possible to analyze each contig independent of others.
|
||||
|
||||
#### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes
|
||||
* `out.hla.top`: best genotypes for HLA-A, -B, -C, -DQA1, -DQB1 and -DRB1 genes.
|
||||
|
||||
Construct the index:
|
||||
```sh
|
||||
cat hs38a.fa bwa-hs38-bundle/hs38d4-extra.fa > hs38d4.fa
|
||||
bwa index hs38d4.fa
|
||||
cp bwa-hs38-bundle/hs38d4.fa.alt .
|
||||
```
|
||||
Perform mapping:
|
||||
```sh
|
||||
bwa mem hs38d4.fa read1.fq read2.fq \
|
||||
| bwa-hs38-bundle/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
|
||||
| samtools view -bS - > aln.unsrt.bam
|
||||
```
|
||||
The benefit of this option is to have a more complete reference sequence and
|
||||
to facilitate HLA typing with a 3rd-party tool (see below).
|
||||
* `out.hla.all`: other possible genotypes on the six HLA genes.
|
||||
|
||||
***If you are not interested in the way BWA-MEM performs ALT mapping, you can
|
||||
skip the rest of this documentation.***
|
||||
* `out.log.*`: bwa-mem, samblaster and HLA typing log files.
|
||||
|
||||
Note that `run-bwamem` only prints command lines but doesn't execute them. It
|
||||
is advised to have a look at the command lines before passing them to `sh` for
|
||||
actual execution.
|
||||
|
||||
## Background
|
||||
|
||||
GRCh38 consists of several components: chromosomal assembly, unlocalized contigs
|
||||
(chromosome known but location unknown), unplaced contigs (chromosome unknown)
|
||||
and ALT contigs (long clustered variations). The combination of the first three
|
||||
components is called the *primary assembly*. You can find the more exact
|
||||
definitions from the [GRC website][grcdef].
|
||||
components is called the *primary assembly*. It is recommended to use the
|
||||
complete primary assembly for all analyses. Using ALT contigs in read mapping is
|
||||
tricky.
|
||||
|
||||
GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions.
|
||||
However, sequences that are highly diverged from the primary assembly only
|
||||
contribute a few million bp. Most subsequences of ALT contigs are highly similar
|
||||
or identical to the primary assembly. If we align sequence reads to GRCh38+ALT
|
||||
treating ALT equal to the primary assembly, we will get many reads with zero
|
||||
mapping quality and lose variants on them. It is crucial to make the mapper
|
||||
aware of ALTs.
|
||||
GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp of the primary
|
||||
assembly. However, sequences that are highly diverged from the primary assembly
|
||||
only contribute a few million bp. Most subsequences of ALT contigs are nearly
|
||||
identical to the primary assembly. If we align sequence reads to GRCh38+ALT
|
||||
blindly, we will get many additional reads with zero mapping quality and miss
|
||||
variants on them. It is crucial to make mappers aware of ALTs.
|
||||
|
||||
BWA-MEM is designed to minimize the interference of ALT contigs such that on the
|
||||
primary assembly, the ALT-aware alignment is highly similar to the alignment
|
||||
without using ALT contigs in the index. This design choice makes it almost
|
||||
always safe to map reads to GRCh38+ALT. Although we don't know yet how much
|
||||
variations on ALT contigs contribute to phenotypes, we would not get the answer
|
||||
without mapping large cohorts to these extra sequences. We hope our current
|
||||
implementation encourages researchers to use ALT contigs soon and often.
|
||||
BWA-MEM is ALT-aware. It essentially computes mapping quality across the
|
||||
non-redundant content of the primary assembly plus the ALT contigs and is free
|
||||
of the problem above.
|
||||
|
||||
## Methods
|
||||
|
||||
### Sequence alignment
|
||||
|
||||
As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
|
||||
postprocessing.
|
||||
postprocessing. The `bwa.kit/run-bwamem` script performs the two steps when ALT
|
||||
contigs are present. The following picture shows an example about how BWA-MEM
|
||||
infers mapping quality and reports alignment after step 2:
|
||||
|
||||

|
||||
|
||||
#### Step 1: BWA-MEM mapping
|
||||
|
||||
|
|
@ -88,19 +66,19 @@ the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
|
|||
depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
|
||||
alignments and assigns mapQ following these two rules:
|
||||
|
||||
1. The original mapQ of a non-ALT hit is computed across non-ALT hits only.
|
||||
The reported mapQ of an ALT hit is computed across all hits.
|
||||
1. The mapQ of a non-ALT hit is computed across non-ALT hits only. The mapQ of
|
||||
an ALT hit is computed across all hits.
|
||||
|
||||
2. If there are no non-ALT hits, the best ALT hit is outputted as the primary
|
||||
alignment. If there are both ALT and non-ALT hits, non-ALT hits will be
|
||||
primary. ALT hits are reported as supplementary alignments (flag 0x800) only
|
||||
if they are better than all overlapping non-ALT hits.
|
||||
primary and ALT hits be supplementary (SAM flag 0x800) if ALT hits are better
|
||||
than the best overlapping non-ALT hits.
|
||||
|
||||
In theory, non-ALT alignments from step 1 should be identical to alignments
|
||||
against a reference genome with ALT contigs. In practice, the two types of
|
||||
against the reference genome with ALT contigs. In practice, the two types of
|
||||
alignments may differ in rare cases due to seeding heuristics. When an ALT hit
|
||||
is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
|
||||
non-ALT hits. This happens more often for contig mapping.
|
||||
non-ALT hits.
|
||||
|
||||
If we don't care about ALT hits, we may skip postprocessing (step 2).
|
||||
Nonetheless, postprocessing is recommended as it improves mapQ and gives more
|
||||
|
|
@ -110,18 +88,12 @@ information about ALT hits.
|
|||
|
||||
Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
|
||||
potential hits reported in the XA tag, lifts ALT hits to the chromosomal
|
||||
positions using the ALT-to-ref alignment, groups them after lifting and then
|
||||
reassigns mapQ based on the best scoring hit in each group with all the hits in
|
||||
a group get the same mapQ. Being aware of the ALT-to-ref alignment, this script
|
||||
can greatly improve mapQ of ALT hits and occasionally improve mapQ of non-ALT
|
||||
hits.
|
||||
|
||||
The script also measures the presence of each ALT contig. For a group of
|
||||
overlapping ALT contigs c_1, ..., c_m, the weight for c_k equals `\frac{\sum_j
|
||||
P(c_k|r_j)}{\sum_j\max_i P(c_i|r_j)}`, where `P(c_k|r)=\frac{pow(4,s_k)}{\sum_i
|
||||
pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a
|
||||
Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the
|
||||
option 2 above.
|
||||
positions using the ALT-to-ref alignment, groups them based on overlaps between
|
||||
their lifted positions, and then re-estimates mapQ across the best scoring hit
|
||||
in each group. Being aware of the ALT-to-ref alignment, this script can greatly
|
||||
improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. It also
|
||||
writes each hit overlapping the reported hit into a separate SAM line. This
|
||||
enables variant calling on each ALT contig independent of others.
|
||||
|
||||
### On the completeness of GRCh38+ALT
|
||||
|
||||
|
|
@ -129,47 +101,65 @@ While GRCh38 is much more complete than GRCh37, it is still missing some true
|
|||
human sequences. To make sure every piece of sequence in the reference assembly
|
||||
is correct, the [Genome Reference Consortium][grc] (GRC) require each ALT contig
|
||||
to have enough support from multiple sources before considering to add it to the
|
||||
reference assembly. This careful procedure has left out some sequences, one of
|
||||
which is [this example][novel], a 10kb contig assembled from CHM1 short
|
||||
reads and present also in NA12878. You can try [BLAT][blat] or [BLAST][blast] to
|
||||
see where it maps.
|
||||
reference assembly. This careful and sophisticated procedure has left out some
|
||||
sequences, one of which is [this example][novel], a 10kb contig assembled from
|
||||
CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or
|
||||
[BLAST][blast] to see where it maps.
|
||||
|
||||
For a more complete reference genome, we compiled a new set of decoy sequences
|
||||
from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
|
||||
The sequences are included in `hs38d4-extra.fa` from the [BWA resource bundle
|
||||
for GRCh38][res].
|
||||
The sequences are included in `hs38d6-extra.fa` from the [BWA binary
|
||||
package][res].
|
||||
|
||||
In addition to decoy, we also put multiple alleles of HLA genes in
|
||||
`hs38d4-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
|
||||
version 3.18.0. Script `bwa-postalt.js` also helps to genotype HLA genes, though
|
||||
not to high resolution for now.
|
||||
`hs38d6-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
|
||||
version 3.18.0 and are used to collect reads sequenced from these genes.
|
||||
|
||||
### More on HLA typing
|
||||
### HLA typing
|
||||
|
||||
It is [well known][hlalink] that HLA genes are associated with many autoimmune
|
||||
diseases as well as some others not directly related to the immune system.
|
||||
However, many HLA alleles are highly diverged from the reference genome. If we
|
||||
map whole-genome shotgun (WGS) reads to the reference only, many
|
||||
allele-informative will get lost. As a result, the vast majority of WGS projects
|
||||
have ignored these important genes.
|
||||
HLA genes are known to be associated with many autoimmune diseases, infectious
|
||||
diseases and drug responses. They are among the most important genes but are
|
||||
rarely studied by WGS projects due to the high sequence divergence between
|
||||
HLA genes and the reference genome in these regions.
|
||||
|
||||
We recommend to include the genomic regions of classical HLA genes in the BWA
|
||||
index. This way we will be able to get a more complete collection of reads
|
||||
mapped to HLA. We can then isolate these reads with little computational cost
|
||||
and type HLA genes with another program, such as [Warren et al (2012)][hla4],
|
||||
[Liu et al (2013)][hla2], [Bai et al (2014)][hla3], [Dilthey et al (2014)][hla1]
|
||||
or others from [this list][hlatools].
|
||||
By including the HLA gene regions in the reference assembly as ALT contigs, we
|
||||
are able to effectively identify reads coming from these genes. We also provide
|
||||
a pipeline, which is included in the [BWA binary package][res], to type the
|
||||
several classic HLA genes. The pipeline is conceptually simple. It de novo
|
||||
assembles sequence reads mapped to each gene, aligns exon sequences of each
|
||||
allele to the assembled contigs and then finds the pairs of alleles that best
|
||||
explain the contigs. In practice, however, the completeness of IMGT/HLA and
|
||||
copy-number changes related to these genes are not so straightforward to
|
||||
resolve. HLA typing may not always be successful. Users may also consider to use
|
||||
other programs for typing such as [Warren et al (2012)][hla4], [Liu et al
|
||||
(2013)][hla2], [Bai et al (2014)][hla3] and [Dilthey et al (2014)][hla1], though
|
||||
most of them are distributed under restrictive licenses.
|
||||
|
||||
If the postprocessing script `bwa-postalt.js` is invoked with `-p prefix`, it
|
||||
will also write the top three alleles to file `prefix.hla`. However, as most HLA
|
||||
alleles from IMGT/HLA don't have intronic sequences and thus are not included in
|
||||
the BWA index from option 2, we are unable to type HLA genes to high resolution
|
||||
with the BWA-MEM mapping alone. A dedicated tool is recommended for accurate
|
||||
typing.
|
||||
## Preliminary Evaluation
|
||||
|
||||
### Evaluating ALT Mapping
|
||||
To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878
|
||||
unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy
|
||||
(hs38d6), and called small variants from the alignment. CHM1 is haploid.
|
||||
Ideally, heterozygous calls are false positives (FP). NA12878 is diploid. The
|
||||
true positive (TP) heterozygous calls from NA12878 are approximately equal
|
||||
to the difference between NA12878 and CHM1 heterozygous calls. A better assembly
|
||||
should yield higher TP and lower FP. The following table shows the numbers for
|
||||
these assemblies:
|
||||
|
||||
(Coming soon...)
|
||||
|Assembly|hs37 |hs38 |hs38d6|CHM1_1.1| huref|
|
||||
|:------:|------:|------:|------:|------:|------:|
|
||||
|FP | 255706| 168068| 142516|307172 | 575634|
|
||||
|TP |2142260|2163113|2150844|2167235|2137053|
|
||||
|
||||
With this measurement, hs38 is clearly better than hs37. Genome hs38d6 reduces
|
||||
FP by ~25k but also reduces TP by ~12k. We manually inspected variants called
|
||||
from hs38 only and found the majority of them are associated with excessive read
|
||||
depth, clustered variants or weak alignment. We believe most hs38-only calls are
|
||||
problematic. In addition, if we compare two NA12878 replicates from HiSeq X10
|
||||
with nearly identical library construction, the difference is ~140k, an order
|
||||
of magnitude higher than the difference between hs38 and hs38d6. ALT contigs,
|
||||
decoy and HLA genes in hs38d6 improve variant calling and enable the analyses of
|
||||
ALT contigs and HLA typing at little cost.
|
||||
|
||||
## Problems and Future Development
|
||||
|
||||
|
|
|
|||
373
bwa-helper.js
373
bwa-helper.js
|
|
@ -1,373 +0,0 @@
|
|||
/*****************************************************************
|
||||
* The K8 Javascript interpreter is required to run this script. *
|
||||
* *
|
||||
* Source code: https://github.com/attractivechaos/k8 *
|
||||
* Binary: http://sourceforge.net/projects/lh3/files/k8/ *
|
||||
* *
|
||||
* Data file used for generating GRCh38 ALT alignments: *
|
||||
* *
|
||||
* http://sourceforge.net/projects/bio-bwa/files/ *
|
||||
*****************************************************************/
|
||||
|
||||
/******************
|
||||
*** From k8.js ***
|
||||
******************/
|
||||
|
||||
var getopt = function(args, ostr) {
|
||||
var oli; // option letter list index
|
||||
if (typeof(getopt.place) == 'undefined')
|
||||
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
|
||||
if (getopt.place == -1) { // update scanning pointer
|
||||
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
|
||||
++getopt.ind;
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
|
||||
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
|
||||
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
|
||||
if (getopt.place < 0) ++getopt.ind;
|
||||
return '?';
|
||||
}
|
||||
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
|
||||
getopt.arg = null;
|
||||
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
|
||||
} else { // need an argument
|
||||
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
|
||||
getopt.arg = args[getopt.ind].substr(getopt.place);
|
||||
else if (args.length <= ++getopt.ind) { // no arg
|
||||
getopt.place = -1;
|
||||
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
|
||||
return '?';
|
||||
} else getopt.arg = args[getopt.ind]; // white space
|
||||
getopt.place = -1;
|
||||
++getopt.ind;
|
||||
}
|
||||
return optopt;
|
||||
}
|
||||
|
||||
/************************
|
||||
*** command markovlp ***
|
||||
************************/
|
||||
|
||||
function bwa_markOvlp(args)
|
||||
{
|
||||
var c, min_aln_ratio = .9, min_ext = 50;
|
||||
while ((c = getopt(args, "r:e:")) != null) {
|
||||
if (c == 'r') min_aln_ratio = parseFloat(getopt.arg);
|
||||
else if (c == 'e') min_ext = parseInt(getopt.arg);
|
||||
}
|
||||
|
||||
var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]);
|
||||
var buf = new Bytes();
|
||||
var dir4 = ['>>', '><', '<>', '<<'];
|
||||
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split("\t")
|
||||
for (var i = 0; i < t.length; ++i)
|
||||
if (i != 0 && i != 4)
|
||||
t[i] = parseInt(t[i]);
|
||||
var el, a1, a2, e1, e2, r1, r2; // a: aligned length; e: extended length; r: remaining length
|
||||
e2 = a2 = t[7] - t[6];
|
||||
if (t[2] < t[3]) { // forward-forward match
|
||||
e1 = a1 = t[3] - t[2];
|
||||
r1 = t[2] - t[6];
|
||||
r2 = (t[5] - t[7]) - (t[1] - t[3]);
|
||||
el = r1 > 0? t[6] : t[2];
|
||||
el += r2 > 0? t[1] - t[3] : t[5] - t[7];
|
||||
} else { // reverse-forward match
|
||||
e1 = a1 = t[2] - t[3];
|
||||
r1 = (t[1] - t[2]) - t[6];
|
||||
r2 = (t[5] - t[7]) - t[3];
|
||||
el = r1 > 0? t[6] : t[1] - t[2];
|
||||
el += r2 > 0? t[3] : t[5] - t[7];
|
||||
}
|
||||
e1 += el; e2 += el;
|
||||
var type;
|
||||
if (a1 / e1 >= min_aln_ratio && a2 / e2 >= min_aln_ratio) {
|
||||
if ((r1 >= min_ext && r2 >= min_ext) || (r1 <= -min_ext && r2 <= -min_ext)) { // suffix-prefix match
|
||||
var d = t[2] < t[3]? 0 : 2;
|
||||
if (r1 < 0) d ^= 3; // reverse the direction
|
||||
type = 'O' + dir4[d];
|
||||
} else type = 'C' + (e1 / t[1] > e2 / t[5]? 1 : 2);
|
||||
} else type = 'I'; // internal local match; not a suffix-prefix match
|
||||
//print(t[1], e1, a1, t[5], e2, a2);
|
||||
print(type, buf);
|
||||
}
|
||||
|
||||
buf.destroy();
|
||||
file.close();
|
||||
}
|
||||
|
||||
/***********************
|
||||
*** command pas2bed ***
|
||||
***********************/
|
||||
|
||||
function bwa_pas2reg(args)
|
||||
{
|
||||
var file = args.length? new File(args[0]) : new File();
|
||||
var buf = new Bytes();
|
||||
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split("\t");
|
||||
if (t[0] == t[4]) continue;
|
||||
if (parseInt(t[2]) < parseInt(t[3])) print(t[0], t[1], t[2], t[3], t[8]);
|
||||
else print(t[0], t[1], t[3], t[2], t[8]);
|
||||
print(t[4], t[5], t[6], t[7], t[8]);
|
||||
}
|
||||
|
||||
buf.destroy();
|
||||
file.close();
|
||||
}
|
||||
|
||||
/*******************
|
||||
* command sam2pas *
|
||||
*******************/
|
||||
|
||||
function bwa_sam2pas(args)
|
||||
{
|
||||
var file = args.length == 0? new File() : new File(args[0]);
|
||||
var buf = new Bytes();
|
||||
var seq_dict = {};
|
||||
|
||||
while (file.readline(buf) >= 0) {
|
||||
var line = buf.toString();
|
||||
var m;
|
||||
if (/^@SQ/.test(line)) {
|
||||
var name = null, len = null;
|
||||
if ((m = /\tSN:(\S+)/.exec(line)) != null) name = m[1];
|
||||
if ((m = /\tLN:(\S+)/.exec(line)) != null) len = parseInt(m[1]);
|
||||
if (name != null && len != null) seq_dict[name] = len;
|
||||
}
|
||||
if (/^@/.test(line)) continue;
|
||||
var t = line.split("\t");
|
||||
var pos = parseInt(t[3]) - 1;
|
||||
var x = 0, y = 0, i = 0, clip = [0, 0], n_ins = 0, n_del = 0, o_ins = 0, o_del = 0, n_M = 0;
|
||||
var re = /(\d+)([MIDSH])/g;
|
||||
while ((m = re.exec(t[5])) != null) {
|
||||
var l = parseInt(m[1]);
|
||||
if (m[2] == 'M') x += l, y += l, n_M += l;
|
||||
else if (m[2] == 'I') y += l, n_ins += l, ++o_ins;
|
||||
else if (m[2] == 'D') x += l, n_del += l, ++o_del;
|
||||
else if (m[2] == 'S' || m[2] == 'H')
|
||||
clip[i == 0? 0 : 1] = l;
|
||||
++i;
|
||||
}
|
||||
var is_rev = (parseInt(t[1]) & 16)? true : false;
|
||||
var misc = 'mapQ=' + t[4] + ';';
|
||||
var usc = 1;
|
||||
if ((m = /\tNM:i:(\d+)/.exec(line)) != null) {
|
||||
var NM = parseInt(m[1]);
|
||||
var diff = (NM / (n_M + n_ins + n_del)).toFixed(3);
|
||||
misc += 'diff=' + diff + ';n_mis=' + (NM - n_del - n_ins) + ';';
|
||||
}
|
||||
misc += 'n_del='+n_del+';n_ins='+n_ins+';o_del='+o_del+';o_ins='+o_ins + ';';
|
||||
if ((m = /\tAS:i:(\d+)/.exec(line)) != null) {
|
||||
misc += 'AS='+m[1] + ';';
|
||||
usc = (parseInt(m[1]) / (x > y? x : y)).toFixed(3);
|
||||
}
|
||||
if ((m = /\tXS:i:(\d+)/.exec(line)) != null) misc += 'XS='+m[1] + ';';
|
||||
var len = y + clip[0] + clip[1];
|
||||
var z = [t[0], len, clip[0], clip[0] + y, t[2], seq_dict[t[2]], pos, pos + x, usc, misc];
|
||||
if (parseInt(t[1]) & 16) z[2] = clip[1] + y, z[3] = clip[1];
|
||||
print(z.join("\t"));
|
||||
}
|
||||
|
||||
buf.destroy();
|
||||
file.close();
|
||||
}
|
||||
|
||||
/***********************
|
||||
*** command reg2cut ***
|
||||
***********************/
|
||||
|
||||
function bwa_reg2cut(args)
|
||||
{
|
||||
var c, min_usc = 0.5, min_ext = 100, min_len = 5000, cut = 250;
|
||||
while ((c = getopt(args, "s:e:l:c:")) != null) {
|
||||
if (c == 's') min_usc = parseFloat(getopt.arg);
|
||||
else if (c == 'e') min_ext = parseInt(getopt.arg);
|
||||
else if (c == 'l') min_len = parseInt(getopt.arg);
|
||||
else if (c == 'c') cut = parseInt(getopt.arg);
|
||||
}
|
||||
|
||||
var file = args.length == getopt.ind? new File () : new File(args[getopt.ind]);
|
||||
var buf = new Bytes();
|
||||
|
||||
function print_bed() {
|
||||
for (var i = 0; i < a.length; ++i) {
|
||||
var start = a[i][0] - cut > 0? a[i][0] : 0;
|
||||
var end = a[i][1] + cut < last_len? a[i][1] : last_len;
|
||||
if (end - start >= min_len) print(last_chr, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
var last_chr = null, last_len = null, max_c_usc = 0, start = 0, end = 0;
|
||||
var a = [];
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split("\t");
|
||||
t[1] = parseInt(t[1]);
|
||||
t[2] = parseInt(t[2]);
|
||||
t[3] = parseInt(t[3]);
|
||||
t[4] = parseFloat(t[4]);
|
||||
var is_contained = t[2] < min_ext && t[1] - t[3] < min_ext? true : false;
|
||||
if (t[3] - t[2] < cut<<1) continue;
|
||||
t[2] += cut; t[3] -= cut;
|
||||
if (t[0] != last_chr) {
|
||||
a.push([start, end]);
|
||||
if (last_chr != null && max_c_usc < min_usc) print_bed();
|
||||
last_chr = t[0]; last_len = t[1]; start = t[2]; end = t[3];
|
||||
max_c_usc = is_contained? t[4] : 0;
|
||||
a.length = 0;
|
||||
} else {
|
||||
if (is_contained)
|
||||
max_c_usc = max_c_usc > t[4]? max_c_usc : t[4];
|
||||
if (t[4] < min_usc) continue;
|
||||
if (t[2] > end) {
|
||||
a.push([start, end]);
|
||||
start = t[2];
|
||||
end = end > t[3]? end : t[3];
|
||||
} else end = end > t[3]? end : t[3];
|
||||
}
|
||||
}
|
||||
a.push([start, end]);
|
||||
if (max_c_usc < min_usc) print_bed(); // the last sequence
|
||||
|
||||
buf.destroy();
|
||||
file.close();
|
||||
}
|
||||
|
||||
function bwa_shortname(args)
|
||||
{
|
||||
var file = args.length? new File(args[0]) : new File();
|
||||
var buf = new Bytes();
|
||||
|
||||
var re = /(\S+)\/(\d+)_(\d+)((:\d+-\d+)+)/g;
|
||||
var re2 = /:(\d+)-(\d+)/g;
|
||||
while (file.readline(buf) >= 0) {
|
||||
var match, match2;
|
||||
var line = buf.toString();
|
||||
var x = [];
|
||||
while ((match = re.exec(line)) != null) {
|
||||
var start = parseInt(match[2]), len = parseInt(match[3]) - start;
|
||||
while ((match2 = re2.exec(match[4])) != null) {
|
||||
var a = parseInt(match2[1]) - 1;
|
||||
var b = parseInt(match2[2]);
|
||||
start += a; len = b - a;
|
||||
}
|
||||
x.push([match[0], match[1] + '/' + start.toString() + '_' + (start+len).toString()]);
|
||||
}
|
||||
for (var i = 0; i < x.length; ++i)
|
||||
line = line.replace(x[i][0], x[i][1]);
|
||||
print(line);
|
||||
}
|
||||
|
||||
buf.destroy();
|
||||
file.close();
|
||||
}
|
||||
|
||||
/*******************
|
||||
* Command gff2sam *
|
||||
*******************/
|
||||
|
||||
function bwa_gff2sam(args)
|
||||
{
|
||||
if (args.length < 2) {
|
||||
print("Usage: k8 bwa-helper.js <aln.gff> <query-length.txt>");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
var file = new File(args[1]);
|
||||
var buf = new Bytes();
|
||||
var len = {};
|
||||
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split(/\s+/);
|
||||
len[t[0]] = parseInt(t[1]);
|
||||
}
|
||||
file.close();
|
||||
|
||||
file = new File(args[0]);
|
||||
var re_cigar = /([MID])(\d+)/g;
|
||||
var lineno = 0;
|
||||
while (file.readline(buf) >= 0) {
|
||||
++lineno;
|
||||
var t = buf.toString().split("\t");
|
||||
var m = /Target=(\S+)\s+(\d+)\s+(\d+)\s+([+-])/.exec(t[8]);
|
||||
if (m == null) {
|
||||
warn("WARNING: skipped line "+lineno+" due to the lack of Target.");
|
||||
continue;
|
||||
}
|
||||
var qname = m[1];
|
||||
var flag = t[6] == m[4]? 0 : 16;
|
||||
var qb = parseInt(m[2]) - 1, qe = parseInt(m[3]), qlen = len[qname];
|
||||
if (qlen == null)
|
||||
throw Error("Sequence "+qname+" is not present in the query-length.txt");
|
||||
var clip5 = qb, clip3 = qlen - qe;
|
||||
if (flag&16) clip5 ^= clip3, clip3 ^= clip5, clip5 ^= clip3; // swap
|
||||
|
||||
m = /Gap\s*=\s*(([MID]\d+\s*)+)/.exec(t[8]);
|
||||
var cigar = clip5? clip5 + 'S' : '';
|
||||
var n_ins = 0, n_del = 0, n_match = 0, NM = null;
|
||||
if (m) {
|
||||
var mc;
|
||||
while ((mc = re_cigar.exec(m[1])) != null) {
|
||||
var l = parseInt(mc[2]);
|
||||
cigar += mc[2] + mc[1];
|
||||
if (mc[1] == 'I') n_ins += l;
|
||||
else if (mc[1] == 'D') n_del += l;
|
||||
else if (mc[1] == 'M') n_match += l;
|
||||
}
|
||||
if (n_ins + n_match != qe - qb || n_del + n_match != parseInt(t[4]) - parseInt(t[3]) + 1)
|
||||
throw Error("Inconsistent CIGAR at line "+lineno);
|
||||
} else { // ungapped alignment
|
||||
var tb = parseInt(t[3]) - 1, te = parseInt(t[4]);
|
||||
if (te - tb != qe - qb) {
|
||||
warn("WARNING: line "+lineno+" should contain gaps, but lacks Gap. Skipped.\n");
|
||||
} else cigar = (qe - qb) + 'M';
|
||||
}
|
||||
if (clip3) cigar += clip3 + 'S';
|
||||
if ((m = /num_mismatch=(\d+)/.exec(t[8])) != null)
|
||||
NM = parseInt(m[1]) + n_ins + n_del;
|
||||
var out = [qname, flag, t[0], t[3], 255, cigar, '*', 0, 0, '*', '*'];
|
||||
if (NM != null) out.push('NM:i:' + NM);
|
||||
print(out.join("\t"));
|
||||
}
|
||||
buf.destroy();
|
||||
file.close();
|
||||
}
|
||||
|
||||
/*********************
|
||||
*** Main function ***
|
||||
*********************/
|
||||
|
||||
function main(args)
|
||||
{
|
||||
if (args.length == 0) {
|
||||
print("\nUsage: k8 bwa-helper.js <command> [arguments]\n");
|
||||
print("Commands: sam2pas convert SAM to pairwise alignment summary format (PAS)");
|
||||
print(" pas2reg extract covered regions");
|
||||
print(" reg2cut regions to extract for the 2nd round bwa-mem");
|
||||
print(" markovlp identify bi-directional overlaps");
|
||||
print(" gff2sam convert GFF3 alignment to SAM");
|
||||
print(" shortname shorten sequence name after subseq (PacBio read names only)");
|
||||
print("");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
var cmd = args.shift();
|
||||
if (cmd == 'sam2pas') bwa_sam2pas(args);
|
||||
else if (cmd == 'gff2sam') bwa_gff2sam(args);
|
||||
else if (cmd == 'markovlp') bwa_markOvlp(args);
|
||||
else if (cmd == 'pas2reg') bwa_pas2reg(args);
|
||||
else if (cmd == 'reg2cut') bwa_reg2cut(args);
|
||||
else if (cmd == 'shortname') bwa_shortname(args);
|
||||
else warn("Unrecognized command");
|
||||
}
|
||||
|
||||
main(arguments);
|
||||
215
bwa-postalt.js
215
bwa-postalt.js
|
|
@ -52,31 +52,6 @@ var getopt = function(args, ostr) {
|
|||
return optopt;
|
||||
}
|
||||
|
||||
// print an object in a format similar to JSON. For debugging only.
|
||||
function obj2str(o)
|
||||
{
|
||||
if (typeof(o) != 'object') {
|
||||
return o.toString();
|
||||
} else if (o == null) {
|
||||
return "null";
|
||||
} else if (Array.isArray(o)) {
|
||||
var s = "[";
|
||||
for (var i = 0; i < o.length; ++i) {
|
||||
if (i) s += ',';
|
||||
s += obj2str(o[i]);
|
||||
}
|
||||
return s + "]";
|
||||
} else {
|
||||
var i = 0, s = "{";
|
||||
for (var key in o) {
|
||||
if (i++) s += ',';
|
||||
s += key + ":";
|
||||
s += obj2str(o[key]);
|
||||
}
|
||||
return s + "}";
|
||||
}
|
||||
}
|
||||
|
||||
// reverse a string
|
||||
Bytes.prototype.reverse = function()
|
||||
{
|
||||
|
|
@ -203,7 +178,7 @@ function parse_hit(s, opt)
|
|||
return h;
|
||||
}
|
||||
|
||||
function print_buffer(buf2, fp_hla, hla)
|
||||
function print_buffer(buf2, fp_hla, hla) // output alignments
|
||||
{
|
||||
if (buf2.length == 0) return;
|
||||
for (var i = 0; i < buf2.length; ++i)
|
||||
|
|
@ -217,34 +192,34 @@ function print_buffer(buf2, fp_hla, hla)
|
|||
}
|
||||
}
|
||||
|
||||
function collect_hla_hits(idx, ctg, start, end, hla) // collect reads hit to HLA genes
|
||||
{
|
||||
var m, ofunc = idx[ctg];
|
||||
if (ofunc == null) return;
|
||||
var ovlp_alt = ofunc(start, end);
|
||||
for (var i = 0; i < ovlp_alt.length; ++i)
|
||||
if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null)
|
||||
hla[m[1]] = true;
|
||||
}
|
||||
|
||||
function bwa_postalt(args)
|
||||
{
|
||||
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, update_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false, min_pa_ratio:1 };
|
||||
var version = "r985";
|
||||
var c, opt = { a:1, b:4, o:6, e:1, min_mapq:10, min_sc:90, max_nm_sc:10, min_pa_ratio:1 };
|
||||
|
||||
while ((c = getopt(args, 'Pqev:p:r:')) != null) {
|
||||
if (c == 'v') opt.verbose = parseInt(getopt.arg);
|
||||
else if (c == 'p') opt.pre = getopt.arg;
|
||||
else if (c == 'P') opt.show_pri = true;
|
||||
else if (c == 'q') opt.recover_maq = false;
|
||||
else if (c == 'e') opt.show_ev = true;
|
||||
while ((c = getopt(args, 'vp:r:')) != null) {
|
||||
if (c == 'p') opt.pre = getopt.arg;
|
||||
else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg);
|
||||
else if (c == 'v') { print(version); exit(0); }
|
||||
}
|
||||
if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.;
|
||||
|
||||
if (opt.show_ev && opt.pre == null) {
|
||||
warn("ERROR: option '-p' must be specified if '-e' is applied.");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (args.length == getopt.ind) {
|
||||
print("");
|
||||
print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
|
||||
print("Options: -p STR prefix of file(s) for additional information [null]");
|
||||
print(" PREFIX.ctw - weight of each ALT contig");
|
||||
print(" PREFIX.evi - reads supporting ALT contigs (effective with -e)");
|
||||
print(" -e show reads supporting ALT contigs into file PREFIX.evi");
|
||||
print("Options: -p STR prefix of output files containting sequences matching HLA genes [null]");
|
||||
print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa<FLOAT ["+opt.min_pa_ratio+"]");
|
||||
print(" -q don't modify mapQ for non-ALTs hit overlapping lifted ALT");
|
||||
print(" -v show version number");
|
||||
print("");
|
||||
print("Note: This script extracts the XA tag, lifts the mapping positions of ALT hits to");
|
||||
print(" the primary assembly, groups them and then estimates mapQ across groups. If");
|
||||
|
|
@ -256,28 +231,26 @@ function bwa_postalt(args)
|
|||
exit(1);
|
||||
}
|
||||
|
||||
var fp_evi = opt.show_ev && opt.pre? new File(opt.pre + '.evi', "w") : null;
|
||||
var aux = new Bytes(); // used for reverse and reverse complement
|
||||
var buf = new Bytes();
|
||||
var buf = new Bytes(); // line reading buffer
|
||||
|
||||
// read ALT-to-REF alignment
|
||||
var intv_alt = {}, intv_pri = {}, idx_un = {}, hla_ctg = {};
|
||||
var intv_alt = {}, intv_pri = {}, hla_ctg = {}, is_alt = {}, hla_chr = null;
|
||||
var file = new File(args[getopt.ind]);
|
||||
while (file.readline(buf) >= 0) {
|
||||
var line = buf.toString();
|
||||
if (line.charAt(0) == '@') continue;
|
||||
var t = line.split("\t");
|
||||
if (t.length < 11) continue; // incomplete lines
|
||||
is_alt[t[0]] = true;
|
||||
var pos = parseInt(t[3]) - 1;
|
||||
var flag = parseInt(t[1]);
|
||||
if ((flag&4) || t[2] == '*') {
|
||||
idx_un[t[0]] = true;
|
||||
continue;
|
||||
}
|
||||
if ((flag&4) || t[2] == '*') continue;
|
||||
var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
|
||||
if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs
|
||||
if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0;
|
||||
++hla_ctg[m[1]];
|
||||
hla_chr = t[2];
|
||||
}
|
||||
while ((m = re_cigar.exec(t[5])) != null) {
|
||||
var l = parseInt(m[1]);
|
||||
|
|
@ -296,10 +269,8 @@ function bwa_postalt(args)
|
|||
}
|
||||
file.close();
|
||||
var idx_alt = {}, idx_pri = {};
|
||||
for (var ctg in intv_alt)
|
||||
idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
|
||||
for (var ctg in intv_pri)
|
||||
idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
|
||||
for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
|
||||
for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
|
||||
|
||||
// initialize the list of HLA contigs
|
||||
var fp_hla = null;
|
||||
|
|
@ -309,18 +280,12 @@ function bwa_postalt(args)
|
|||
fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w");
|
||||
}
|
||||
|
||||
// initialize the list of ALT contigs
|
||||
var weight_alt = [];
|
||||
for (var ctg in idx_alt)
|
||||
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, intv_alt[ctg][0][3], intv_alt[ctg][0][5], intv_alt[ctg][0][7]];
|
||||
for (var ctg in idx_un)
|
||||
weight_alt[ctg] = [0, 0, 0, 0, 0, 0, '~', 0, 0];
|
||||
|
||||
// process SAM
|
||||
var buf2 = [], hla = {};
|
||||
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, line = buf.toString();
|
||||
|
||||
if (line.charAt(0) == '@') { // print and then skip the header line
|
||||
print(line);
|
||||
continue;
|
||||
|
|
@ -345,6 +310,8 @@ function bwa_postalt(args)
|
|||
var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
|
||||
var flag = t[1];
|
||||
var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
|
||||
if (t[2] == hla_chr) collect_hla_hits(idx_pri, h.ctg, h.start, h.end, hla);
|
||||
|
||||
if (h.hard) { // the following does not work with hard clipped alignments
|
||||
buf2.push(t);
|
||||
continue;
|
||||
|
|
@ -362,7 +329,7 @@ function bwa_postalt(args)
|
|||
// check if there are ALT hits
|
||||
var has_alt = false;
|
||||
for (var i = 0; i < hits.length; ++i)
|
||||
if (weight_alt[hits[i].ctg] != null) {
|
||||
if (is_alt[hits[i].ctg] != null) {
|
||||
has_alt = true;
|
||||
break;
|
||||
}
|
||||
|
|
@ -429,7 +396,7 @@ function bwa_postalt(args)
|
|||
if (hits[i].g == reported_g)
|
||||
++n_group0;
|
||||
} else {
|
||||
if (weight_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
|
||||
if (is_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
|
||||
buf2.push(t);
|
||||
continue;
|
||||
}
|
||||
|
|
@ -455,8 +422,8 @@ function bwa_postalt(args)
|
|||
else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
|
||||
} else mapQ = t[4];
|
||||
|
||||
var pri_ofunc = idx_pri[hits[reported_i].pctg], ovlp_alt = [];
|
||||
if (pri_ofunc != null) {
|
||||
// find out whether the read is overlapping HLA genes
|
||||
if (hits[reported_i].pctg == hla_chr) {
|
||||
var rpt_start = 1<<30, rpt_end = 0;
|
||||
for (var i = 0; i < hits.length; ++i) {
|
||||
var h = hits[i];
|
||||
|
|
@ -465,70 +432,26 @@ function bwa_postalt(args)
|
|||
rpt_end = rpt_end > h.pend ? rpt_end : h.pend;
|
||||
}
|
||||
}
|
||||
ovlp_alt = pri_ofunc(rpt_start, rpt_end);
|
||||
for (var i = 0; i < ovlp_alt.length; ++i)
|
||||
if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null)
|
||||
hla[m[1]] = true;
|
||||
collect_hla_hits(idx_pri, hla_chr, rpt_start, rpt_end, hla);
|
||||
}
|
||||
|
||||
// ALT genotyping
|
||||
if (mapQ >= opt.min_mapq && hits[reported_i].score >= opt.min_sc) {
|
||||
// collect all overlapping ALT contigs
|
||||
var alts = {};
|
||||
for (var i = 0; i < hits.length; ++i) {
|
||||
var h = hits[i];
|
||||
if (h.g == reported_g && weight_alt[h.ctg] != null)
|
||||
alts[h.ctg] = [h.score, h.NM];
|
||||
}
|
||||
if (ovlp_alt.length > 0) { // add other unreported hits
|
||||
for (var i = 0; i < ovlp_alt.length; ++i)
|
||||
if (ovlp_alt[i][0] <= rpt_start && rpt_end <= ovlp_alt[i][1] && alts[ovlp_alt[i][2]] == null)
|
||||
alts[ovlp_alt[i][2]] = [0, 0];
|
||||
}
|
||||
|
||||
// add weight to each ALT contig
|
||||
var alt_arr = [], max_sc = -1, max_i = -1, sum = 0, min_sc = 1<<30, max_nm = -1;
|
||||
for (var ctg in alts)
|
||||
alt_arr.push([ctg, alts[ctg][0], 0, alts[ctg][1]]);
|
||||
for (var i = 0; i < alt_arr.length; ++i) {
|
||||
if (max_sc < alt_arr[i][1])
|
||||
max_sc = alt_arr[i][1], max_i = i;
|
||||
min_sc = min_sc < alt_arr[i][1]? min_sc : alt_arr[i][1];
|
||||
var nm = alt_arr[i][1] > 0? alt_arr[i][3] : opt.max_nm_sc;
|
||||
max_nm = max_nm > nm? max_nm : nm;
|
||||
}
|
||||
if (max_nm > opt.max_nm_sc) max_nm = opt.max_nm_sc;
|
||||
if (max_sc > 0 && (alt_arr.length == 1 || min_sc < max_sc)) {
|
||||
for (var i = 0; i < alt_arr.length; ++i)
|
||||
sum += (alt_arr[i][2] = Math.pow(10, .6 * (alt_arr[i][1] - max_sc)));
|
||||
for (var i = 0; i < alt_arr.length; ++i) alt_arr[i][2] /= sum;
|
||||
for (var i = 0; i < alt_arr.length; ++i) {
|
||||
var e = [alt_arr[i][0], 1, alt_arr[max_i][2], alt_arr[i][2], max_nm, max_nm * alt_arr[max_i][2], max_nm * alt_arr[i][2]];
|
||||
var w = weight_alt[e[0]];
|
||||
for (var j = 0; j < 6; ++j) w[j] += e[j+1];
|
||||
if (fp_evi) {
|
||||
e[2] = e[2].toFixed(3); e[3] = e[3].toFixed(3);
|
||||
e[5] = e[5].toFixed(3); e[6] = e[6].toFixed(3);
|
||||
fp_evi.write(t[0] + '/' + (t[1]>>6&3) + '\t' + e.join("\t") + '\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check if the reported hit overlaps a hit to the primary assembly; if so, don't reduce mapping quality
|
||||
if (opt.update_mapq && mapQ > 0 && n_rpt_lifted <= 1) {
|
||||
// adjust the mapQ of the primary hits
|
||||
if (n_rpt_lifted <= 1) {
|
||||
var l = n_rpt_lifted == 1? rpt_lifted : null;
|
||||
for (var i = 0; i < buf2.length; ++i) {
|
||||
var s = buf2[i], is_ovlp = true;
|
||||
if (l != null) {
|
||||
if (l[0] != s[2]) is_ovlp = false; // different chr
|
||||
if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand
|
||||
var start = s[3] - 1, end = start;
|
||||
while ((m = re_cigar.exec(t[5])) != null)
|
||||
if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
|
||||
end += parseInt(m[1]);
|
||||
if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap
|
||||
else if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand
|
||||
else {
|
||||
var start = s[3] - 1, end = start;
|
||||
while ((m = re_cigar.exec(t[5])) != null)
|
||||
if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
|
||||
end += parseInt(m[1]);
|
||||
if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap
|
||||
}
|
||||
} else is_ovlp = false;
|
||||
// get the "pa" tag if present
|
||||
var om = -1, pa = 10.;
|
||||
for (var j = 11; j < s.length; ++j)
|
||||
if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
|
||||
|
|
@ -555,20 +478,6 @@ function bwa_postalt(args)
|
|||
}
|
||||
}
|
||||
|
||||
// generate reversed quality and reverse-complemented sequence if necessary
|
||||
var rs = null, rq = null; // reversed quality and reverse complement sequence
|
||||
var need_rev = false;
|
||||
for (var i = 0; i < hits.length; ++i) {
|
||||
if (hits[i].g != reported_g || i == reported_i) continue;
|
||||
if (hits[i].rev != hits[reported_i].rev)
|
||||
need_rev = true;
|
||||
}
|
||||
if (need_rev) { // reverse and reverse complement
|
||||
aux.length = 0;
|
||||
aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
|
||||
aux.set(t[10],0); aux.reverse(); rq = aux.toString();
|
||||
}
|
||||
|
||||
// stage the reported hit
|
||||
t[4] = mapQ;
|
||||
if (n_group0 > 1) t.push("om:i:"+ori_mapQ);
|
||||
|
|
@ -576,18 +485,23 @@ function bwa_postalt(args)
|
|||
buf2.push(t);
|
||||
|
||||
// stage the hits generated from the XA tag
|
||||
var cnt = 0;
|
||||
var cnt = 0, rs = null, rq = null; // rq: reverse quality; rs: reverse complement sequence
|
||||
var rg = (m = /\t(RG:Z:\S+)/.exec(line)) != null? m[1] : null;
|
||||
for (var i = 0; i < hits.length; ++i) {
|
||||
if (opt.verbose >= 5) print(obj2str(hits[i]));
|
||||
if (hits[i].g != reported_g || i == reported_i) continue;
|
||||
if (!opt.show_pri && idx_alt[hits[i].ctg] == null) continue;
|
||||
if (idx_alt[hits[i].ctg] == null) continue;
|
||||
var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, t[6], t[7], t[8]];
|
||||
if (t[6] == '=' && s[2] != t[2]) s[6] = t[2];
|
||||
// print sequence/quality and set the rev flag
|
||||
if (hits[i].rev == hits[reported_i].rev) {
|
||||
s.push(t[9], t[10]);
|
||||
s[1] = flag | 0x800;
|
||||
} else {
|
||||
} else { // we need to write the reverse sequence
|
||||
if (rs == null || rq == null) {
|
||||
aux.length = 0;
|
||||
aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
|
||||
aux.set(t[10],0); aux.reverse(); rq = aux.toString();
|
||||
}
|
||||
s.push(rs, rq);
|
||||
s[1] = (flag ^ 0x10) | 0x800;
|
||||
}
|
||||
|
|
@ -599,35 +513,12 @@ function bwa_postalt(args)
|
|||
}
|
||||
print_buffer(buf2, fp_hla, hla);
|
||||
file.close();
|
||||
if (fp_evi != null) fp_evi.close();
|
||||
if (fp_hla != null)
|
||||
for (var h in fp_hla)
|
||||
fp_hla[h].close();
|
||||
|
||||
buf.destroy();
|
||||
aux.destroy();
|
||||
|
||||
// print weight of each contig
|
||||
if (opt.pre != null) {
|
||||
var fpout = new File(opt.pre + '.ctw', "w");
|
||||
var weight_arr = [];
|
||||
var weight_hla = [];
|
||||
for (var ctg in weight_alt) {
|
||||
var w = weight_alt[ctg];
|
||||
weight_arr.push([ctg, w[6], w[7], w[8],
|
||||
w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000',
|
||||
w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']);
|
||||
weight_hla.push([ctg, w[4] > 0? w[5]/w[4] : 0]);
|
||||
}
|
||||
weight_arr.sort(function(a,b) {
|
||||
return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0;
|
||||
});
|
||||
for (var i = 0; i < weight_arr.length; ++i) {
|
||||
if (weight_arr[i][1] == '~') weight_arr[i][1] = '*';
|
||||
fpout.write(weight_arr[i].join("\t") + '\n');
|
||||
}
|
||||
fpout.close();
|
||||
}
|
||||
}
|
||||
|
||||
bwa_postalt(arguments);
|
||||
|
|
|
|||
7
bwa.1
7
bwa.1
|
|
@ -1,4 +1,4 @@
|
|||
.TH bwa 1 "19 May 2014" "bwa-0.7.9-r783" "Bioinformatics tools"
|
||||
.TH bwa 1 "18 November 2014" "bwa-0.7.11-r999" "Bioinformatics tools"
|
||||
.SH NAME
|
||||
.PP
|
||||
bwa - Burrows-Wheeler Alignment Tool
|
||||
|
|
@ -233,8 +233,9 @@ more aggressive read pair. [17]
|
|||
.B INPUT/OUTPUT OPTIONS:
|
||||
.TP
|
||||
.B -p
|
||||
Assume the first input query file is interleaved paired-end FASTA/Q. See the
|
||||
command description for details.
|
||||
Smart pairing. If two adjacent reads have the same name, they are considered
|
||||
to form a read pair. This way, paired-end and single-end reads can be mixed
|
||||
in a single FASTA/Q stream.
|
||||
.TP
|
||||
.BI -R \ STR
|
||||
Complete read group header line. '\\t' can be used in
|
||||
|
|
|
|||
21
bwa.c
21
bwa.c
|
|
@ -7,6 +7,7 @@
|
|||
#include "ksw.h"
|
||||
#include "utils.h"
|
||||
#include "kstring.h"
|
||||
#include "kvec.h"
|
||||
|
||||
#ifdef USE_MALLOC_WRAPPERS
|
||||
# include "malloc_wrap.h"
|
||||
|
|
@ -55,10 +56,12 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
|||
}
|
||||
trim_readno(&ks->name);
|
||||
kseq2bseq1(ks, &seqs[n]);
|
||||
seqs[n].id = n;
|
||||
size += seqs[n++].l_seq;
|
||||
if (ks2) {
|
||||
trim_readno(&ks2->name);
|
||||
kseq2bseq1(ks2, &seqs[n]);
|
||||
seqs[n].id = n;
|
||||
size += seqs[n++].l_seq;
|
||||
}
|
||||
if (size >= chunk_size && (n&1) == 0) break;
|
||||
|
|
@ -71,6 +74,24 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
|||
return seqs;
|
||||
}
|
||||
|
||||
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
|
||||
{
|
||||
int i, has_last;
|
||||
kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};
|
||||
for (i = 1, has_last = 1; i < n; ++i) {
|
||||
if (has_last) {
|
||||
if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
|
||||
kv_push(bseq1_t, a[1], seqs[i-1]);
|
||||
kv_push(bseq1_t, a[1], seqs[i]);
|
||||
has_last = 0;
|
||||
} else kv_push(bseq1_t, a[0], seqs[i-1]);
|
||||
} else has_last = 1;
|
||||
}
|
||||
if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]);
|
||||
sep[0] = a[0].a, m[0] = a[0].n;
|
||||
sep[1] = a[1].a, m[1] = a[1].n;
|
||||
}
|
||||
|
||||
/*****************
|
||||
* CIGAR related *
|
||||
*****************/
|
||||
|
|
|
|||
3
bwa.h
3
bwa.h
|
|
@ -23,7 +23,7 @@ typedef struct {
|
|||
} bwaidx_t;
|
||||
|
||||
typedef struct {
|
||||
int l_seq;
|
||||
int l_seq, id;
|
||||
char *name, *comment, *seq, *qual, *sam;
|
||||
} bseq1_t;
|
||||
|
||||
|
|
@ -35,6 +35,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
|
||||
void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
|
||||
|
||||
void bwa_fill_scmat(int a, int b, int8_t mat[25]);
|
||||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
|
||||
|
|
|
|||
1
bwamem.h
1
bwamem.h
|
|
@ -20,6 +20,7 @@ typedef struct __smem_i smem_i;
|
|||
#define MEM_F_ALN_REG 0x80
|
||||
#define MEM_F_REF_HDR 0x100
|
||||
#define MEM_F_SOFTCLIP 0x200
|
||||
#define MEM_F_SMARTPE 0x400
|
||||
|
||||
typedef struct {
|
||||
int a, b; // match score and mismatch penalty
|
||||
|
|
|
|||
|
|
@ -362,8 +362,13 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
|
|||
|
||||
no_pairing:
|
||||
for (i = 0; i < 2; ++i) {
|
||||
if (a[i].n && a[i].a[0].score >= opt->T)
|
||||
h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]);
|
||||
int which = -1;
|
||||
if (a[i].n) {
|
||||
if (a[i].a[0].score >= opt->T) which = 0;
|
||||
else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
|
||||
which = n_pri[i];
|
||||
}
|
||||
if (which >= 0) h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
|
||||
else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
|
||||
}
|
||||
if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,330 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>ActiveLayerIndex</key>
|
||||
<integer>0</integer>
|
||||
<key>ApplicationVersion</key>
|
||||
<array>
|
||||
<string>com.omnigroup.OmniGraffle</string>
|
||||
<string>139.18.0.187838</string>
|
||||
</array>
|
||||
<key>AutoAdjust</key>
|
||||
<true/>
|
||||
<key>BackgroundGraphic</key>
|
||||
<dict>
|
||||
<key>Bounds</key>
|
||||
<string>{{0, 0}, {576, 733}}</string>
|
||||
<key>Class</key>
|
||||
<string>SolidGraphic</string>
|
||||
<key>ID</key>
|
||||
<integer>2</integer>
|
||||
<key>Style</key>
|
||||
<dict>
|
||||
<key>shadow</key>
|
||||
<dict>
|
||||
<key>Draws</key>
|
||||
<string>NO</string>
|
||||
</dict>
|
||||
<key>stroke</key>
|
||||
<dict>
|
||||
<key>Draws</key>
|
||||
<string>NO</string>
|
||||
</dict>
|
||||
</dict>
|
||||
</dict>
|
||||
<key>BaseZoom</key>
|
||||
<integer>0</integer>
|
||||
<key>CanvasOrigin</key>
|
||||
<string>{0, 0}</string>
|
||||
<key>ColumnAlign</key>
|
||||
<integer>1</integer>
|
||||
<key>ColumnSpacing</key>
|
||||
<real>36</real>
|
||||
<key>CreationDate</key>
|
||||
<string>2014-11-17 16:51:42 +0000</string>
|
||||
<key>Creator</key>
|
||||
<string>Heng Li</string>
|
||||
<key>DisplayScale</key>
|
||||
<string>1 0/72 in = 1 0/72 in</string>
|
||||
<key>GraphDocumentVersion</key>
|
||||
<integer>8</integer>
|
||||
<key>GraphicsList</key>
|
||||
<array>
|
||||
<dict>
|
||||
<key>Bounds</key>
|
||||
<string>{{35.699992179870605, 151.89999580383301}, {476, 224}}</string>
|
||||
<key>Class</key>
|
||||
<string>ShapedGraphic</string>
|
||||
<key>FitText</key>
|
||||
<string>YES</string>
|
||||
<key>Flow</key>
|
||||
<string>Resize</string>
|
||||
<key>FontInfo</key>
|
||||
<dict>
|
||||
<key>Font</key>
|
||||
<string>AndaleMono</string>
|
||||
<key>Size</key>
|
||||
<real>12</real>
|
||||
</dict>
|
||||
<key>ID</key>
|
||||
<integer>28</integer>
|
||||
<key>Shape</key>
|
||||
<string>Rectangle</string>
|
||||
<key>Style</key>
|
||||
<dict>
|
||||
<key>fill</key>
|
||||
<dict>
|
||||
<key>Draws</key>
|
||||
<string>NO</string>
|
||||
</dict>
|
||||
<key>shadow</key>
|
||||
<dict>
|
||||
<key>Draws</key>
|
||||
<string>NO</string>
|
||||
</dict>
|
||||
<key>stroke</key>
|
||||
<dict>
|
||||
<key>Draws</key>
|
||||
<string>NO</string>
|
||||
</dict>
|
||||
</dict>
|
||||
<key>Text</key>
|
||||
<dict>
|
||||
<key>Align</key>
|
||||
<integer>0</integer>
|
||||
<key>Pad</key>
|
||||
<integer>0</integer>
|
||||
<key>Text</key>
|
||||
<string>{\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210
|
||||
\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 Consolas;\f1\fnil\fcharset0 Consolas-Bold;}
|
||||
{\colortbl;\red255\green255\blue255;\red0\green0\blue0;\red127\green127\blue127;\red255\green0\blue0;
|
||||
\red204\green204\blue204;\red0\green0\blue255;\red0\green128\blue0;\red255\green128\blue0;}
|
||||
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural
|
||||
|
||||
\f0\fs24 \cf2 Read: A\cf0 TCAGCATC\
|
||||
\cf2 \
|
||||
ALT ctg 1: \cf3 TGA\cf3 AA---CGAATGCAAATGGTCA
|
||||
\f1\b \cf4 ATCAGCATC
|
||||
\f0\b0 \cf3 GAACTAGTCACAT\cf2 \
|
||||
\cf3 |||||\cf5 (high div) \cf3 ||||||\cf5 (novel ins)\cf3 ||||||||||\cf2 \
|
||||
Chromosome:\cf3 GCGTACATGATACGA
|
||||
\f1\b \cf6 ATCgGCATC
|
||||
\f0\b0 \cf3 ATGGTC-------------CTAGTCACATCGTAATC\
|
||||
\cf2 \cf3 |||||||||||| ||||||||||\cf5 (novel ins) \cf3 ||||||||||\
|
||||
\cf2 ALT ctg 2:\cf3 TGATACGA
|
||||
\f1\b \cf7 ATCgcCATC
|
||||
\f0\b0 \cf3 ATGGTCA
|
||||
\f1\b \cf8 ATCgcCAgC
|
||||
\f0\b0 \cf3 GAACTAGTCACAT\
|
||||
\
|
||||
\cf2 4 potential hits:
|
||||
\f1\b \cf4 ATCAGCATC
|
||||
\f0\b0 \cf0 >
|
||||
\f1\b \cf6 ATCgGCATC
|
||||
\f0\b0 \cf0 >
|
||||
\f1\b \cf7 ATCgcCATC
|
||||
\f0\b0 \cf2 >
|
||||
\f1\b \cf8 ATCgcCAgC\
|
||||
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural
|
||||
|
||||
\f0\b0 \cf0 2 hit groups: \{
|
||||
\f1\b \cf4 ATCAGCATC
|
||||
\f0\b0 \cf0 ,
|
||||
\f1\b \cf8 ATCgcCAgC
|
||||
\f0\b0 \cf2 \} and\cf0 \{
|
||||
\f1\b \cf6 ATCgGCATC
|
||||
\f0\b0 \cf2 ,
|
||||
\f1\b \cf7 ATCgcCATC
|
||||
\f0\b0 \cf2 \}\
|
||||
\cf0 Hits considered in mapQ:
|
||||
\f1\b \cf4 ATCAGCATC
|
||||
\f0\b0 \cf0 and
|
||||
\f1\b \cf6 ATCgGCATC
|
||||
\f0\b0 \cf2 (best from each group)
|
||||
\f1\b \cf6 \
|
||||
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural
|
||||
|
||||
\f0\b0 \cf3 \
|
||||
\cf2 In the output SAM:
|
||||
\f1\b \cf6 ATCgGCATC
|
||||
\f0\b0 \cf2 as the primary SAM line with mapQ=0\
|
||||
|
||||
\f1\b \cf4 ATCAGCATC
|
||||
\f0\b0 \cf2 as a supplementary line with mapQ>0\
|
||||
|
||||
\f1\b \cf8 ATCgcCAgC
|
||||
\f0\b0 \cf2 as a supplementary line with mapQ>0\
|
||||
|
||||
\f1\b \cf7 ATCgcCATC
|
||||
\f0\b0 \cf2 in an XA tag, not as a separate line}</string>
|
||||
<key>VerticalPad</key>
|
||||
<integer>0</integer>
|
||||
</dict>
|
||||
<key>Wrap</key>
|
||||
<string>NO</string>
|
||||
</dict>
|
||||
</array>
|
||||
<key>GridInfo</key>
|
||||
<dict>
|
||||
<key>GridSpacing</key>
|
||||
<real>7.1999998092651367</real>
|
||||
<key>MajorGridSpacing</key>
|
||||
<integer>10</integer>
|
||||
<key>SnapsToGrid</key>
|
||||
<string>YES</string>
|
||||
</dict>
|
||||
<key>GuidesLocked</key>
|
||||
<string>NO</string>
|
||||
<key>GuidesVisible</key>
|
||||
<string>YES</string>
|
||||
<key>HPages</key>
|
||||
<integer>1</integer>
|
||||
<key>ImageCounter</key>
|
||||
<integer>1</integer>
|
||||
<key>KeepToScale</key>
|
||||
<false/>
|
||||
<key>Layers</key>
|
||||
<array>
|
||||
<dict>
|
||||
<key>Lock</key>
|
||||
<string>NO</string>
|
||||
<key>Name</key>
|
||||
<string>Layer 1</string>
|
||||
<key>Print</key>
|
||||
<string>YES</string>
|
||||
<key>View</key>
|
||||
<string>YES</string>
|
||||
</dict>
|
||||
</array>
|
||||
<key>LayoutInfo</key>
|
||||
<dict>
|
||||
<key>Animate</key>
|
||||
<string>NO</string>
|
||||
<key>circoMinDist</key>
|
||||
<real>18</real>
|
||||
<key>circoSeparation</key>
|
||||
<real>0.0</real>
|
||||
<key>layoutEngine</key>
|
||||
<string>dot</string>
|
||||
<key>neatoSeparation</key>
|
||||
<real>0.0</real>
|
||||
<key>twopiSeparation</key>
|
||||
<real>0.0</real>
|
||||
</dict>
|
||||
<key>LinksVisible</key>
|
||||
<string>NO</string>
|
||||
<key>MagnetsVisible</key>
|
||||
<string>NO</string>
|
||||
<key>MasterSheets</key>
|
||||
<array/>
|
||||
<key>ModificationDate</key>
|
||||
<string>2014-11-17 18:28:10 +0000</string>
|
||||
<key>Modifier</key>
|
||||
<string>Heng Li</string>
|
||||
<key>NotesVisible</key>
|
||||
<string>NO</string>
|
||||
<key>Orientation</key>
|
||||
<integer>2</integer>
|
||||
<key>OriginVisible</key>
|
||||
<string>NO</string>
|
||||
<key>PageBreaks</key>
|
||||
<string>YES</string>
|
||||
<key>PrintInfo</key>
|
||||
<dict>
|
||||
<key>NSBottomMargin</key>
|
||||
<array>
|
||||
<string>float</string>
|
||||
<string>41</string>
|
||||
</array>
|
||||
<key>NSHorizonalPagination</key>
|
||||
<array>
|
||||
<string>coded</string>
|
||||
<string>BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG</string>
|
||||
</array>
|
||||
<key>NSLeftMargin</key>
|
||||
<array>
|
||||
<string>float</string>
|
||||
<string>18</string>
|
||||
</array>
|
||||
<key>NSPaperSize</key>
|
||||
<array>
|
||||
<string>size</string>
|
||||
<string>{612, 792}</string>
|
||||
</array>
|
||||
<key>NSPrintReverseOrientation</key>
|
||||
<array>
|
||||
<string>int</string>
|
||||
<string>0</string>
|
||||
</array>
|
||||
<key>NSRightMargin</key>
|
||||
<array>
|
||||
<string>float</string>
|
||||
<string>18</string>
|
||||
</array>
|
||||
<key>NSTopMargin</key>
|
||||
<array>
|
||||
<string>float</string>
|
||||
<string>18</string>
|
||||
</array>
|
||||
</dict>
|
||||
<key>PrintOnePage</key>
|
||||
<false/>
|
||||
<key>ReadOnly</key>
|
||||
<string>NO</string>
|
||||
<key>RowAlign</key>
|
||||
<integer>1</integer>
|
||||
<key>RowSpacing</key>
|
||||
<real>36</real>
|
||||
<key>SheetTitle</key>
|
||||
<string>Canvas 1</string>
|
||||
<key>SmartAlignmentGuidesActive</key>
|
||||
<string>YES</string>
|
||||
<key>SmartDistanceGuidesActive</key>
|
||||
<string>YES</string>
|
||||
<key>UniqueID</key>
|
||||
<integer>1</integer>
|
||||
<key>UseEntirePage</key>
|
||||
<false/>
|
||||
<key>VPages</key>
|
||||
<integer>1</integer>
|
||||
<key>WindowInfo</key>
|
||||
<dict>
|
||||
<key>CurrentSheet</key>
|
||||
<integer>0</integer>
|
||||
<key>ExpandedCanvases</key>
|
||||
<array>
|
||||
<dict>
|
||||
<key>name</key>
|
||||
<string>Canvas 1</string>
|
||||
</dict>
|
||||
</array>
|
||||
<key>Frame</key>
|
||||
<string>{{367, 6}, {710, 872}}</string>
|
||||
<key>ListView</key>
|
||||
<true/>
|
||||
<key>OutlineWidth</key>
|
||||
<integer>142</integer>
|
||||
<key>RightSidebar</key>
|
||||
<false/>
|
||||
<key>ShowRuler</key>
|
||||
<true/>
|
||||
<key>Sidebar</key>
|
||||
<true/>
|
||||
<key>SidebarWidth</key>
|
||||
<integer>120</integer>
|
||||
<key>VisibleRegion</key>
|
||||
<string>{{0, 0}, {575, 733}}</string>
|
||||
<key>Zoom</key>
|
||||
<real>1</real>
|
||||
<key>ZoomValues</key>
|
||||
<array>
|
||||
<array>
|
||||
<string>Canvas 1</string>
|
||||
<real>1</real>
|
||||
<real>1</real>
|
||||
</array>
|
||||
</array>
|
||||
</dict>
|
||||
</dict>
|
||||
</plist>
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 47 KiB |
|
|
@ -0,0 +1,20 @@
|
|||
#!/bin/bash
|
||||
|
||||
ctg_opt=""
|
||||
if [ $# -gt 1 ] && [ $1 == '-A' ]; then
|
||||
ctg_opt="-A"
|
||||
shift
|
||||
fi
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <prefix>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for f in $1.HLA-*.fq; do
|
||||
gene=`echo $f | perl -pe 's/^.*(HLA-[A-Z]+[0-9]*).*fq$/$1/'`
|
||||
echo -e "\n*** Processing gene $gene...\n" >&2
|
||||
`dirname $0`/typeHLA.sh $ctg_opt $1 $gene
|
||||
done
|
||||
|
||||
ls $1.HLA-*.gt | xargs -i echo grep ^GT {} \| head -1 | sh | sed "s,^GT,$1,"
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Std;
|
||||
|
||||
my %opts = (t=>1);
|
||||
getopts("SadskHo:R:x:t:", \%opts);
|
||||
|
||||
die('
|
||||
Usage: run-bwamem [options] <idxbase> <file1> [file2]
|
||||
|
||||
Options: -o STR prefix for output files [inferred from input]
|
||||
-R STR read group header line such as \'@RG\tID:foo\tSM:bar\' [null]
|
||||
-x STR read type: pacbio, ont2d or intractg [default]
|
||||
intractg: intra-species contig (kb query, highly similar)
|
||||
pacbio: pacbio subreads (~10kb query, high error rate)
|
||||
ont2d: Oxford Nanopore reads (~10kb query, higher error rate)
|
||||
-t INT number of threads [1]
|
||||
|
||||
-a trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
|
||||
-d mark duplicate (via samblaster)
|
||||
-S for SAM/BAM input, don\'t shuffle
|
||||
-s sort the output alignment (requring more RAM)
|
||||
-H skip HLA typing
|
||||
-k keep temporary files generated by typeHLA
|
||||
|
||||
Examples:
|
||||
|
||||
* Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
|
||||
|
||||
run-bwamem -o prefix -t8 -R"@RG\tID:foo\tSM:bar" hs38d6.fa read1.fq.gz read2.fq.gz
|
||||
|
||||
* Remap coordinate-sorted BAM, trim Illumina PE adapters and sort the output. The BAM
|
||||
may contain single-end or paired-end reads, or a mixture of the two types. Read groups
|
||||
are not transferred to the output BAM, though.
|
||||
|
||||
run-bwamem -sao prefix hs38d6.fa old-srt.bam
|
||||
|
||||
* Remap name-grouped BAM and mark duplicates. Note that in this case, all reads from
|
||||
a single library should be aligned at the same time. Paired-end only.
|
||||
|
||||
run-bwamem -Sdo prefix hs38d6.fa old-unsrt.bam
|
||||
|
||||
Output files:
|
||||
|
||||
{-o}.aln.bam - final alignment
|
||||
{-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
|
||||
{-o}.hla.all - additional HLA genotypes consistent with data
|
||||
{-o}.log.* - log files
|
||||
|
||||
') if @ARGV < 2;
|
||||
|
||||
warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") unless defined($opts{R});
|
||||
|
||||
my $idx = $ARGV[0];
|
||||
|
||||
my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
|
||||
my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
|
||||
die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);
|
||||
|
||||
die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
|
||||
unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");
|
||||
|
||||
if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
|
||||
warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
|
||||
@ARGV = 2;
|
||||
}
|
||||
|
||||
if (defined($opts{p}) && @ARGV >= 3) {
|
||||
warn("WARNING: option -P is ignored as there are two input sequence files.\n");
|
||||
delete $opts{p};
|
||||
}
|
||||
|
||||
my $prefix;
|
||||
if (defined $opts{o}) {
|
||||
$prefix = $opts{o};
|
||||
} elsif (@ARGV >= 3) {
|
||||
my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
|
||||
my $i;
|
||||
for ($i = 0; $i < $len; ++$i) {
|
||||
last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
|
||||
}
|
||||
$prefix = substr($ARGV[1], 0, $i) if $i > 0;
|
||||
} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|sam|sam\.gz|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
|
||||
$prefix = $1;
|
||||
}
|
||||
die("ERROR: failed to identify the prefix for output. Please specify -p.\n") unless defined($prefix);
|
||||
|
||||
my $size = 0;
|
||||
my $comp_ratio = 3.;
|
||||
for my $f (@ARGV[1..$#ARGV]) {
|
||||
my @a = stat($f);
|
||||
my $s = $a[7];
|
||||
die("ERROR: failed to read file $f\n") if !defined($s);
|
||||
$s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
|
||||
$size += int($s) + 1;
|
||||
}
|
||||
|
||||
my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
|
||||
my $is_sam = $ARGV[1] =~ /\.(sam|sam\.gz)$/? 1 : 0;
|
||||
my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
|
||||
|
||||
if (defined($opts{x})) {
|
||||
delete($opts{d}); delete($opts{a}); delete $opts{p};
|
||||
}
|
||||
|
||||
my $cmd = '';
|
||||
if ($is_sam || $is_bam) {
|
||||
my $cmd_sam2bam = $is_sam? "$root/htsbox samview -uS $ARGV[1] \\\n" : "cat $ARGV[1] \\\n";
|
||||
my $ntmps = int($size / 4e9) + 1;
|
||||
my $cmd_shuf = ($is_bam || $is_sam) && !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
|
||||
my $cmd_bam2fq = " | $root/htsbox bam2fq -O - \\\n";
|
||||
$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
|
||||
} elsif (@ARGV >= 3) {
|
||||
$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
|
||||
} else {
|
||||
$cmd = "cat $ARGV[1] \\\n";
|
||||
}
|
||||
|
||||
my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
|
||||
|
||||
$cmd .= " | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a});
|
||||
$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
|
||||
$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
|
||||
|
||||
my $has_hla = 0;
|
||||
if (-f "$ARGV[0].alt") {
|
||||
my $fh;
|
||||
open($fh, "$ARGV[0].alt") || die;
|
||||
while (<$fh>) {
|
||||
$has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
|
||||
}
|
||||
close($fh);
|
||||
my $hla_pre = $has_hla? "-p $prefix.hla " : "";
|
||||
$cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
|
||||
}
|
||||
|
||||
my $t_sort = $opts{t} < 4? $opts{t} : 4;
|
||||
$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n";
|
||||
|
||||
if ($has_hla && !defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
|
||||
$cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
|
||||
$cmd .= "cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
|
||||
$cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
|
||||
}
|
||||
|
||||
print $cmd;
|
||||
|
||||
sub which
|
||||
{
|
||||
my $file = shift;
|
||||
my $path = (@_)? shift : $ENV{PATH};
|
||||
return if (!defined($path));
|
||||
foreach my $x (split(":", $path)) {
|
||||
$x =~ s/\/$//;
|
||||
return "$x/$file" if (-x "$x/$file");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
|
||||
|
||||
root=`dirname $0`
|
||||
|
||||
url38="ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
|
||||
url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 <hs38|hs38a|hs38d6|hs37|hs37d5>"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
if [ $1 == "hs38d6" ]; then
|
||||
(wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38d6-extra.fa) > $1.fa
|
||||
[ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38d6.fa.alt $1.fa.alt
|
||||
elif [ $1 == "hs38a" ]; then
|
||||
wget -O- $url38 | gzip -dc > $1.fa
|
||||
[ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38d6.fa.alt > $1.fa.alt
|
||||
elif [ $1 == "hs38" ]; then
|
||||
wget -O- $url38 | gzip -dc | awk '/^>/{f=/_alt/?0:1}f' > $1.fa
|
||||
elif [ $1 == "hs37d5" ]; then
|
||||
wget -O- $url37d5 | gzip -dc > $1.fa 2>/dev/null
|
||||
elif [ $1 == "hs37" ]; then
|
||||
wget -O- $url37d5 | gzip -dc 2>/dev/null | awk '/^>/{f=/>hs37d5/?0:1}f' > $1.fa
|
||||
else
|
||||
echo "ERROR: unknown genome build"
|
||||
fi
|
||||
|
||||
[ ! -f $1.fa.bwt ] && echo -e "\nPlease run 'bwa index $1.fa'...\n"
|
||||
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
var min_ovlp = 30;
|
||||
|
||||
if (arguments.length < 3) {
|
||||
print("Usage: k8 selctg.js <HLA-gene> <HLA-ALT-exons.bed> <ctg-to-ALT.sam> [min_ovlp="+min_ovlp+"]");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (arguments.length >= 4) min_ovlp = parseInt(arguments[3]);
|
||||
var gene = arguments[0];
|
||||
|
||||
var buf = new Bytes();
|
||||
|
||||
var h = {};
|
||||
var file = new File(arguments[1]);
|
||||
while (file.readline(buf) >= 0) {
|
||||
var t = buf.toString().split("\t");
|
||||
if (t[3] != gene) continue;
|
||||
if (h[t[0]] == null) h[t[0]] = [];
|
||||
h[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
|
||||
}
|
||||
file.close();
|
||||
|
||||
var s = {}, re = /(\d+)([MIDSHN])/g;
|
||||
file = new File(arguments[2]);
|
||||
while (file.readline(buf) >= 0) {
|
||||
var line = buf.toString();
|
||||
var m, t = line.split("\t");
|
||||
var x = h[t[2]];
|
||||
if (x == null) continue;
|
||||
|
||||
var start = parseInt(t[3]) - 1, end = start;
|
||||
while ((m = re.exec(t[5])) != null) // parse CIGAR to get the end position
|
||||
if (m[2] == 'M' || m[2] == 'D')
|
||||
end += parseInt(m[1]);
|
||||
|
||||
var max_ovlp = 0;
|
||||
for (var i = 0; i < x.length; ++i) {
|
||||
var max_left = x[i][0] > start? x[i][0] : start;
|
||||
var min_rght = x[i][1] < end ? x[i][1] : end;
|
||||
max_ovlp = max_ovlp > min_rght - max_left? max_ovlp : min_rght - max_left;
|
||||
}
|
||||
|
||||
var AS = null, XS = null;
|
||||
if ((m = /AS:i:(\d+)/.exec(line)) != null) AS = parseInt(m[1]);
|
||||
if ((m = /XS:i:(\d+)/.exec(line)) != null) XS = parseInt(m[1]);
|
||||
|
||||
if (s[t[0]] == null) s[t[0]] = [];
|
||||
s[t[0]].push([AS, XS, max_ovlp]);
|
||||
}
|
||||
file.close();
|
||||
|
||||
buf.destroy();
|
||||
|
||||
for (var x in s) {
|
||||
var is_rejected = false, y = s[x];
|
||||
y.sort(function(a,b) {return b[0]-a[0]});
|
||||
for (var i = 0; i < y.length && y[i][0] == y[0][0]; ++i)
|
||||
if (y[0][2] < min_ovlp || y[i][0] == y[i][1])
|
||||
is_rejected = true;
|
||||
if (is_rejected) continue;
|
||||
print(x);
|
||||
}
|
||||
|
|
@ -0,0 +1,496 @@
|
|||
/*****************************************************************
|
||||
* The K8 Javascript interpreter is required to run this script. *
|
||||
* *
|
||||
* Source code: https://github.com/attractivechaos/k8 *
|
||||
* Binary: http://sourceforge.net/projects/lh3/files/k8/ *
|
||||
*****************************************************************/
|
||||
|
||||
var getopt = function(args, ostr) {
|
||||
var oli; // option letter list index
|
||||
if (typeof(getopt.place) == 'undefined')
|
||||
getopt.ind = 0, getopt.arg = null, getopt.place = -1;
|
||||
if (getopt.place == -1) { // update scanning pointer
|
||||
if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
|
||||
++getopt.ind;
|
||||
getopt.place = -1;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
|
||||
if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
|
||||
if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
|
||||
if (getopt.place < 0) ++getopt.ind;
|
||||
return '?';
|
||||
}
|
||||
if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
|
||||
getopt.arg = null;
|
||||
if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
|
||||
} else { // need an argument
|
||||
if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
|
||||
getopt.arg = args[getopt.ind].substr(getopt.place);
|
||||
else if (args.length <= ++getopt.ind) { // no arg
|
||||
getopt.place = -1;
|
||||
if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
|
||||
return '?';
|
||||
} else getopt.arg = args[getopt.ind]; // white space
|
||||
getopt.place = -1;
|
||||
++getopt.ind;
|
||||
}
|
||||
return optopt;
|
||||
}
|
||||
|
||||
/************************
|
||||
* Command line parsing *
|
||||
************************/
|
||||
|
||||
var ver = "r19";
|
||||
var c, thres_len = 50, thres_ratio = .8, thres_nm = 5, thres_frac = .33, dbg = false;
|
||||
|
||||
// parse command line options
|
||||
while ((c = getopt(arguments, "vdl:n:f:")) != null) {
|
||||
if (c == 'l') thres_len = parseInt(getopt.arg);
|
||||
else if (c == 'n') thres_nm = parseInt(getopt.arg);
|
||||
else if (c == 'd') dbg = true;
|
||||
else if (c == 'f') thres_frac = parseFloat(getopt.arg);
|
||||
else if (c == 'v') { print(ver); exit(0); }
|
||||
}
|
||||
if (arguments.length == getopt.ind) {
|
||||
print("");
|
||||
print("Usage: k8 typeHLA.js [options] <exon-to-contig.sam>\n");
|
||||
print("Options: -n INT drop a contig if the edit distance to the closest gene is >INT ["+thres_nm+"]");
|
||||
print(" -l INT drop a contig if its match too short ["+thres_len+"]");
|
||||
print(" -f FLOAT drop inconsistent contigs if their length <FLOAT fraction of total length ["+thres_ratio.toFixed(2)+"]");
|
||||
print(" -d output extra info for debugging");
|
||||
print(" -v show version number");
|
||||
print("");
|
||||
print("Note: The output is TAB delimited with each GT line consisting of allele1, allele2,");
|
||||
print(" #mismatches/gaps on primary exons, #mismatches/gaps on other exons and #exons");
|
||||
print(" used in typing. If unusure, use the first GT line as the final genotype.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*********************************
|
||||
* Read gene-to-contig alignment *
|
||||
*********************************/
|
||||
|
||||
var file = new File(arguments[getopt.ind]);
|
||||
var buf = new Bytes();
|
||||
var re_cigar = /(\d+)([MIDSH])/g;
|
||||
|
||||
var len = {}, list = [], gcnt = [];
|
||||
while (file.readline(buf) >= 0) {
|
||||
var m, mm, line = buf.toString();
|
||||
var t = line.split("\t");
|
||||
var flag = parseInt(t[1]);
|
||||
// SAM header
|
||||
if (t[0].charAt(0) == '@') {
|
||||
if (t[0] == '@SQ' && (m = /LN:(\d+)/.exec(line)) != null && (mm = /SN:(\S+)/.exec(line)) != null)
|
||||
len[mm[1]] = parseInt(m[1]);
|
||||
continue;
|
||||
}
|
||||
// parse gene name and exon number
|
||||
var gene = null, exon = null;
|
||||
if ((m = /^(HLA-[^\s_]+)_(\d+)/.exec(t[0])) != null) {
|
||||
gene = m[1], exon = parseInt(m[2]) - 1;
|
||||
if (gcnt[exon] == null) gcnt[exon] = {};
|
||||
gcnt[exon][gene] = true;
|
||||
}
|
||||
if (gene == null || exon == null || t[2] == '*') continue;
|
||||
// parse clipping and aligned length
|
||||
var x = 0, ts = parseInt(t[3]) - 1, te = ts, clip = [0, 0];
|
||||
while ((m = re_cigar.exec(t[5])) != null) {
|
||||
var l = parseInt(m[1]);
|
||||
if (m[2] == 'M') x += l, te += l;
|
||||
else if (m[2] == 'I') x += l;
|
||||
else if (m[2] == 'D') te += l;
|
||||
else if (m[2] == 'S' || m[2] == 'H') clip[x==0?0:1] = l;
|
||||
}
|
||||
var tl = len[t[2]];
|
||||
var left = ts < clip[0]? ts : clip[0];
|
||||
var right = tl - te < clip[1]? tl - te : clip[1];
|
||||
var qs, qe, ql = clip[0] + x + clip[1];
|
||||
if (flag & 16) qs = clip[1], qe = ql - clip[0];
|
||||
else qs = clip[0], qe = ql - clip[1];
|
||||
var nm = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
|
||||
list.push([t[2], gene, exon, ts, te, nm, left + right, qs, qe, ql]); // left+right should be 0 given a prefix-suffix alignment
|
||||
}
|
||||
|
||||
buf.destroy();
|
||||
file.close();
|
||||
|
||||
/**************************************
|
||||
* Prepare data structures for typing *
|
||||
**************************************/
|
||||
|
||||
// identify the primary exons, the exons associated with most genes
|
||||
var pri_exon = [], n_pri_exons;
|
||||
{
|
||||
var cnt = [], max = 0;
|
||||
// count the number of genes per exon and track the max
|
||||
for (var e = 0; e < gcnt.length; ++e) {
|
||||
if (gcnt[e] != null) {
|
||||
var c = 0, h = gcnt[e];
|
||||
for (var x in h) ++c;
|
||||
cnt[e] = c;
|
||||
max = max > c? max : c;
|
||||
} else cnt[e] = 0;
|
||||
}
|
||||
warn("- Number of genes for each exon: [" +cnt.join(",") + "]");
|
||||
// find primary exons
|
||||
var pri_list = [];
|
||||
for (var e = 0; e < cnt.length; ++e) {
|
||||
if (cnt[e] == max) pri_list.push(e + 1);
|
||||
pri_exon[e] = cnt[e] == max? 1 : 0;
|
||||
}
|
||||
warn("- List of primary exon(s): ["+pri_list.join(",")+"]");
|
||||
n_pri_exons = pri_list.length;
|
||||
}
|
||||
|
||||
// convert strings to integers (for performance)
|
||||
var ghash = {}, glist = [], chash = {}, clist = [], elist = [];
|
||||
for (var i = 0; i < list.length; ++i) {
|
||||
if (ghash[list[i][1]] == null) {
|
||||
ghash[list[i][1]] = glist.length;
|
||||
glist.push(list[i][1]);
|
||||
}
|
||||
if (chash[list[i][0]] == null) {
|
||||
chash[list[i][0]] = clist.length;
|
||||
clist.push(list[i][0]);
|
||||
}
|
||||
var g = ghash[list[i][1]];
|
||||
if (elist[g] == null) elist[g] = {};
|
||||
elist[g][list[i][2]] = true;
|
||||
}
|
||||
|
||||
// extract the 3rd and 4th digits
|
||||
var gsub = [], gsuf = [];
|
||||
for (var i = 0; i < glist.length; ++i) {
|
||||
var m = /^HLA-[^*\s]+\*\d+:(\d+).*([A-Z]?)$/.exec(glist[i]);
|
||||
gsub[i] = parseInt(m[1]);
|
||||
gsuf[i] = /[A-Z]$/.test(glist[i])? 1 : 0;
|
||||
}
|
||||
|
||||
/*************************************************
|
||||
* Collect genes with perfect matches on primary *
|
||||
*************************************************/
|
||||
|
||||
// collect exons with fully covered by perfect match(es)
|
||||
var perf_exons = [];
|
||||
|
||||
function push_perf_exons(matches, last)
|
||||
{
|
||||
matches.sort(function(a, b) { return a[0]-b[0]; });
|
||||
var cov = 0, start = 0, end = 0;
|
||||
for (var i = 0; i < matches.length; ++i) {
|
||||
if (matches[i][3] > 0) continue;
|
||||
if (matches[i][0] <= end)
|
||||
end = end > matches[i][1]? end : matches[i][1];
|
||||
else cov += end - start, start = matches[i][0], end = matches[i][1];
|
||||
}
|
||||
cov += end - start;
|
||||
if (matches[0][2] == cov) {
|
||||
if (perf_exons[last[1]] == null) perf_exons[last[1]] = [];
|
||||
//print(last[0], last[1], ghash[last[0]]);
|
||||
perf_exons[last[1]].push(ghash[last[0]]);
|
||||
}
|
||||
}
|
||||
|
||||
var last = [null, -1], matches = [];
|
||||
for (var i = 0; i < list.length; ++i) {
|
||||
var li = list[i];
|
||||
if (last[0] != li[1] || last[1] != li[2]) {
|
||||
if (matches.length) push_perf_exons(matches, last);
|
||||
matches = [];
|
||||
last = [li[1], li[2]];
|
||||
}
|
||||
matches.push([li[7], li[8], li[9], li[5]+li[6]]);
|
||||
}
|
||||
if (matches.length) push_perf_exons(matches, last);
|
||||
|
||||
// for each gene, count how many primary exons are perfect
|
||||
var pg_aux_cnt = {};
|
||||
for (var e = 0; e < perf_exons.length; ++e) {
|
||||
if (!pri_exon[e]) continue;
|
||||
var pe = perf_exons[e];
|
||||
var n = pe? pe.length : 0;
|
||||
for (var i = 0; i < n; ++i) {
|
||||
var g = pe[i];
|
||||
if (pg_aux_cnt[g] == null) pg_aux_cnt[g] = 1;
|
||||
else ++pg_aux_cnt[g];
|
||||
}
|
||||
}
|
||||
|
||||
// find genes with perfect matches on the primary exons
|
||||
var perf_genes = [];
|
||||
for (var g in pg_aux_cnt)
|
||||
if (pg_aux_cnt[g] == n_pri_exons)
|
||||
perf_genes.push(parseInt(g));
|
||||
warn("- Found " +perf_genes.length+ " genes fully covered by perfect matches on the primary exon(s)");
|
||||
|
||||
var h_perf_genes = {};
|
||||
for (var i = 0; i < perf_genes.length; ++i) {
|
||||
if (dbg) print("PG", glist[perf_genes[i]]);
|
||||
h_perf_genes[perf_genes[i]] = true;
|
||||
}
|
||||
|
||||
/*******************
|
||||
* Filter hit list *
|
||||
*******************/
|
||||
|
||||
// reorganize hits to exons
|
||||
function list2exons(list, flt_flag, perf_hash)
|
||||
{
|
||||
var exons = [];
|
||||
for (var i = 0; i < list.length; ++i) {
|
||||
var li = list[i], c = chash[li[0]], g = ghash[li[1]];
|
||||
if (flt_flag != null && flt_flag[c] == 1) continue;
|
||||
if (perf_hash != null && !perf_hash[g]) continue;
|
||||
if (exons[li[2]] == null) exons[li[2]] = [];
|
||||
exons[li[2]].push([c, g, li[5] + li[6], li[4] - li[3]]);
|
||||
}
|
||||
return exons;
|
||||
}
|
||||
|
||||
var exons = list2exons(list), flt_flag = [], ovlp_len = [];
|
||||
for (var c = 0; c < clist.length; ++c) flt_flag[c] = ovlp_len[c] = 0;
|
||||
for (var e = 0; e < exons.length; ++e) {
|
||||
if (!pri_exon[e]) continue;
|
||||
var ee = exons[e];
|
||||
var max_len = [];
|
||||
for (var c = 0; c < clist.length; ++c) max_len[c] = 0;
|
||||
for (var i = 0; i < ee.length; ++i) {
|
||||
var l = ee[i][3] - ee[i][2];
|
||||
if (l < 1) l = 1;
|
||||
if (max_len[ee[i][0]] < l) max_len[ee[i][0]] = l;
|
||||
}
|
||||
for (var c = 0; c < clist.length; ++c) ovlp_len[c] += max_len[c];
|
||||
for (var i = 0; i < ee.length; ++i)
|
||||
flt_flag[ee[i][0]] |= (!h_perf_genes[ee[i][1]] || ee[i][2])? 1 : 1<<1;
|
||||
}
|
||||
|
||||
var l_cons = 0, l_incons = 0;
|
||||
for (var c = 0; c < clist.length; ++c)
|
||||
if (flt_flag[c]&2) l_cons += ovlp_len[c];
|
||||
else if (flt_flag[c] == 1) l_incons += ovlp_len[c];
|
||||
|
||||
warn("- Total length of contigs consistent/inconsistent with perfect genes: " +l_cons+ "/" +l_incons);
|
||||
var attempt_perf = (l_incons/(l_cons+l_incons) < thres_frac);
|
||||
|
||||
/********************************
|
||||
* Core function for genotyping *
|
||||
********************************/
|
||||
|
||||
function type_gene(perf_mode)
|
||||
{
|
||||
if (perf_mode) {
|
||||
var flt_list = [];
|
||||
for (var c = 0; c < clist.length; ++c)
|
||||
if (flt_flag[c] == 1) flt_list.push(clist[c]);
|
||||
warn(" - Filtered " +flt_list.length+ " inconsistent contig(s): [" +flt_list.join(",")+ "]");
|
||||
exons = list2exons(list, flt_flag, h_perf_genes);
|
||||
} else exons = list2exons(list);
|
||||
|
||||
/***********************
|
||||
* Score each genotype *
|
||||
***********************/
|
||||
|
||||
// initialize genotype scores
|
||||
var pair = [];
|
||||
for (var i = 0; i < glist.length; ++i) {
|
||||
pair[i] = [];
|
||||
for (var j = 0; j <= i; ++j)
|
||||
pair[i][j] = 0;
|
||||
}
|
||||
|
||||
// these two arrays are used to output debugging information
|
||||
var score = [], ctg = [];
|
||||
|
||||
function type_exon(e, gt_list)
|
||||
{
|
||||
function update_pair(x, m, is_pri)
|
||||
{
|
||||
var y, z;
|
||||
y = (x>>14&0xff) + m < 0xff? (x>>14&0xff) + m : 0xff;
|
||||
if (is_pri) z = (x>>22) + m < 0xff? (x>>22) + m : 0xff;
|
||||
else z = x>>22;
|
||||
return z<<22 | y<<14 | ((x&0x3fff) + (1<<6|is_pri));
|
||||
}
|
||||
|
||||
score[e] = []; ctg[e] = [];
|
||||
if (exons[e] == null) return;
|
||||
var ee = exons[e], is_pri = pri_exon[e]? 1 : 0;
|
||||
// find contigs and genes associated with the current exon
|
||||
var ch = {}, gh = {};
|
||||
for (var i = 0; i < ee.length; ++i)
|
||||
if (elist[ee[i][1]][e] != null)
|
||||
ch[ee[i][0]] = true, gh[ee[i][1]] = true;
|
||||
var ga = [], ca = ctg[e];
|
||||
for (var c in ch) ca.push(parseInt(c));
|
||||
for (var g in gh) ga.push(parseInt(g));
|
||||
var named_ca = [];
|
||||
for (var i = 0; i < ca.length; ++i) named_ca.push(clist[ca[i]]);
|
||||
warn(" - Processing exon "+(e+1)+" (" +ga.length+ " genes; " +ca.length+ " contigs: [" +named_ca.join(", ")+ "])...");
|
||||
// set unmapped entries to high mismatch
|
||||
var sc = score[e];
|
||||
for (var k = 0; k < ga.length; ++k) {
|
||||
var g = ga[k];
|
||||
if (sc[g] == null) sc[g] = [];
|
||||
for (var i = 0; i < ca.length; ++i)
|
||||
sc[g][ca[i]] = 0xff;
|
||||
}
|
||||
// convert representation again and compute max_len[]
|
||||
var max_len = [];
|
||||
for (var i = 0; i < ee.length; ++i) {
|
||||
var c = ee[i][0], g = ee[i][1];
|
||||
if (gh[g] == null || ch[c] == null) continue;
|
||||
sc[g][c] = sc[g][c] < ee[i][2]? sc[g][c] : ee[i][2];
|
||||
if (max_len[c] == null) max_len[c] = 0;
|
||||
max_len[c] = max_len[c] > ee[i][3]? max_len[c] : ee[i][3];
|
||||
}
|
||||
// drop mismapped contigs
|
||||
var max_max_len = 0;
|
||||
for (var k = 0; k < ca.length; ++k)
|
||||
max_max_len = max_max_len > max_len[ca[k]]? max_max_len : max_len[ca[k]];
|
||||
var dropped = [];
|
||||
for (var k = 0; k < ca.length; ++k) {
|
||||
var min = 0x7fffffff, c = ca[k];
|
||||
for (var i = 0; i < ga.length; ++i) {
|
||||
var g = ga[i];
|
||||
min = min < sc[g][c]? min : sc[g][c];
|
||||
}
|
||||
dropped[c] = min > thres_nm? true : false;
|
||||
if (max_len[c] < thres_len && max_len[c] < thres_ratio * max_max_len) dropped[c] = true;
|
||||
if (dropped[c]) warn(" . Dropped low-quality contig " +clist[c]+ " (minNM=" +min+ "; maxLen=" +max_len[c]+ ")");
|
||||
}
|
||||
// fill the pair array
|
||||
if (gt_list == null) {
|
||||
for (var i = 0; i < ga.length; ++i) {
|
||||
var m = 0, gi = ga[i], g1 = sc[gi];
|
||||
// homozygous
|
||||
for (var k = 0; k < ca.length; ++k) {
|
||||
var c = ca[k];
|
||||
if (!dropped[c]) m += g1[c];
|
||||
}
|
||||
pair[gi][gi] = update_pair(pair[gi][gi], m, is_pri);
|
||||
// heterozygous
|
||||
for (var j = i + 1; j < ga.length; ++j) {
|
||||
var gj = ga[j], g2 = sc[gj], m = 0, a = [0, 0];
|
||||
for (var k = 0; k < ca.length; ++k) {
|
||||
var c = ca[k];
|
||||
if (!dropped[c]) {
|
||||
m += g1[c] < g2[c]? g1[c] : g2[c];
|
||||
++a[g1[c]<g2[c]? 0:1];
|
||||
}
|
||||
}
|
||||
if (a[0] == 0 || a[1] == 0) m = 0xff; // if all contigs are assigned to one gene, it is not good
|
||||
if (gi < gj) pair[gj][gi] = update_pair(pair[gj][gi], m, is_pri);
|
||||
else pair[gi][gj] = update_pair(pair[gi][gj], m, is_pri);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
var tmp_pairs = [], min = 0xff;
|
||||
for (var i = 0; i < gt_list.length; ++i) {
|
||||
var gt = gt_list[i], m = 0;
|
||||
var g1 = sc[gt[0]], g2 = sc[gt[1]], a = [0, 0];
|
||||
if (g1 == null || g2 == null) continue;
|
||||
if (gt[0] == gt[1]) {
|
||||
for (var k = 0; k < ca.length; ++k) {
|
||||
var c = ca[k];
|
||||
if (!dropped[c]) m += g1[c];
|
||||
}
|
||||
} else {
|
||||
var a = [0, 0];
|
||||
for (k = 0; k < ca.length; ++k) {
|
||||
var c = ca[k];
|
||||
if (!dropped[c]) {
|
||||
m += g1[c] < g2[c]? g1[c] : g2[c];
|
||||
++a[g1[c]<g2[c]? 0:1];
|
||||
}
|
||||
}
|
||||
if (a[0] == 0 || a[1] == 0) m = 0xff;
|
||||
}
|
||||
tmp_pairs.push([gt[0], gt[1], m]);
|
||||
min = min < m? min : m;
|
||||
}
|
||||
if (min < 0xff) {
|
||||
for (var i = 0; i < tmp_pairs.length; ++i) {
|
||||
var t = tmp_pairs[i];
|
||||
pair[t[0]][t[1]] = update_pair(pair[t[0]][t[1]], t[2], is_pri);
|
||||
}
|
||||
} else warn(" . Skipped exon " +(e+1)+ " as the assembly may be incomplete");
|
||||
}
|
||||
}
|
||||
|
||||
// type primary exons
|
||||
warn(" - Processing primary exon(s)...");
|
||||
for (var e = 0; e < exons.length; ++e)
|
||||
if (pri_exon[e]) type_exon(e);
|
||||
|
||||
// generate the list of best genotypes on primary exons
|
||||
var min_nm_pri = 0x7fffffff;
|
||||
for (var i = 0; i < glist.length; ++i)
|
||||
for (var j = 0; j <= i; ++j)
|
||||
if ((pair[i][j]&63) == n_pri_exons)
|
||||
min_nm_pri = min_nm_pri < pair[i][j]>>22? min_nm_pri : pair[i][j]>>22;
|
||||
|
||||
var gt_list = [];
|
||||
for (var i = 0; i < glist.length; ++i)
|
||||
for (var j = 0; j <= i; ++j)
|
||||
if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>22 == min_nm_pri)
|
||||
gt_list.push([i, j]);
|
||||
|
||||
warn(" - Collected " +gt_list.length+ " top genotypes on the primary exon(s); minimal edit distance: " +min_nm_pri);
|
||||
|
||||
// type other exons
|
||||
warn(" - Processing other exon(s)...");
|
||||
for (var e = 0; e < exons.length; ++e)
|
||||
if (!pri_exon[e]) type_exon(e, gt_list);
|
||||
|
||||
/*****************************
|
||||
* Choose the best genotypes *
|
||||
*****************************/
|
||||
|
||||
// genotyping
|
||||
var min_nm = 0x7fffffff;
|
||||
for (var i = 0; i < glist.length; ++i)
|
||||
for (var j = 0; j <= i; ++j)
|
||||
if ((pair[i][j]&63) == n_pri_exons)
|
||||
min_nm = min_nm < pair[i][j]>>14? min_nm : pair[i][j]>>14;
|
||||
|
||||
var out = [];
|
||||
for (var i = 0; i < glist.length; ++i)
|
||||
for (var j = 0; j <= i; ++j)
|
||||
if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>14 <= min_nm + 1)
|
||||
out.push([pair[i][j]>>14, pair[i][j]>>6&0xff, i, j, (gsuf[i] + gsuf[j])<<16|(gsub[i] + gsub[j])]);
|
||||
|
||||
out.sort(function(a, b) { return a[0]!=b[0]? a[0]-b[0] : a[1]!=b[1]? b[1]-a[1] : a[4]!=b[4]? a[4]-b[4] : a[2]!=b[2]? a[2]-b[2] : a[3]-b[3]});
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
/**********************
|
||||
* Perform genotyping *
|
||||
**********************/
|
||||
|
||||
warn("- Typing in the imperfect mode...");
|
||||
var rst = type_gene(false);
|
||||
if (attempt_perf) {
|
||||
warn("- Typing in the perfect mode...");
|
||||
var rst_perf = type_gene(true);
|
||||
warn("- Imperfect vs perfect mode: [" +(rst[0][0]>>8&0xff)+ "," +(rst[0][0]&0xff)+ "] vs [" +(rst_perf[0][0]>>8&0xff)+ "," +(rst_perf[0][0]&0xff)+ "]");
|
||||
if (rst_perf[0][0] < rst[0][0]) {
|
||||
warn("- Chose the result from the perfect mode");
|
||||
rst = rst_perf;
|
||||
} else warn("- Chose the result from the imperfect mode");
|
||||
} else warn("- Perfect mode is not attempted");
|
||||
|
||||
/**********
|
||||
* Output *
|
||||
**********/
|
||||
|
||||
for (var i = 0; i < rst.length; ++i)
|
||||
print("GT", glist[rst[i][3]], glist[rst[i][2]], rst[i][0]>>8&0xff, rst[i][0]&0xff, rst[i][1]);
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
#!/bin/bash
|
||||
|
||||
is_ctg=0
|
||||
|
||||
if [ $# -gt 1 ] && [ $1 == '-A' ]; then
|
||||
is_ctg=1
|
||||
shift
|
||||
fi
|
||||
|
||||
if [ $# -lt 2 ]; then
|
||||
echo "Usage: $0 [-A] <prefix> <gene>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
preres="resource-human-HLA"
|
||||
root=`dirname $0`
|
||||
pre=$1.$2
|
||||
|
||||
if [ ! -s $pre.fq ]; then
|
||||
echo '** Empty input file. Abort!' >&2
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ $is_ctg -eq 0 ]; then
|
||||
echo "** De novo assembling..." >&2
|
||||
len=`$root/seqtk comp $pre.fq | awk '{++x;y+=$2}END{printf("%.0f\n", y/x)}'`
|
||||
$root/fermi2.pl unitig -f $root/fermi2 -r $root/ropebwt2 -t2 -l$len -p $pre.tmp $pre.fq > $pre.tmp.mak
|
||||
make -f $pre.tmp.mak >&2
|
||||
cp $pre.tmp.mag.gz $pre.mag.gz
|
||||
else
|
||||
rm -f $pre.tmp.mag.gz
|
||||
ln -s $pre.fq $pre.tmp.mag.gz
|
||||
fi
|
||||
|
||||
echo "** Selecting contigs overlapping target exons..." >&2
|
||||
(ls $root/$preres/HLA-ALT-idx/*.fa.bwt | sed s,.bwt,, | xargs -i $root/bwa mem -t2 -B1 -O1 -E1 {} $pre.tmp.mag.gz 2>/dev/null) | grep -v ^@ | sort -k3,3 -k4,4n | gzip > $pre.tmp.ALT.sam.gz
|
||||
$root/k8 $root/typeHLA-selctg.js $2 $root/$preres/HLA-ALT-exons.bed $pre.tmp.ALT.sam.gz | $root/seqtk subseq $pre.tmp.mag.gz - | gzip -1 > $pre.tmp.fq.gz
|
||||
|
||||
echo "** Mapping exons to de novo contigs..." >&2
|
||||
$root/bwa index -p $pre.tmp $pre.tmp.fq.gz 2>/dev/null
|
||||
$root/seqtk comp $root/$preres/HLA-CDS.fa | cut -f1 | grep ^$2 | $root/seqtk subseq $root/$preres/HLA-CDS.fa - | $root/bwa mem -aD.1 -t2 $pre.tmp - 2>/dev/null | gzip -1 > $pre.sam.gz
|
||||
|
||||
echo "** Typing..." >&2
|
||||
$root/k8 $root/typeHLA.js $pre.sam.gz > $pre.gt
|
||||
|
||||
# delete temporary files
|
||||
rm -f $pre.tmp.*
|
||||
[ $is_ctg -eq 1 ] && rm -f $pre.mag.gz
|
||||
36
fastmap.c
36
fastmap.c
|
|
@ -55,7 +55,7 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
opt = mem_opt_init();
|
||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||
else if (c == 'x') mode = optarg;
|
||||
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
|
||||
|
|
@ -66,7 +66,7 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
||||
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
||||
else if (c == 'a') opt->flag |= MEM_F_ALL;
|
||||
else if (c == 'p') opt->flag |= MEM_F_PE;
|
||||
else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
|
||||
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
|
||||
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
|
||||
else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
|
||||
|
|
@ -87,6 +87,7 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
|
||||
else if (c == 'C') copy_comment = 1;
|
||||
else if (c == 'K') fixed_chunk_size = atoi(optarg);
|
||||
else if (c == 'X') opt->mask_level = atof(optarg);
|
||||
else if (c == 'h') {
|
||||
opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
|
||||
opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
|
||||
|
|
@ -166,7 +167,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n");
|
||||
// fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
|
||||
fprintf(stderr, "\nInput/output options:\n\n");
|
||||
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
|
||||
fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n");
|
||||
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
|
||||
fprintf(stderr, " -j ignore ALT contigs\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
|
@ -247,7 +248,7 @@ int main_mem(int argc, char *argv[])
|
|||
if (optind + 2 < argc) {
|
||||
if (opt->flag&MEM_F_PE) {
|
||||
if (bwa_verbose >= 2)
|
||||
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
|
||||
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
|
||||
} else {
|
||||
ko2 = kopen(argv[optind + 2], &fd2);
|
||||
if (ko2 == 0) {
|
||||
|
|
@ -264,11 +265,6 @@ int main_mem(int argc, char *argv[])
|
|||
actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
|
||||
while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) {
|
||||
int64_t size = 0;
|
||||
if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
|
||||
if (bwa_verbose >= 2)
|
||||
fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__);
|
||||
n = n>>1<<1;
|
||||
}
|
||||
if (!copy_comment)
|
||||
for (i = 0; i < n; ++i) {
|
||||
free(seqs[i].comment); seqs[i].comment = 0;
|
||||
|
|
@ -276,7 +272,27 @@ int main_mem(int argc, char *argv[])
|
|||
for (i = 0; i < n; ++i) size += seqs[i].l_seq;
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
|
||||
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
|
||||
if (opt->flag & MEM_F_SMARTPE) {
|
||||
bseq1_t *sep[2];
|
||||
int i, n_sep[2];
|
||||
mem_opt_t tmp_opt = *opt;
|
||||
bseq_classify(n, seqs, n_sep, sep);
|
||||
if (bwa_verbose >= 3)
|
||||
fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
|
||||
if (n_sep[0]) {
|
||||
tmp_opt.flag &= ~MEM_F_PE;
|
||||
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed, n_sep[0], sep[0], 0);
|
||||
for (i = 0; i < n_sep[0]; ++i)
|
||||
seqs[sep[0][i].id].sam = sep[0][i].sam;
|
||||
}
|
||||
if (n_sep[1]) {
|
||||
tmp_opt.flag |= MEM_F_PE;
|
||||
mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, n_processed + n_sep[0], n_sep[1], sep[1], pes0);
|
||||
for (i = 0; i < n_sep[1]; ++i)
|
||||
seqs[sep[1][i].id].sam = sep[1][i].sam;
|
||||
}
|
||||
free(sep[0]); free(sep[1]);
|
||||
} else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
|
||||
n_processed += n;
|
||||
for (i = 0; i < n; ++i) {
|
||||
if (seqs[i].sam) err_fputs(seqs[i].sam, stdout);
|
||||
|
|
|
|||
18
kseq.h
18
kseq.h
|
|
@ -54,9 +54,9 @@
|
|||
#define __KS_BASIC(type_t, __bufsize) \
|
||||
static inline kstream_t *ks_init(type_t f) \
|
||||
{ \
|
||||
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
|
||||
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
|
||||
ks->f = f; \
|
||||
ks->buf = (unsigned char*)malloc(__bufsize); \
|
||||
ks->buf = (unsigned char*)malloc(__bufsize); \
|
||||
return ks; \
|
||||
} \
|
||||
static inline void ks_destroy(kstream_t *ks) \
|
||||
|
|
@ -74,8 +74,7 @@
|
|||
if (ks->begin >= ks->end) { \
|
||||
ks->begin = 0; \
|
||||
ks->end = __read(ks->f, ks->buf, __bufsize); \
|
||||
if (ks->end < __bufsize) ks->is_eof = 1; \
|
||||
if (ks->end == 0) return -1; \
|
||||
if (ks->end == 0) { ks->is_eof = 1; return -1;} \
|
||||
} \
|
||||
return (int)ks->buf[ks->begin++]; \
|
||||
}
|
||||
|
|
@ -95,17 +94,16 @@ typedef struct __kstring_t {
|
|||
#define __KS_GETUNTIL(__read, __bufsize) \
|
||||
static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
|
||||
{ \
|
||||
int gotany = 0; \
|
||||
if (dret) *dret = 0; \
|
||||
str->l = append? str->l : 0; \
|
||||
if (ks->begin >= ks->end && ks->is_eof) return -1; \
|
||||
for (;;) { \
|
||||
int i; \
|
||||
if (ks->begin >= ks->end) { \
|
||||
if (!ks->is_eof) { \
|
||||
ks->begin = 0; \
|
||||
ks->end = __read(ks->f, ks->buf, __bufsize); \
|
||||
if (ks->end < __bufsize) ks->is_eof = 1; \
|
||||
if (ks->end == 0) break; \
|
||||
if (ks->end == 0) { ks->is_eof = 1; break; } \
|
||||
} else break; \
|
||||
} \
|
||||
if (delimiter == KS_SEP_LINE) { \
|
||||
|
|
@ -124,8 +122,9 @@ typedef struct __kstring_t {
|
|||
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
|
||||
str->m = str->l + (i - ks->begin) + 1; \
|
||||
kroundup32(str->m); \
|
||||
str->s = (char*)realloc(str->s, str->m); \
|
||||
str->s = (char*)realloc(str->s, str->m); \
|
||||
} \
|
||||
gotany = 1; \
|
||||
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
|
||||
str->l = str->l + (i - ks->begin); \
|
||||
ks->begin = i + 1; \
|
||||
|
|
@ -134,6 +133,7 @@ typedef struct __kstring_t {
|
|||
break; \
|
||||
} \
|
||||
} \
|
||||
if (!gotany && ks_eof(ks)) return -1; \
|
||||
if (str->s == 0) { \
|
||||
str->m = 1; \
|
||||
str->s = (char*)calloc(1, 1); \
|
||||
|
|
@ -155,7 +155,7 @@ typedef struct __kstring_t {
|
|||
#define __KSEQ_BASIC(SCOPE, type_t) \
|
||||
SCOPE kseq_t *kseq_init(type_t fd) \
|
||||
{ \
|
||||
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
|
||||
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
|
||||
s->f = ks_init(fd); \
|
||||
return s; \
|
||||
} \
|
||||
|
|
|
|||
Loading…
Reference in New Issue