coarse HLA typing and doc
This commit is contained in:
parent
3d129be642
commit
f005ea4bf3
|
|
@ -23,8 +23,12 @@ bwa mem hs38a.fa read1.fq read2.fq \
|
|||
| bwa-hs38-bundle/k8-linux bwa-postalt.js hs38a.fa.alt \
|
||||
| samtools view -bS - > aln.unsrt.bam
|
||||
```
|
||||
For short reads, the postprocessing script `bwa-postalt.js` runs at about the
|
||||
same speed as BAM compression.
|
||||
|
||||
In the final alignment, a read may be placed on the [primary assembly][grcdef]
|
||||
and multiple overlapping ALT contigs at the same time (on multiple SAM lines).
|
||||
Mapping quality (mapQ) is properly adjusted by the postprocessing script
|
||||
`bwa-postalt.js` using the ALT-to-reference alignment `hs38a.fa.alt`. For
|
||||
details, see the [Methods section](#methods).
|
||||
|
||||
#### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes
|
||||
|
||||
|
|
@ -40,14 +44,20 @@ bwa mem hs38d4.fa read1.fq read2.fq \
|
|||
| bwa-hs38-bundle/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \
|
||||
| samtools view -bS - > aln.unsrt.bam
|
||||
```
|
||||
This command line generates `postinfo.ctw` which loosely evaluates the presence
|
||||
of an ALT contig with an empirical score at the last column.
|
||||
The benefit of this option is to have a more complete reference sequence and
|
||||
to facilitate HLA typing with a 3rd-party tool (see below).
|
||||
|
||||
***If you are not interested in the way BWA-MEM performs ALT mapping, you can
|
||||
skip the rest of this documentation.***
|
||||
|
||||
## Background
|
||||
|
||||
GRCh38 consists of several components: chromosomal assembly, unlocalized contigs
|
||||
(chromosome known but location unknown), unplaced contigs (chromosome unknown)
|
||||
and ALT contigs (long clustered variations). The combination of the first three
|
||||
components is called the *primary assembly*. You can find the more exact
|
||||
definitions from the [GRC website][grcdef].
|
||||
|
||||
GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions.
|
||||
However, sequences that are highly diverged from the primary assembly only
|
||||
contribute a few million bp. Most subsequences of ALT contigs are highly similar
|
||||
|
|
@ -113,7 +123,7 @@ pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a
|
|||
Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the
|
||||
option 2 above.
|
||||
|
||||
### On the Completeness of GRCh38+ALT
|
||||
### On the completeness of GRCh38+ALT
|
||||
|
||||
While GRCh38 is much more complete than GRCh37, it is still missing some true
|
||||
human sequences. To make sure every piece of sequence in the reference assembly
|
||||
|
|
@ -134,6 +144,27 @@ In addition to decoy, we also put multiple alleles of HLA genes in
|
|||
version 3.18.0. Script `bwa-postalt.js` also helps to genotype HLA genes, though
|
||||
not to high resolution for now.
|
||||
|
||||
### More on HLA typing
|
||||
|
||||
It is [well known][hlalink] that HLA genes are associated with many autoimmune
|
||||
diseases as well as some others not directly related to the immune system.
|
||||
However, many HLA alleles are highly diverged from the reference genome. If we
|
||||
map whole-genome shotgun (WGS) reads to the reference only, many
|
||||
allele-informative will get lost. As a result, the vast majority of WGS projects
|
||||
have ignored these important genes.
|
||||
|
||||
We recommend to include the genomic regions of classical HLA genes in the BWA
|
||||
index. This way we will be able to get a more complete collection of reads
|
||||
mapped to HLA. We can then isolate these reads with little computational cost
|
||||
and type HLA genes with another program, such as [Dilthey et al (2014)][hla1] or
|
||||
one from [this list][hlatools].
|
||||
|
||||
If the postprocessing script `bwa-postalt.js` is invoked with `-p prefix`, it
|
||||
will also write the top three alleles to file `prefix.hla`. However, as most HLA
|
||||
alleles from IMGT/HLA don't have intronic sequences and thus are not included in
|
||||
the reference genome, we are unable to type HLA genes to high resolution with
|
||||
the BWA-MEM mapping alone. A dedicated tool is recommended for accurate typing.
|
||||
|
||||
### Evaluating ALT Mapping
|
||||
|
||||
(Coming soon...)
|
||||
|
|
@ -159,3 +190,7 @@ can even get rid of ALT contigs for good.
|
|||
[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome
|
||||
[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/
|
||||
[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/
|
||||
[grcdef]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml
|
||||
[hla1]: http://biorxiv.org/content/early/2014/07/08/006973
|
||||
[hlalink]: http://www.hladiseaseassociations.com
|
||||
[hlatools]: https://www.biostars.org/p/93245/
|
||||
|
|
|
|||
|
|
@ -203,6 +203,44 @@ function parse_hit(s, opt)
|
|||
return h;
|
||||
}
|
||||
|
||||
function type_hla(w)
|
||||
{
|
||||
var hla = ["A", "B", "C", "DQA1", "DQB1", "DRB1"];
|
||||
var hla_hash = {}, a = [], r = [];
|
||||
for (var i = 0; i < hla.length; ++i) {
|
||||
hla_hash[hla[i]] = i;
|
||||
a[i] = [];
|
||||
}
|
||||
for (var i = 0; i < w.length; ++i) {
|
||||
var t = w[i][0].split(/[:\*]/);
|
||||
var x = hla_hash[t[0]];
|
||||
if (x != null)
|
||||
a[x].push([w[i][0], t[1], t[1] + ':' + t[2], w[i][1]]);
|
||||
}
|
||||
for (var k = 1; k <= 2; ++k) {
|
||||
for (var i = 0; i < hla.length; ++i) {
|
||||
var ai = a[i], m = {};
|
||||
for (var j = 0; j < ai.length; ++j) {
|
||||
var key = ai[j][k], val = ai[j][3];
|
||||
if (m[key] == null) m[key] = [-1, -1.0];
|
||||
if (m[key][1] < val) m[key] = [j, val];
|
||||
}
|
||||
var sum = 0;
|
||||
for (var x in m) sum += m[x][1];
|
||||
var max = -1, max2 = -1, max3 = -1, max_x, max_x2, max_x3;
|
||||
for (var x in m) {
|
||||
if (max < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = max, max_x2 = max_x, max = m[x][1], max_x = x;
|
||||
else if (max2 < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = m[x][1], max_x2 = x;
|
||||
else if (max3 < m[x][1]) max3 = m[x][1], max_x3 = x;
|
||||
}
|
||||
r.push([hla[i], k, hla[i]+'*'+max_x, max.toFixed(3), hla[i]+'*'+max_x2, max2.toFixed(3), hla[i]+'*'+max_x3, max3.toFixed(3)]);
|
||||
}
|
||||
}
|
||||
for (var i = 0; i < r.length; ++i)
|
||||
print(r[i].join("\t"));
|
||||
return r;
|
||||
}
|
||||
|
||||
function bwa_postalt(args)
|
||||
{
|
||||
var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, update_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false, min_pa_ratio:1 };
|
||||
|
|
@ -574,11 +612,13 @@ function bwa_postalt(args)
|
|||
if (opt.pre != null) {
|
||||
var fpout = new File(opt.pre + '.ctw', "w");
|
||||
var weight_arr = [];
|
||||
var weight_hla = [];
|
||||
for (var ctg in weight_alt) {
|
||||
var w = weight_alt[ctg];
|
||||
weight_arr.push([ctg, w[6], w[7], w[8],
|
||||
w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000',
|
||||
w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']);
|
||||
weight_hla.push([ctg, w[4] > 0? w[5]/w[4] : 0]);
|
||||
}
|
||||
weight_arr.sort(function(a,b) {
|
||||
return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0;
|
||||
|
|
@ -588,6 +628,12 @@ function bwa_postalt(args)
|
|||
fpout.write(weight_arr[i].join("\t") + '\n');
|
||||
}
|
||||
fpout.close();
|
||||
|
||||
var r = type_hla(weight_hla);
|
||||
fpout = new File(opt.pre + '.hla', "w");
|
||||
for (var i = 0; i < r.length; ++i)
|
||||
fpout.write(r[i].join("\t") + '\n');
|
||||
fpout.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue