From f005ea4bf3f646e7b13ceca87de380f9f1bf7b8e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 23 Oct 2014 14:42:45 -0400 Subject: [PATCH] coarse HLA typing and doc --- README-alt.md | 45 ++++++++++++++++++++++++++++++++++++++++----- bwa-postalt.js | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 5 deletions(-) diff --git a/README-alt.md b/README-alt.md index e57313f..61839a7 100644 --- a/README-alt.md +++ b/README-alt.md @@ -23,8 +23,12 @@ bwa mem hs38a.fa read1.fq read2.fq \ | bwa-hs38-bundle/k8-linux bwa-postalt.js hs38a.fa.alt \ | samtools view -bS - > aln.unsrt.bam ``` -For short reads, the postprocessing script `bwa-postalt.js` runs at about the -same speed as BAM compression. + +In the final alignment, a read may be placed on the [primary assembly][grcdef] +and multiple overlapping ALT contigs at the same time (on multiple SAM lines). +Mapping quality (mapQ) is properly adjusted by the postprocessing script +`bwa-postalt.js` using the ALT-to-reference alignment `hs38a.fa.alt`. For +details, see the [Methods section](#methods). #### Option 2: Mapping to the collection of GRCh38, decoy and HLA genes @@ -40,14 +44,20 @@ bwa mem hs38d4.fa read1.fq read2.fq \ | bwa-hs38-bundle/k8-linux bwa-postalt.js -p postinfo hs38d4.fa.alt \ | samtools view -bS - > aln.unsrt.bam ``` -This command line generates `postinfo.ctw` which loosely evaluates the presence -of an ALT contig with an empirical score at the last column. +The benefit of this option is to have a more complete reference sequence and +to facilitate HLA typing with a 3rd-party tool (see below). ***If you are not interested in the way BWA-MEM performs ALT mapping, you can skip the rest of this documentation.*** ## Background +GRCh38 consists of several components: chromosomal assembly, unlocalized contigs +(chromosome known but location unknown), unplaced contigs (chromosome unknown) +and ALT contigs (long clustered variations). The combination of the first three +components is called the *primary assembly*. You can find the more exact +definitions from the [GRC website][grcdef]. + GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp genomic regions. However, sequences that are highly diverged from the primary assembly only contribute a few million bp. Most subsequences of ALT contigs are highly similar @@ -113,7 +123,7 @@ pow(4,s_i)}` is the posterior of c_k given a read r mapped to it with a Smith-Waterman score s_k. This weight is reported in `postinfo.ctw` in the option 2 above. -### On the Completeness of GRCh38+ALT +### On the completeness of GRCh38+ALT While GRCh38 is much more complete than GRCh37, it is still missing some true human sequences. To make sure every piece of sequence in the reference assembly @@ -134,6 +144,27 @@ In addition to decoy, we also put multiple alleles of HLA genes in version 3.18.0. Script `bwa-postalt.js` also helps to genotype HLA genes, though not to high resolution for now. +### More on HLA typing + +It is [well known][hlalink] that HLA genes are associated with many autoimmune +diseases as well as some others not directly related to the immune system. +However, many HLA alleles are highly diverged from the reference genome. If we +map whole-genome shotgun (WGS) reads to the reference only, many +allele-informative will get lost. As a result, the vast majority of WGS projects +have ignored these important genes. + +We recommend to include the genomic regions of classical HLA genes in the BWA +index. This way we will be able to get a more complete collection of reads +mapped to HLA. We can then isolate these reads with little computational cost +and type HLA genes with another program, such as [Dilthey et al (2014)][hla1] or +one from [this list][hlatools]. + +If the postprocessing script `bwa-postalt.js` is invoked with `-p prefix`, it +will also write the top three alleles to file `prefix.hla`. However, as most HLA +alleles from IMGT/HLA don't have intronic sequences and thus are not included in +the reference genome, we are unable to type HLA genes to high resolution with +the BWA-MEM mapping alone. A dedicated tool is recommended for accurate typing. + ### Evaluating ALT Mapping (Coming soon...) @@ -159,3 +190,7 @@ can even get rid of ALT contigs for good. [blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome [sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/ [hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/ +[grcdef]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml +[hla1]: http://biorxiv.org/content/early/2014/07/08/006973 +[hlalink]: http://www.hladiseaseassociations.com +[hlatools]: https://www.biostars.org/p/93245/ diff --git a/bwa-postalt.js b/bwa-postalt.js index a02b3e0..8c4218e 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -203,6 +203,44 @@ function parse_hit(s, opt) return h; } +function type_hla(w) +{ + var hla = ["A", "B", "C", "DQA1", "DQB1", "DRB1"]; + var hla_hash = {}, a = [], r = []; + for (var i = 0; i < hla.length; ++i) { + hla_hash[hla[i]] = i; + a[i] = []; + } + for (var i = 0; i < w.length; ++i) { + var t = w[i][0].split(/[:\*]/); + var x = hla_hash[t[0]]; + if (x != null) + a[x].push([w[i][0], t[1], t[1] + ':' + t[2], w[i][1]]); + } + for (var k = 1; k <= 2; ++k) { + for (var i = 0; i < hla.length; ++i) { + var ai = a[i], m = {}; + for (var j = 0; j < ai.length; ++j) { + var key = ai[j][k], val = ai[j][3]; + if (m[key] == null) m[key] = [-1, -1.0]; + if (m[key][1] < val) m[key] = [j, val]; + } + var sum = 0; + for (var x in m) sum += m[x][1]; + var max = -1, max2 = -1, max3 = -1, max_x, max_x2, max_x3; + for (var x in m) { + if (max < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = max, max_x2 = max_x, max = m[x][1], max_x = x; + else if (max2 < m[x][1]) max3 = max2, max_x3 = max_x2, max2 = m[x][1], max_x2 = x; + else if (max3 < m[x][1]) max3 = m[x][1], max_x3 = x; + } + r.push([hla[i], k, hla[i]+'*'+max_x, max.toFixed(3), hla[i]+'*'+max_x2, max2.toFixed(3), hla[i]+'*'+max_x3, max3.toFixed(3)]); + } + } + for (var i = 0; i < r.length; ++i) + print(r[i].join("\t")); + return r; +} + function bwa_postalt(args) { var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false, update_mapq:true, min_mapq:10, min_sc:90, max_nm_sc:10, show_ev:false, min_pa_ratio:1 }; @@ -574,11 +612,13 @@ function bwa_postalt(args) if (opt.pre != null) { var fpout = new File(opt.pre + '.ctw', "w"); var weight_arr = []; + var weight_hla = []; for (var ctg in weight_alt) { var w = weight_alt[ctg]; weight_arr.push([ctg, w[6], w[7], w[8], w[0], w[1].toFixed(3), w[2].toFixed(3), w[1] > 0? (w[2]/w[1]).toFixed(3) : '0.000', w[3], w[4].toFixed(3), w[5].toFixed(3), w[4] > 0? (w[5]/w[4]).toFixed(3) : '0.000']); + weight_hla.push([ctg, w[4] > 0? w[5]/w[4] : 0]); } weight_arr.sort(function(a,b) { return a[1] < b[1]? -1 : a[1] > b[1]? 1 : a[2] != b[2]? a[2] - b[2] : a[0] < b[0]? -1 : a[0] > b[0]? 1 : 0; @@ -588,6 +628,12 @@ function bwa_postalt(args) fpout.write(weight_arr[i].join("\t") + '\n'); } fpout.close(); + + var r = type_hla(weight_hla); + fpout = new File(opt.pre + '.hla', "w"); + for (var i = 0; i < r.length; ++i) + fpout.write(r[i].join("\t") + '\n'); + fpout.close(); } }