Merge branch 'dev'

Conflicts:
	main.c
This commit is contained in:
Heng Li 2014-09-17 14:22:20 -04:00
commit d677a4325b
6 changed files with 168 additions and 78 deletions

View File

@ -12,8 +12,10 @@ having ALT contigs almost has not effect on alignments to the primary assembly
(seeding may be affected in rare corner cases). At the same time, users may
get a primary alignment to ALT contigs (no 0x800 flag) if there are no good
hits to the primary assembly, or get a supplementary alignment to ALT contigs
if it is better than hits to the primary assembly. Since this release, it is
recommended to include ALT contigs.
if it is better than hits to the primary assembly. In the latter case, the
`pa:f` tag shows the ratio of the primary hit score to the best ALT hit score,
which is no greater than 1. Since this release, it is recommended to include
ALT contigs.
Users may consider to use ALT contigs from GRCh38. I am also constructing a
non-redundant and more complete set of sequences missing from GRCh38.

View File

@ -103,6 +103,8 @@ Bytes.prototype.revcomp = function()
this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
}
var re_cigar = /(\d+)([MIDSHN])/g;
/************************
*** command markovlp ***
************************/
@ -432,80 +434,65 @@ function intv_ovlp(intv, bits) // interval index
}
}
function bwa_genalt(args)
function cigar2pos(cigar, pos) // given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF
{
var re_cigar = /(\d+)([MIDSHN])/g;
function cigar2pos(cigar, pos) // given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF
{
var x = 0, y = 0;
for (var i = 0; i < cigar.length; ++i) {
var op = cigar[i][0], len = cigar[i][1];
if (op == 'M') {
if (y <= pos && pos < y + len)
return x + (pos - y);
x += len, y += len;
} else if (op == 'D') {
x += len;
} else if (op == 'I') {
if (y <= pos && pos < y + len)
return x;
y += len;
} else if (op == 'S' || op == 'H') {
if (y <= pos && pos < y + len)
return -1;
y += len;
}
var x = 0, y = 0;
for (var i = 0; i < cigar.length; ++i) {
var op = cigar[i][0], len = cigar[i][1];
if (op == 'M') {
if (y <= pos && pos < y + len)
return x + (pos - y);
x += len, y += len;
} else if (op == 'D') {
x += len;
} else if (op == 'I') {
if (y <= pos && pos < y + len)
return x;
y += len;
} else if (op == 'S' || op == 'H') {
if (y <= pos && pos < y + len)
return -1;
y += len;
}
return -1;
}
return -1;
}
function parse_hit(s, opt) // parse a hit. s looks something like ["chr1", "+12345", "100M", 5]
{
var h = {};
h.ctg = s[0];
h.start = parseInt(s[1].substr(1)) - 1;
h.rev = (s[1].charAt(0) == '-');
h.cigar = s[2];
h.NM = parseInt(s[3]);
h.hard = false;
var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip;
l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0;
while ((m = re_cigar.exec(h.cigar)) != null) {
var l = parseInt(m[1]);
if (m[2] == 'M') l_match += l;
else if (m[2] == 'D') ++n_del, l_del += l;
else if (m[2] == 'I') ++n_ins, l_ins += l;
else if (m[2] == 'N') l_skip += l;
else if (m[2] == 'H' || m[2] == 'S') {
l_clip += l;
if (m[2] == 'H') h.hard = true;
}
function parse_hit(s, opt) // parse a hit. s looks something like ["chr1", "+12345", "100M", 5]
{
var h = {};
h.ctg = s[0];
h.start = parseInt(s[1].substr(1)) - 1;
h.rev = (s[1].charAt(0) == '-');
h.cigar = s[2];
h.NM = parseInt(s[3]);
h.hard = false;
var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip;
l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0;
while ((m = re_cigar.exec(h.cigar)) != null) {
var l = parseInt(m[1]);
if (m[2] == 'M') l_match += l;
else if (m[2] == 'D') ++n_del, l_del += l;
else if (m[2] == 'I') ++n_ins, l_ins += l;
else if (m[2] == 'N') l_skip += l;
else if (m[2] == 'H' || m[2] == 'S') {
l_clip += l;
if (m[2] == 'H') h.hard = true;
}
h.end = h.start + l_match + l_del + l_skip;
h.NM = h.NM > l_del + l_ins? h.NM : l_del + l_ins;
h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499);
h.l_query = l_match + l_ins + l_clip;
return h;
}
h.end = h.start + l_match + l_del + l_skip;
h.NM = h.NM > l_del + l_ins? h.NM : l_del + l_ins;
h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499);
h.l_query = l_match + l_ins + l_clip;
return h;
}
var c, opt = { a:1, b:4, o:6, e:1, verbose:3 };
while ((c = getopt(args, 'v:')) != null) {
if (c == 'v') opt.verbose = parseInt(getopt.arg);
}
if (args.length == getopt.ind) {
print("Usage: k8 bwa-helper.js genalt <alt.sam> [aln.sam]");
exit(1);
}
var file, buf = new Bytes();
var aux = new Bytes(); // used for reverse and reverse complement
// read the ALT-to-REF alignment and generate the index
// read the ALT-to-REF alignment and generate the index
function read_ALT_sam(fn)
{
var intv = {};
file = new File(args[getopt.ind]);
var file = new File(fn);
var buf = new Bytes();
while (file.readline(buf) >= 0) {
var line = buf.toString();
var t = line.split("\t");
@ -524,11 +511,31 @@ function bwa_genalt(args)
intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]);
//print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar);
}
buf.destroy();
file.close();
// create the interval index
var idx = {};
for (var ctg in intv)
idx[ctg] = intv_ovlp(intv[ctg]);
return idx;
}
function bwa_genalt(args)
{
var c, opt = { a:1, b:4, o:6, e:1, verbose:3 };
while ((c = getopt(args, 'v:')) != null) {
if (c == 'v') opt.verbose = parseInt(getopt.arg);
}
if (args.length == getopt.ind) {
print("Usage: k8 bwa-helper.js genalt <alt.sam> [aln.sam]");
exit(1);
}
var file, buf = new Bytes();
var aux = new Bytes(); // used for reverse and reverse complement
var idx = read_ALT_sam(args[getopt.ind]);
// process SAM
file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
@ -682,6 +689,86 @@ function bwa_genalt(args)
buf.destroy();
}
// This is in effect a simplified version of bwa_genalt().
function bwa_postalt(args)
{
var c, idx = {}, opt = { verbose:3, min_pa:0.8 };
while ((c = getopt(args, 'v:r:a:')) != null) {
if (c == 'v') opt.verbose = parseInt(getopt.arg);
else if (c == 'r') opt.min_pa = parseFloat(getopt.arg);
else if (c == 'a') idx = read_ALT_sam(getopt.arg);
}
if (args.length == getopt.ind) {
print("Usage: k8 bwa-helper.js postalt [-r "+opt.min_pa+"] [-a alt.sam] [aln.sam]");
exit(1);
}
// process SAM
var file = args.length - getopt.ind >= 1? new File(args[getopt.ind]) : new File();
var buf = new Bytes();
while (file.readline(buf) >= 0) {
var m, line = buf.toString();
if (line.charAt(0) == '@') {
print(line);
continue;
}
var t = line.split("\t");
var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
var flag = parseInt(t[1]);
var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
// lift mapping positions to coordinates on the primary assembly
{
var a = null;
if (idx[h.ctg] != null)
a = idx[h.ctg](h.start, h.end);
if (a == null) a = [];
// find the approximate position on the primary assembly
var lifted = [];
for (var j = 0; j < a.length; ++j) {
var s, e;
if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly
s = cigar2pos(a[j][6], h.start);
e = cigar2pos(a[j][6], h.end - 1) + 1;
} else {
s = cigar2pos(a[j][6], a[j][2] - h.end);
e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1;
}
if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment
s += a[j][5]; e += a[j][5];
lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
}
h.lifted = lifted;
}
// update mapQ if necessary
var ori_mapQ = null;
if ((m = /\tpa:f:(\d+\.\d+)/.exec(line)) != null) {
if (parseFloat(m[1]) < opt.min_pa)
ori_mapQ = t[4], t[4] = 0;
}
// generate lifted_str
if (h.lifted && h.lifted.length) {
var lifted = h.lifted;
var u = '';
for (var j = 0; j < lifted.length; ++j)
u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";";
h.lifted_str = u;
} else h.lifted_str = null;
// print
if (ori_mapQ != null) t.push("om:i:"+ori_mapQ);
if (h.lifted_str) t.push("lt:Z:" + h.lifted_str);
print(t.join("\t"));
}
buf.destroy();
file.close();
}
/*********************
*** Main function ***
*********************/
@ -690,7 +777,8 @@ function main(args)
{
if (args.length == 0) {
print("\nUsage: k8 bwa-helper.js <command> [arguments]\n");
print("Commands: genalt generate ALT alignments");
print("Commands: postalt post process ALT-aware alignments");
print(" genalt generate ALT alignments for ALT-unaware alignments");
print(" sam2pas convert SAM to pairwise alignment summary format (PAS)");
print(" pas2reg extract covered regions");
print(" reg2cut regions to extract for the 2nd round bwa-mem");
@ -708,6 +796,7 @@ function main(args)
else if (cmd == 'pas2reg') bwa_pas2reg(args);
else if (cmd == 'reg2cut') bwa_reg2cut(args);
else if (cmd == 'genalt') bwa_genalt(args);
else if (cmd == 'postalt') bwa_postalt(args);
else if (cmd == 'shortname') bwa_shortname(args);
else warn("Unrecognized command");
}

View File

@ -880,11 +880,13 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
for (i = 0; i < n; ++i)
if (i != which && !(list[i].flag&0x100)) break;
if (i < n) { // there are other primary hits; output them
int pri_alt_sc = -1;
kputsn("\tSA:Z:", 6, str);
for (i = 0; i < n; ++i) {
const mem_aln_t *r = &list[i];
int k;
if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
if (list[i].is_alt) pri_alt_sc = pri_alt_sc > r->score? pri_alt_sc : r->score;
kputs(bns->anns[r->rid].name, str); kputc(',', str);
kputl(r->pos+1, str); kputc(',', str);
kputc("+-"[r->is_rev], str); kputc(',', str);
@ -895,6 +897,8 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq
kputc(',', str); kputw(r->NM, str);
kputc(';', str);
}
if (pri_alt_sc > 0)
ksprintf(str, "\tpa:f:%.3f", (double)p->score / pri_alt_sc);
}
}
if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
@ -1094,6 +1098,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
assert(a.rid == ar->rid);
a.pos = pos - bns->anns[a.rid].offset;
a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
a.is_alt = ar->is_alt;
free(query);
return a;
}

View File

@ -85,7 +85,7 @@ typedef struct { // This struct is only used for the convenience of API.
int64_t pos; // forward strand 5'-end mapping position
int rid; // reference sequence index in bntseq_t; <0 for unmapped
int flag; // extra flag
uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
int n_cigar; // number of CIGAR operations
uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
char *XA; // alternative mappings

View File

@ -59,19 +59,13 @@ void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
const bwtintv_v *smem_next(smem_i *itr)
{
int i, max, max_i, ori_start;
int ori_start;
itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
if (itr->start >= itr->len || itr->start < 0) return 0;
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
if (itr->start == itr->len) return 0;
ori_start = itr->start;
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
bwtintv_t *p = &itr->matches->a[i];
int len = (uint32_t)p->info - (p->info>>32);
if (max < len) max = len, max_i = i;
}
return itr->matches;
}

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.10-r846-dirty"
#define PACKAGE_VERSION "0.7.10-r849-dirty"
#endif
int bwa_fa2pac(int argc, char *argv[]);