From 8d61316cc850896b3dd6a5a20dbec922c20f62d1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 23 Sep 2014 10:33:15 -0400 Subject: [PATCH 1/3] more comments --- bwa-postalt.js | 55 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/bwa-postalt.js b/bwa-postalt.js index 258d25d..5e6b7c7 100644 --- a/bwa-postalt.js +++ b/bwa-postalt.js @@ -13,6 +13,7 @@ *** From k8.js *** ******************/ +// Parse command-line options. A BSD getopt() clone in javascript. var getopt = function(args, ostr) { var oli; // option letter list index if (typeof(getopt.place) == 'undefined') @@ -51,6 +52,7 @@ var getopt = function(args, ostr) { return optopt; } +// print an object in a format similar to JSON. For debugging. function obj2str(o) { if (typeof(o) != 'object') { @@ -75,6 +77,7 @@ function obj2str(o) } } +// reverse a string Bytes.prototype.reverse = function() { for (var i = 0; i < this.length>>1; ++i) { @@ -84,6 +87,7 @@ Bytes.prototype.reverse = function() } } +// reverse complement a DNA string Bytes.prototype.revcomp = function() { if (Bytes.rctab == null) { @@ -103,13 +107,8 @@ Bytes.prototype.revcomp = function() this[this.length>>1] = Bytes.rctab[this[this.length>>1]]; } -var re_cigar = /(\d+)([MIDSHN])/g; - -/****************************** - *** Generate ALT alignment *** - ******************************/ - -function intv_ovlp(intv, bits) // interval index +// create index for a list of intervals for fast interval queries; ported from bedidx.c in samtools +function intv_ovlp(intv, bits) { if (typeof bits == "undefined") bits = 13; intv.sort(function(a,b) {return a[0]-b[0];}); @@ -142,7 +141,14 @@ function intv_ovlp(intv, bits) // interval index } } -function cigar2pos(cigar, pos) // given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF +var re_cigar = /(\d+)([MIDSHN])/g; + +/****************************** + *** Generate ALT alignment *** + ******************************/ + +// given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF +function cigar2pos(cigar, pos) { var x = 0, y = 0; for (var i = 0; i < cigar.length; ++i) { @@ -166,7 +172,9 @@ function cigar2pos(cigar, pos) // given a pos on ALT and the ALT-to-REF CIGAR, f return -1; } -function parse_hit(s, opt) // parse a hit. s looks something like ["chr1", "+12345", "100M", 5] +// Parse a hit. $s is an array that looks something like ["chr1", "+12345", "100M", 5] +// Return an object keeping various information about the alignment. +function parse_hit(s, opt) { var h = {}; h.ctg = s[0]; @@ -221,7 +229,7 @@ function read_ALT_sam(fn) } buf.destroy(); file.close(); - // create the interval index + // create index for intervals on ALT contigs var idx = {}; for (var ctg in intv) idx[ctg] = intv_ovlp(intv[ctg]); @@ -270,20 +278,22 @@ function bwa_postalt(args) var t = line.split("\t"); t[1] = parseInt(t[1]); t[3] = parseInt(t[3]); t[4] = parseInt(t[4]); - if (buf2.length && buf2[0][0] != t[0]) { + + // print bufferred reads + if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) { for (var i = 0; i < buf2.length; ++i) print(buf2[i].join("\t")); buf2 = []; } - if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { // TODO: this does not work with PE file + if ((m = /\tXA:Z:(\S+)/.exec(line)) == null) { buf2.push(t); continue; } - - // parse hits - var hits = []; var XA_strs = m[1].split(";"); + + // parse the reported hit + var hits = []; var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; var flag = t[1]; var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt); @@ -292,11 +302,13 @@ function bwa_postalt(args) continue; } hits.push(h); - for (var i = 0; i < XA_strs.length; ++i) // hits in the XA tag + + // parse hits in the XA tag + for (var i = 0; i < XA_strs.length; ++i) if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string hits.push(parse_hit(XA_strs[i].split(","), opt)); - // lift mapping positions to coordinates on the primary assembly + // lift mapping positions to the primary assembly var n_lifted = 0, n_rpt_lifted = 0; for (var i = 0; i < hits.length; ++i) { var h = hits[i]; @@ -328,7 +340,7 @@ function bwa_postalt(args) continue; } - // group hits + // group hits based on the lifted positions on the primary assembly for (var i = 0; i < hits.length; ++i) { // set keys for sorting if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3]; @@ -352,8 +364,9 @@ function bwa_postalt(args) if (hits[i].g == reported_g) ++n_group0; + // re-estimate mapping quality if necessary var mapQ, ori_mapQ = t[4]; - if (n_group0 > 1) { // then re-estimate mapQ + if (n_group0 > 1) { var group_max = []; for (var i = 0; i < hits.length; ++i) { var g = hits[i].g; @@ -414,11 +427,13 @@ function bwa_postalt(args) aux.set(t[10],0); aux.reverse(); rq = aux.toString(); } - // print + // stage the reported hit t[4] = mapQ; if (n_group0 > 1) t.push("om:i:"+ori_mapQ); if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str); buf2.push(t); + + // stage the hits generated from the XA tag var cnt = 0; for (var i = 0; i < hits.length; ++i) { if (opt.verbose >= 5) print(obj2str(hits[i])); From dae4ca3cedf4617c3bf09ef93d77a78c290c98e4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 26 Sep 2014 15:29:08 -0400 Subject: [PATCH 2/3] r875: invalid SAM output for ALT hits --- bwamem.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 394aa40..6bbbc6e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -858,7 +858,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq kputsn("*\t*", 3, str); } else if (!p->is_rev) { // the forward strand int i, qb = 0, qe = s->l_seq; - if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP)) { // have cigar && not the primary alignment && not softclip all + if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { // have cigar && not the primary alignment && not softclip all if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4; if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4; } @@ -872,7 +872,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq } else kputc('*', str); } else { // the reverse strand int i, qb = 0, qe = s->l_seq; - if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP)) { + if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4; if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4; } diff --git a/main.c b/main.c index 7a398f4..fc7fe78 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r868-dirty" +#define PACKAGE_VERSION "0.7.10-r875-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 0a2cf98293afa3250f7689c4056cf00633ca65d6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 27 Sep 2014 23:21:50 -0400 Subject: [PATCH 3/3] r876: optionally ignore idxbase.alt file --- fastmap.c | 11 ++++++++--- main.c | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fastmap.c b/fastmap.c index 10571d9..43ddfd3 100644 --- a/fastmap.c +++ b/fastmap.c @@ -10,8 +10,8 @@ #include "bwamem.h" #include "kvec.h" #include "utils.h" +#include "bntseq.h" #include "kseq.h" -#include "utils.h" KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; @@ -38,7 +38,7 @@ static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) int main_mem(int argc, char *argv[]) { mem_opt_t *opt, opt0; - int fd, fd2, i, c, n, copy_comment = 0; + int fd, fd2, i, c, n, copy_comment = 0, ignore_alt = 0; int fixed_chunk_size = -1, actual_chunk_size; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; @@ -55,7 +55,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:g:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:g:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -76,6 +76,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); + else if (c == 'j') ignore_alt = 1; else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.; else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.; else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1; @@ -168,6 +169,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, " -j ignore ALT contigs\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); fprintf(stderr, " -g FLOAT set mapQ to zero if the ratio of the primary-to-alt scores below FLOAT [%.3f]\n", opt->min_pa_ratio); @@ -229,6 +231,9 @@ int main_mem(int argc, char *argv[]) bwa_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + if (ignore_alt) + for (i = 0; i < idx->bns->n_seqs; ++i) + idx->bns->anns[i].is_alt = 0; ko = kopen(argv[optind + 1], &fd); if (ko == 0) { diff --git a/main.c b/main.c index fc7fe78..4b39d6e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r875-dirty" +#define PACKAGE_VERSION "0.7.10-r876-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);