diff --git a/bwa-helper.js b/bwa-helper.js index bf3fd00..7e938fa 100644 --- a/bwa-helper.js +++ b/bwa-helper.js @@ -522,14 +522,15 @@ function read_ALT_sam(fn) function bwa_altgen(args) { - var c, opt = { a:1, b:4, o:6, e:1, verbose:3 }; + var c, opt = { a:1, b:4, o:6, e:1, verbose:3, show_pri:false }; - while ((c = getopt(args, 'v:')) != null) { + while ((c = getopt(args, 'pv:')) != null) { if (c == 'v') opt.verbose = parseInt(getopt.arg); + else if (c == 'p') opt.show_pri = true; } if (args.length == getopt.ind) { - print("Usage: k8 bwa-helper.js altgen [aln.sam]"); + print("Usage: k8 bwa-helper.js altgen [-p] [aln.sam]"); exit(1); } @@ -671,6 +672,7 @@ function bwa_altgen(args) for (var i = 0; i < hits.length; ++i) { if (opt.verbose >= 5) print(obj2str(hits[i])); if (hits[i].g != reported_g || i == reported_i) continue; + if (!opt.show_pri && idx[hits[i].ctg] == null) continue; var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0]; // update name if (flag&0x40) s[0] += "/1"; @@ -689,71 +691,6 @@ function bwa_altgen(args) buf.destroy(); } -// This is in effect a simplified version of bwa_genalt(). -function bwa_altlift(args) -{ - var opt = { a:1, b:4, o:6, e:1 }; - if (args.length == 0) { - print("Usage: k8 bwa-helper.js altlift [aln.sam]"); - exit(1); - } - var idx = read_ALT_sam(args[0]); - - // process SAM - var file = args.length >= 2? new File(args[1]) : new File(); - var buf = new Bytes(); - while (file.readline(buf) >= 0) { - var m, line = buf.toString(); - if (line.charAt(0) == '@') { - print(line); - continue; - } - - var t = line.split("\t"); - var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; - var flag = parseInt(t[1]); - var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt); - - // lift mapping positions to coordinates on the primary assembly - var a = null; - if (idx[h.ctg] != null) - a = idx[h.ctg](h.start, h.end); - if (a == null) a = []; - - // find the approximate position on the primary assembly - var lifted = []; - for (var j = 0; j < a.length; ++j) { - var s, e; - if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly - s = cigar2pos(a[j][6], h.start); - e = cigar2pos(a[j][6], h.end - 1) + 1; - } else { - s = cigar2pos(a[j][6], a[j][2] - h.end); - e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1; - } - if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment - s += a[j][5]; e += a[j][5]; - lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]); - } - h.lifted = lifted; - - // generate lifted_str - if (h.lifted && h.lifted.length) { - var lifted = h.lifted; - var u = ''; - for (var j = 0; j < lifted.length; ++j) - u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";"; - h.lifted_str = u; - } else h.lifted_str = null; - - // print - if (h.lifted_str) t.push("lt:Z:" + h.lifted_str); - print(t.join("\t")); - } - buf.destroy(); - file.close(); -} - /********************* *** Main function *** *********************/ @@ -762,8 +699,7 @@ function main(args) { if (args.length == 0) { print("\nUsage: k8 bwa-helper.js [arguments]\n"); - print("Commands: altlift add lt tag to show lifted position on the primary assembly"); - print(" altgen generate ALT alignments for ALT-unaware alignments\n"); + print("Commands: altgen generate ALT alignments for ALT-unaware alignments"); print(" sam2pas convert SAM to pairwise alignment summary format (PAS)"); print(" pas2reg extract covered regions"); print(" reg2cut regions to extract for the 2nd round bwa-mem"); @@ -780,8 +716,7 @@ function main(args) else if (cmd == 'markovlp') bwa_markOvlp(args); else if (cmd == 'pas2reg') bwa_pas2reg(args); else if (cmd == 'reg2cut') bwa_reg2cut(args); - else if (cmd == 'altgen' || cmd == 'genalt') bwa_alt(args); - else if (cmd == 'altlift') bwa_altlift(args); + else if (cmd == 'altgen' || cmd == 'genalt') bwa_altgen(args); else if (cmd == 'shortname') bwa_shortname(args); else warn("Unrecognized command"); } diff --git a/bwamem.c b/bwamem.c index 7ba6c57..394aa40 100644 --- a/bwamem.c +++ b/bwamem.c @@ -72,7 +72,8 @@ mem_opt_t *mem_opt_init() o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; - o->max_hits = 5; + o->max_XA_hits = 5; + o->max_XA_hits_alt = 50; o->max_matesw = 50; o->mask_level_redun = 0.95; o->min_chain_weight = 0; @@ -511,26 +512,38 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t * int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) { - int i, n_pri; + int i, j, n_pri; int_v z = {0,0,0}; if (n == 0) return 0; for (i = n_pri = 0; i < n; ++i) { - a[i].sub = 0, a[i].alt_sc = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); + a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_alt = -1, a[i].hash = hash_64(id+i); if (!a[i].is_alt) ++n_pri; } ks_introsort(mem_ars_hash, n, a); mem_mark_primary_se_core(opt, n, a, &z); for (i = 0; i < n; ++i) { mem_alnreg_t *p = &a[i]; - if (p->is_alt && p->secondary >= 0) // don't track the "parent" hit of ALT secondary hits - p->secondary = INT_MAX; + p->secondary_alt = i; // keep the rank in the first round if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt) p->alt_sc = a[p->secondary].score; } - if (n_pri > 0 && n_pri < n) { - ks_introsort(mem_ars_hash2, n, a); - for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; - mem_mark_primary_se_core(opt, n_pri, a, &z); + if (n_pri >= 0 && n_pri < n) { + kv_resize(int, z, n); + if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a); + for (i = 0; i < n; ++i) z.a[a[i].secondary_alt] = i; + for (i = 0; i < n; ++i) { + if (a[i].secondary < 0) { + a[i].secondary_alt = -1; + continue; + } + j = z.a[a[i].secondary]; + a[i].secondary_alt = a[j].is_alt? j : -1; + if (a[i].is_alt) a[i].secondary = INT_MAX; + } + if (n_pri > 0) { // mark primary for hits to the primary assembly only + for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; + mem_mark_primary_se_core(opt, n_pri, a, &z); + } } free(z.a); return n_pri; @@ -817,7 +830,7 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq if (p->n_cigar) { // aligned for (i = 0; i < p->n_cigar; ++i) { int c = p->cigar[i]&0xf; - if (!(opt->flag&MEM_F_SOFTCLIP) && (c == 3 || c == 4)) + if (!(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt && (c == 3 || c == 4)) c = which? 4 : 3; // use hard clipping for supplementary alignments kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str); } @@ -976,7 +989,7 @@ void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if (k && p->secondary < 0) // if supplementary q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; - if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + if (k && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; } if (aa.n == 0) { // no alignments good enough; then write an unaligned record mem_aln_t t; diff --git a/bwamem.h b/bwamem.h index f5f33bb..6079e5a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -52,7 +52,7 @@ typedef struct { int mapQ_coef_fac; int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end - int max_hits; // if there are max_hits or fewer, output them all + int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; @@ -69,6 +69,7 @@ typedef struct { int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary + int secondary_alt; int seedlen0; // length of the starting seed int n_comp:30, is_alt:2; // number of sub-alignments chained together float frac_rep; diff --git a/bwamem_extra.c b/bwamem_extra.c index 9e9122d..eea5ac7 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -112,43 +112,54 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, s->sam = str.s; } +static inline void get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i, int r[2]) +{ + int j = a[i].secondary, k = a[i].secondary_alt; + r[0] = r[1] = -1; + if (j >= 0 && j < INT_MAX && !a[j].is_alt && !a[i].is_alt && a[i].score >= a[j].score * XA_drop_ratio) r[0] = j; + if (k >= 0 && a[k].is_alt && a[i].score >= a[k].score * XA_drop_ratio) r[1] = k; +} + // Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() { - int i, k, *cnt, tot; - kstring_t *aln = 0; + int i, k, *cnt, tot, r[2]; + kstring_t *aln = 0, str = {0,0,0}; char **XA = 0; cnt = calloc(a->n, sizeof(int)); for (i = 0, tot = 0; i < a->n; ++i) { - int j = a->a[i].secondary; - if (j >= 0 && j < INT_MAX && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio) - ++cnt[j], ++tot; + get_pri_idx(opt->XA_drop_ratio, a->a, i, r); + if (r[0] >= 0) ++cnt[r[0]], ++tot; + if (r[1] >= 0) ++cnt[r[1]], ++tot; } if (tot == 0) goto end_gen_alt; aln = calloc(a->n, sizeof(kstring_t)); for (i = 0; i < a->n; ++i) { mem_aln_t t; - int j = a->a[i].secondary; - if (j < 0 || j == INT_MAX || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later - if (cnt[j] > opt->max_hits) continue; + get_pri_idx(opt->XA_drop_ratio, a->a, i, r); + if (r[0] < 0 && r[1] < 0) continue; + if ((r[0] >= 0 && cnt[r[0]] > opt->max_XA_hits) || (r[1] >= 0 && cnt[r[1]] > opt->max_XA_hits_alt)) continue; t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); - kputs(bns->anns[t.rid].name, &aln[j]); - kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]); - kputc(',', &aln[j]); + str.l = 0; + kputs(bns->anns[t.rid].name, &str); + kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str); + kputc(',', &str); for (k = 0; k < t.n_cigar; ++k) { - kputw(t.cigar[k]>>4, &aln[j]); - kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]); + kputw(t.cigar[k]>>4, &str); + kputc("MIDSHN"[t.cigar[k]&0xf], &str); } - kputc(',', &aln[j]); kputw(t.NM, &aln[j]); - kputc(';', &aln[j]); + kputc(',', &str); kputw(t.NM, &str); + kputc(';', &str); free(t.cigar); + if (r[0] >= 0 && cnt[r[0]] <= opt->max_XA_hits) kputsn(str.s, str.l, &aln[r[0]]); + if (r[1] >= 0 && cnt[r[1]] <= opt->max_XA_hits_alt) kputsn(str.s, str.l, &aln[r[1]]); } XA = calloc(a->n, sizeof(char*)); for (k = 0; k < a->n; ++k) XA[k] = aln[k].s; end_gen_alt: - free(cnt); free(aln); + free(cnt); free(aln); free(str.s); return XA; } diff --git a/fastmap.c b/fastmap.c index 8164ff3..10571d9 100644 --- a/fastmap.c +++ b/fastmap.c @@ -79,7 +79,6 @@ int main_mem(int argc, char *argv[]) else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.; else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.; else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1; - else if (c == 'h') opt->max_hits = atoi(optarg), opt0.max_hits = 1; else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1; else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; @@ -88,6 +87,12 @@ int main_mem(int argc, char *argv[]) else if (c == 'g') opt->min_pa_ratio = atof(optarg), opt0.min_pa_ratio = 1; else if (c == 'C') copy_comment = 1; else if (c == 'K') fixed_chunk_size = atoi(optarg); + else if (c == 'h') { + opt0.max_XA_hits = opt0.max_XA_hits_alt = 1; + opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->max_XA_hits_alt = strtol(p+1, &p, 10); + } else if (c == 'Q') { opt0.mapQ_coef_len = 1; opt->mapQ_coef_len = atoi(optarg); @@ -167,7 +172,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); fprintf(stderr, " -g FLOAT set mapQ to zero if the ratio of the primary-to-alt scores below FLOAT [%.3f]\n", opt->min_pa_ratio); fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); - fprintf(stderr, " -h INT if there are 80%% of the max score, output all in XA [%d]\n", opt->max_hits); + fprintf(stderr, " -h INT[,INT] if there are 80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -V output the reference FASTA header in the XR tag\n"); diff --git a/main.c b/main.c index 063ddda..7a398f4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r858-dirty" +#define PACKAGE_VERSION "0.7.10-r868-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);