From a03d01f944d2fd0ec5d967cac41065bd84db11d3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 30 Sep 2014 13:50:51 -0400 Subject: [PATCH] r878: XA is given to the best alignment Non-ALT hits may get ALT hits in the XA tag. This will simplify haplotype assignment. --- bwamem.c | 22 +++++++++++----------- bwamem.h | 2 +- bwamem_extra.c | 32 ++++++++++++++++---------------- main.c | 2 +- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6bbbc6e..7c54b25 100644 --- a/bwamem.c +++ b/bwamem.c @@ -512,38 +512,38 @@ static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t * int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) { - int i, j, n_pri; + int i, n_pri; int_v z = {0,0,0}; if (n == 0) return 0; for (i = n_pri = 0; i < n; ++i) { - a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_alt = -1, a[i].hash = hash_64(id+i); + a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i); if (!a[i].is_alt) ++n_pri; } ks_introsort(mem_ars_hash, n, a); mem_mark_primary_se_core(opt, n, a, &z); for (i = 0; i < n; ++i) { mem_alnreg_t *p = &a[i]; - p->secondary_alt = i; // keep the rank in the first round + p->secondary_all = i; // keep the rank in the first round if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt) p->alt_sc = a[p->secondary].score; } if (n_pri >= 0 && n_pri < n) { kv_resize(int, z, n); if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a); - for (i = 0; i < n; ++i) z.a[a[i].secondary_alt] = i; + for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i; for (i = 0; i < n; ++i) { - if (a[i].secondary < 0) { - a[i].secondary_alt = -1; - continue; - } - j = z.a[a[i].secondary]; - a[i].secondary_alt = a[j].is_alt? j : -1; - if (a[i].is_alt) a[i].secondary = INT_MAX; + if (a[i].secondary >= 0) { + a[i].secondary_all = z.a[a[i].secondary]; + if (a[i].is_alt) a[i].secondary = INT_MAX; + } else a[i].secondary_all = -1; } if (n_pri > 0) { // mark primary for hits to the primary assembly only for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1; mem_mark_primary_se_core(opt, n_pri, a, &z); } + } else { + for (i = 0; i < n; ++i) + a[i].secondary_all = a[i].secondary; } free(z.a); return n_pri; diff --git a/bwamem.h b/bwamem.h index 6079e5a..ae53601 100644 --- a/bwamem.h +++ b/bwamem.h @@ -69,7 +69,7 @@ typedef struct { int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary - int secondary_alt; + int secondary_all; int seedlen0; // length of the starting seed int n_comp:30, is_alt:2; // number of sub-alignments chained together float frac_rep; diff --git a/bwamem_extra.c b/bwamem_extra.c index eea5ac7..09faaa2 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -112,34 +112,35 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, s->sam = str.s; } -static inline void get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i, int r[2]) +static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i) { - int j = a[i].secondary, k = a[i].secondary_alt; - r[0] = r[1] = -1; - if (j >= 0 && j < INT_MAX && !a[j].is_alt && !a[i].is_alt && a[i].score >= a[j].score * XA_drop_ratio) r[0] = j; - if (k >= 0 && a[k].is_alt && a[i].score >= a[k].score * XA_drop_ratio) r[1] = k; + int k = a[i].secondary_all; + if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k; + return -1; } // Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup. char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se() { - int i, k, *cnt, tot, r[2]; + int i, k, r, *cnt, tot; kstring_t *aln = 0, str = {0,0,0}; - char **XA = 0; + char **XA = 0, *has_alt; cnt = calloc(a->n, sizeof(int)); + has_alt = calloc(a->n, 1); for (i = 0, tot = 0; i < a->n; ++i) { - get_pri_idx(opt->XA_drop_ratio, a->a, i, r); - if (r[0] >= 0) ++cnt[r[0]], ++tot; - if (r[1] >= 0) ++cnt[r[1]], ++tot; + r = get_pri_idx(opt->XA_drop_ratio, a->a, i); + if (r >= 0) { + ++cnt[r], ++tot; + if (a->a[i].is_alt) has_alt[r] = 1; + } } if (tot == 0) goto end_gen_alt; aln = calloc(a->n, sizeof(kstring_t)); for (i = 0; i < a->n; ++i) { mem_aln_t t; - get_pri_idx(opt->XA_drop_ratio, a->a, i, r); - if (r[0] < 0 && r[1] < 0) continue; - if ((r[0] >= 0 && cnt[r[0]] > opt->max_XA_hits) || (r[1] >= 0 && cnt[r[1]] > opt->max_XA_hits_alt)) continue; + if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue; + if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue; t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]); str.l = 0; kputs(bns->anns[t.rid].name, &str); @@ -152,14 +153,13 @@ char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac kputc(',', &str); kputw(t.NM, &str); kputc(';', &str); free(t.cigar); - if (r[0] >= 0 && cnt[r[0]] <= opt->max_XA_hits) kputsn(str.s, str.l, &aln[r[0]]); - if (r[1] >= 0 && cnt[r[1]] <= opt->max_XA_hits_alt) kputsn(str.s, str.l, &aln[r[1]]); + kputsn(str.s, str.l, &aln[r]); } XA = calloc(a->n, sizeof(char*)); for (k = 0; k < a->n; ++k) XA[k] = aln[k].s; end_gen_alt: - free(cnt); free(aln); free(str.s); + free(has_alt); free(cnt); free(aln); free(str.s); return XA; } diff --git a/main.c b/main.c index 4b39d6e..b162ab0 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r876-dirty" +#define PACKAGE_VERSION "0.7.10-r878-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);