From 825ae92e58527c8eff4523f1cb970dd08459a3dd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 17 Sep 2014 13:05:35 -0400 Subject: [PATCH] r849: the pa tag now gives a number ... which is the ratio of this hit to the best ALT hit. --- NEWS.md | 6 ++++-- bwamem.c | 7 ++++--- bwamem_extra.c | 8 +------- main.c | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index cc7946c..095b2de 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,8 +12,10 @@ having ALT contigs almost has not effect on alignments to the primary assembly (seeding may be affected in rare corner cases). At the same time, users may get a primary alignment to ALT contigs (no 0x800 flag) if there are no good hits to the primary assembly, or get a supplementary alignment to ALT contigs -if it is better than hits to the primary assembly. Since this release, it is -recommended to include ALT contigs. +if it is better than hits to the primary assembly. In the latter case, the +`pa:f` tag shows the ratio of the primary hit score to the best ALT hit score, +which is no greater than 1. Since this release, it is recommended to include +ALT contigs. Users may consider to use ALT contigs from GRCh38. I am also constructing a non-redundant and more complete set of sequences missing from GRCh38. diff --git a/bwamem.c b/bwamem.c index 1699e97..533565b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -880,13 +880,13 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq for (i = 0; i < n; ++i) if (i != which && !(list[i].flag&0x100)) break; if (i < n) { // there are other primary hits; output them - int has_pri_alt = 0; + int pri_alt_sc = -1; kputsn("\tSA:Z:", 6, str); for (i = 0; i < n; ++i) { const mem_aln_t *r = &list[i]; int k; if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit - if (list[i].is_alt) has_pri_alt = 1; + if (list[i].is_alt) pri_alt_sc = pri_alt_sc > r->score? pri_alt_sc : r->score; kputs(bns->anns[r->rid].name, str); kputc(',', str); kputl(r->pos+1, str); kputc(',', str); kputc("+-"[r->is_rev], str); kputc(',', str); @@ -897,7 +897,8 @@ void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq kputc(',', str); kputw(r->NM, str); kputc(';', str); } - if (has_pri_alt) kputsn("\tpa:A:Y", 7, str); + if (pri_alt_sc > 0) + ksprintf(str, "\tpa:f:%.3f", (double)p->score / pri_alt_sc); } } if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); } diff --git a/bwamem_extra.c b/bwamem_extra.c index 45289ea..9e9122d 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -59,19 +59,13 @@ void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv) const bwtintv_v *smem_next(smem_i *itr) { - int i, max, max_i, ori_start; + int ori_start; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; if (itr->start >= itr->len || itr->start < 0) return 0; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; ori_start = itr->start; itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM - if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here - for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match - bwtintv_t *p = &itr->matches->a[i]; - int len = (uint32_t)p->info - (p->info>>32); - if (max < len) max = len, max_i = i; - } return itr->matches; } diff --git a/main.c b/main.c index ca71750..123f0d4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.10-r848-dirty" +#define PACKAGE_VERSION "0.7.10-r849-dirty" #endif int bwa_fa2pac(int argc, char *argv[]);