From 41f720dfa7def8651a1e740eeafaacf557a43634 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 4 Apr 2014 16:05:41 -0400 Subject: [PATCH] dev-461: added a heuristic for PacBio data See the comment above mem_test_chain_sw() for details. --- bwamem.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++---- bwamem.h | 1 + fastmap.c | 9 ++++++-- main.c | 2 +- 4 files changed, 70 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index ab9f349..ae1ce68 100644 --- a/bwamem.c +++ b/bwamem.c @@ -69,6 +69,7 @@ mem_opt_t *mem_opt_init() o->n_threads = 1; o->max_matesw = 100; o->mask_level_redun = 0.95; + o->min_HSP_score = 0; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); // o->mapQ_coef_len = o->mapQ_coef_fac = 0; bwa_fill_scmat(o->a, o->b, o->mat); @@ -527,6 +528,62 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i #define MEM_SHORT_LEN 200 #define MAX_BAND_TRY 2 +/* mem_test_chain_sw() uses SSE2-SW to align a short chain with 50bp added to + * each end of the chain. If the SW score is below opt->min_HSP_score, it will + * return 0, informing the caller to discard the chain. This heuristic is + * somewhat similar to BLAST which drops a seed hit if ungapped extension is + * below a certain score (true for old BLAST; don't know how BLAST+ works). + * + * For PacBio data, we need to set high matching score and low gap penalties; + * otherwise we are likely to get fragmented alignments. However, with such + * settings, we can often extend most random seed hits to the end. These + * extensions are wasteful and time consuming. By testing the chain with SW, + * we can discard bad chains before performing the expensive extension. + * + * Although probably it is not a bad idea to use this function for + * low-divergence sequences, more testing is needed. For now, I only recommend + * to use mem_test_chain_sw() for PacBio data. It is disabled by default. + */ +int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) +{ + int i, qb, qe; + int64_t rb, re, rlen; + uint8_t *rseq = 0; + kswr_t x; + + if (c->n == 0) return -1; + qb = l_query; qe = 0; + rb = l_pac<<1; re = 0; + for (i = 0; i < c->n; ++i) { + const mem_seed_t *s = &c->seeds[i]; + qb = qb < s->qbeg? qb : s->qbeg; + qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len; + rb = rb < s->rbeg? rb : s->rbeg; + re = re > s->rbeg + s->len? re : s->rbeg + s->len; + } + qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT; + qb = qb > 0? qb : 0; + qe = qe < l_query? qe : l_query; + rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT; + rb = rb > 0? rb : 0; + re = re < l_pac<<1? re : l_pac<<1; + if (rb < l_pac && l_pac < re) { + if (c->seeds[0].rbeg < l_pac) re = l_pac; + else rb = l_pac; + } + if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1; + if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; + if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; + + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + assert(rlen == re - rb); + x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); + free(rseq); + if (x.score >= opt->min_HSP_score) return 1; + if (bwa_verbose >= 4) printf("** give up the chain due to small HSP score %d.\n", x.score); + return 0; +} + int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { int i, qb, qe, xtra; @@ -565,14 +622,13 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); free(rseq); - if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; - a.rb = rb + x.tb; a.re = rb + x.te + 1; a.qb = qb + x.qb; a.qe = qb + x.qe + 1; a.score = x.score; a.csub = x.score2; + if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb); + if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; kv_push(mem_alnreg_t, *av, a); - if (bwa_verbose >= 4) printf("** Added alignment region via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re); return 0; } @@ -958,7 +1014,8 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse mem_chain_t *p = &chn.a[i]; int ret; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); + if (opt->min_HSP_score > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p); + else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); free(chn.a[i].seeds); } diff --git a/bwamem.h b/bwamem.h index 86e7d47..dd68d16 100644 --- a/bwamem.h +++ b/bwamem.h @@ -36,6 +36,7 @@ typedef struct { int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed int n_threads; // number of threads int chunk_size; // process chunk_size-bp sequences in a batch + int min_HSP_score; // used in mem_test_chain(); disabled by default float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain float mask_level_redun; diff --git a/fastmap.c b/fastmap.c index 5aac1f7..1db00cb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -37,7 +37,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1; opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1; - while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:")) >= 0) { + while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:u:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; @@ -56,6 +56,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'D') opt->chain_drop_ratio = atof(optarg); + else if (c == 'u') opt->min_HSP_score = atoi(optarg); else if (c == 'm') opt->max_matesw = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); else if (c == 'N') opt->max_chain_gap = atoi(optarg); @@ -101,6 +102,7 @@ int main_mem(int argc, char *argv[]) else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; + if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); @@ -123,6 +125,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); + fprintf(stderr, " -u INT drop a chain if local SW score below INT; 0 to disable [%d]\n", opt->min_HSP_score); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -136,7 +139,9 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); fprintf(stderr, " FR orientation only. [inferred]\n"); - fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); + fprintf(stderr, " `-k18 -u200 -w200 -c1000 -r10 -A3 -O3 -E1' is recommended for PacBio reads as of early 2014.\n"); fprintf(stderr, "\n"); free(opt); return 1; diff --git a/main.c b/main.c index 16fd48d..9dc0cdf 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r460" +#define PACKAGE_VERSION "0.7.8+dev-r461" #endif int bwa_fa2pac(int argc, char *argv[]);