diff --git a/bwamem.c b/bwamem.c index ae1ce68..5ee153a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -69,7 +69,7 @@ mem_opt_t *mem_opt_init() o->n_threads = 1; o->max_matesw = 100; o->mask_level_redun = 0.95; - o->min_HSP_score = 0; + o->min_chain_weight = 0; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); // o->mapQ_coef_len = o->mapQ_coef_fac = 0; bwa_fill_scmat(o->a, o->b, o->mat); @@ -357,6 +357,15 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) flt_aux_t *a; int i, j, n; if (n_chn <= 1) return n_chn; // no need to filter + for (i = j = 0; i < n_chn; ++i) { + mem_chain_t *c = &chains[i]; + int w; + w = mem_chain_weight(c); + if (w >= opt->min_chain_weight) + chains[j++] = *c; + } + n_chn = j; + if (n_chn == 0) return 0; a = malloc(sizeof(flt_aux_t) * n_chn); for (i = 0; i < n_chn; ++i) { mem_chain_t *c = &chains[i]; @@ -526,10 +535,13 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i #define MEM_SHORT_EXT 50 #define MEM_SHORT_LEN 200 + +#define MEM_HSP_COEF 1.5 + #define MAX_BAND_TRY 2 /* mem_test_chain_sw() uses SSE2-SW to align a short chain with 50bp added to - * each end of the chain. If the SW score is below opt->min_HSP_score, it will + * each end of the chain. If the SW score is below min_HSP_score, it will * return 0, informing the caller to discard the chain. This heuristic is * somewhat similar to BLAST which drops a seed hit if ungapped extension is * below a certain score (true for old BLAST; don't know how BLAST+ works). @@ -547,6 +559,7 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) { int i, qb, qe; + int min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499); int64_t rb, re, rlen; uint8_t *rseq = 0; kswr_t x; @@ -579,7 +592,7 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i assert(rlen == re - rb); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); free(rseq); - if (x.score >= opt->min_HSP_score) return 1; + if (x.score >= min_HSP_score) return 1; if (bwa_verbose >= 4) printf("** give up the chain due to small HSP score %d.\n", x.score); return 0; } @@ -1014,7 +1027,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse mem_chain_t *p = &chn.a[i]; int ret; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - if (opt->min_HSP_score > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p); + if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p); else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); free(chn.a[i].seeds); diff --git a/bwamem.h b/bwamem.h index dd68d16..27514cd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -30,13 +30,13 @@ typedef struct { int T; // output score threshold; only affecting output int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length + int min_chain_weight; float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor int split_width; // split into a seed if its occurence is smaller than this value int max_occ; // skip a seed if its occurence is larger than this value int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed int n_threads; // number of threads int chunk_size; // process chunk_size-bp sequences in a batch - int min_HSP_score; // used in mem_test_chain(); disabled by default float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain float mask_level_redun; diff --git a/fastmap.c b/fastmap.c index 1db00cb..c971f9b 100644 --- a/fastmap.c +++ b/fastmap.c @@ -37,7 +37,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1; opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1; - while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:u:")) >= 0) { + while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; @@ -56,10 +56,10 @@ int main_mem(int argc, char *argv[]) else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'D') opt->chain_drop_ratio = atof(optarg); - else if (c == 'u') opt->min_HSP_score = atoi(optarg); else if (c == 'm') opt->max_matesw = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); else if (c == 'N') opt->max_chain_gap = atoi(optarg); + else if (c == 'W') opt->min_chain_weight = atoi(optarg); else if (c == 'C') copy_comment = 1; else if (c == 'Q') { opt->mapQ_coef_len = atoi(optarg); @@ -102,7 +102,7 @@ int main_mem(int argc, char *argv[]) else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; - if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; +// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); @@ -125,7 +125,6 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); - fprintf(stderr, " -u INT drop a chain if local SW score below INT; 0 to disable [%d]\n", opt->min_HSP_score); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -141,7 +140,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " FR orientation only. [inferred]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); - fprintf(stderr, " `-k18 -u200 -w200 -c1000 -r10 -A3 -O3 -E1' is recommended for PacBio reads as of early 2014.\n"); + fprintf(stderr, " `-k17 -W40 -w200 -c1000 -r10 -A2 -O2 -E1' is recommended for PacBio reads as of early 2014.\n"); fprintf(stderr, "\n"); free(opt); return 1; diff --git a/main.c b/main.c index 9dc0cdf..1089477 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r461" +#define PACKAGE_VERSION "0.7.8+dev-r462" #endif int bwa_fa2pac(int argc, char *argv[]);