dev-r462: refined setting for PacBio; weight flt

The recommended setting in the last commit is wrong. If we can extend a random
seed hit to the full length, we will force the read aligned through break
points, which is wrong. The new setting is better but it may lead to a small
fraction of fragmented alignments.

In addition, I added a filter on the minimum chain weight and tied
min_HSP_score to this filter. It doubles the mapping speed.
This commit is contained in:
Heng Li 2014-04-04 17:01:04 -04:00
parent 41f720dfa7
commit 114901b005
4 changed files with 23 additions and 11 deletions

View File

@ -69,7 +69,7 @@ mem_opt_t *mem_opt_init()
o->n_threads = 1;
o->max_matesw = 100;
o->mask_level_redun = 0.95;
o->min_HSP_score = 0;
o->min_chain_weight = 0;
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
// o->mapQ_coef_len = o->mapQ_coef_fac = 0;
bwa_fill_scmat(o->a, o->b, o->mat);
@ -357,6 +357,15 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
flt_aux_t *a;
int i, j, n;
if (n_chn <= 1) return n_chn; // no need to filter
for (i = j = 0; i < n_chn; ++i) {
mem_chain_t *c = &chains[i];
int w;
w = mem_chain_weight(c);
if (w >= opt->min_chain_weight)
chains[j++] = *c;
}
n_chn = j;
if (n_chn == 0) return 0;
a = malloc(sizeof(flt_aux_t) * n_chn);
for (i = 0; i < n_chn; ++i) {
mem_chain_t *c = &chains[i];
@ -526,10 +535,13 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i
#define MEM_SHORT_EXT 50
#define MEM_SHORT_LEN 200
#define MEM_HSP_COEF 1.5
#define MAX_BAND_TRY 2
/* mem_test_chain_sw() uses SSE2-SW to align a short chain with 50bp added to
* each end of the chain. If the SW score is below opt->min_HSP_score, it will
* each end of the chain. If the SW score is below min_HSP_score, it will
* return 0, informing the caller to discard the chain. This heuristic is
* somewhat similar to BLAST which drops a seed hit if ungapped extension is
* below a certain score (true for old BLAST; don't know how BLAST+ works).
@ -547,6 +559,7 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i
int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c)
{
int i, qb, qe;
int min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499);
int64_t rb, re, rlen;
uint8_t *rseq = 0;
kswr_t x;
@ -579,7 +592,7 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i
assert(rlen == re - rb);
x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0);
free(rseq);
if (x.score >= opt->min_HSP_score) return 1;
if (x.score >= min_HSP_score) return 1;
if (bwa_verbose >= 4) printf("** give up the chain due to small HSP score %d.\n", x.score);
return 0;
}
@ -1014,7 +1027,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
mem_chain_t *p = &chn.a[i];
int ret;
if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i);
if (opt->min_HSP_score > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p);
if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p);
else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
free(chn.a[i].seeds);

View File

@ -30,13 +30,13 @@ typedef struct {
int T; // output score threshold; only affecting output
int flag; // see MEM_F_* macros
int min_seed_len; // minimum seed length
int min_chain_weight;
float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
int split_width; // split into a seed if its occurence is smaller than this value
int max_occ; // skip a seed if its occurence is larger than this value
int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
int n_threads; // number of threads
int chunk_size; // process chunk_size-bp sequences in a batch
int min_HSP_score; // used in mem_test_chain(); disabled by default
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
float mask_level_redun;

View File

@ -37,7 +37,7 @@ int main_mem(int argc, char *argv[])
opt = mem_opt_init();
opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1;
opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1;
while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:u:")) >= 0) {
while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'w') opt->w = atoi(optarg);
else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
@ -56,10 +56,10 @@ int main_mem(int argc, char *argv[])
else if (c == 'v') bwa_verbose = atoi(optarg);
else if (c == 'r') opt->split_factor = atof(optarg);
else if (c == 'D') opt->chain_drop_ratio = atof(optarg);
else if (c == 'u') opt->min_HSP_score = atoi(optarg);
else if (c == 'm') opt->max_matesw = atoi(optarg);
else if (c == 's') opt->split_width = atoi(optarg);
else if (c == 'N') opt->max_chain_gap = atoi(optarg);
else if (c == 'W') opt->min_chain_weight = atoi(optarg);
else if (c == 'C') copy_comment = 1;
else if (c == 'Q') {
opt->mapQ_coef_len = atoi(optarg);
@ -102,7 +102,7 @@ int main_mem(int argc, char *argv[])
else return 1;
}
if (opt->n_threads < 1) opt->n_threads = 1;
if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score;
// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF
if (optind + 1 >= argc || optind + 3 < argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
@ -125,7 +125,6 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
fprintf(stderr, " -u INT drop a chain if local SW score below INT; 0 to disable [%d]\n", opt->min_HSP_score);
fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
@ -141,7 +140,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " FR orientation only. [inferred]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
fprintf(stderr, " `-k18 -u200 -w200 -c1000 -r10 -A3 -O3 -E1' is recommended for PacBio reads as of early 2014.\n");
fprintf(stderr, " `-k17 -W40 -w200 -c1000 -r10 -A2 -O2 -E1' is recommended for PacBio reads as of early 2014.\n");
fprintf(stderr, "\n");
free(opt);
return 1;

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.8+dev-r461"
#define PACKAGE_VERSION "0.7.8+dev-r462"
#endif
int bwa_fa2pac(int argc, char *argv[]);