dev-461: added a heuristic for PacBio data

See the comment above mem_test_chain_sw() for details.
This commit is contained in:
Heng Li 2014-04-04 16:05:41 -04:00
parent 066ec4aa95
commit 41f720dfa7
4 changed files with 70 additions and 7 deletions

View File

@ -69,6 +69,7 @@ mem_opt_t *mem_opt_init()
o->n_threads = 1;
o->max_matesw = 100;
o->mask_level_redun = 0.95;
o->min_HSP_score = 0;
o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
// o->mapQ_coef_len = o->mapQ_coef_fac = 0;
bwa_fill_scmat(o->a, o->b, o->mat);
@ -527,6 +528,62 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i
#define MEM_SHORT_LEN 200
#define MAX_BAND_TRY 2
/* mem_test_chain_sw() uses SSE2-SW to align a short chain with 50bp added to
* each end of the chain. If the SW score is below opt->min_HSP_score, it will
* return 0, informing the caller to discard the chain. This heuristic is
* somewhat similar to BLAST which drops a seed hit if ungapped extension is
* below a certain score (true for old BLAST; don't know how BLAST+ works).
*
* For PacBio data, we need to set high matching score and low gap penalties;
* otherwise we are likely to get fragmented alignments. However, with such
* settings, we can often extend most random seed hits to the end. These
* extensions are wasteful and time consuming. By testing the chain with SW,
* we can discard bad chains before performing the expensive extension.
*
* Although probably it is not a bad idea to use this function for
* low-divergence sequences, more testing is needed. For now, I only recommend
* to use mem_test_chain_sw() for PacBio data. It is disabled by default.
*/
int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c)
{
int i, qb, qe;
int64_t rb, re, rlen;
uint8_t *rseq = 0;
kswr_t x;
if (c->n == 0) return -1;
qb = l_query; qe = 0;
rb = l_pac<<1; re = 0;
for (i = 0; i < c->n; ++i) {
const mem_seed_t *s = &c->seeds[i];
qb = qb < s->qbeg? qb : s->qbeg;
qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len;
rb = rb < s->rbeg? rb : s->rbeg;
re = re > s->rbeg + s->len? re : s->rbeg + s->len;
}
qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT;
qb = qb > 0? qb : 0;
qe = qe < l_query? qe : l_query;
rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT;
rb = rb > 0? rb : 0;
re = re < l_pac<<1? re : l_pac<<1;
if (rb < l_pac && l_pac < re) {
if (c->seeds[0].rbeg < l_pac) re = l_pac;
else rb = l_pac;
}
if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1;
if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1;
if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1;
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
assert(rlen == re - rb);
x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0);
free(rseq);
if (x.score >= opt->min_HSP_score) return 1;
if (bwa_verbose >= 4) printf("** give up the chain due to small HSP score %d.\n", x.score);
return 0;
}
int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
{
int i, qb, qe, xtra;
@ -565,14 +622,13 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac,
xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
free(rseq);
if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
a.rb = rb + x.tb; a.re = rb + x.te + 1;
a.qb = qb + x.qb; a.qe = qb + x.qe + 1;
a.score = x.score;
a.csub = x.score2;
if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb);
if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1;
kv_push(mem_alnreg_t, *av, a);
if (bwa_verbose >= 4) printf("** Added alignment region via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re);
return 0;
}
@ -958,7 +1014,8 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
mem_chain_t *p = &chn.a[i];
int ret;
if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i);
ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
if (opt->min_HSP_score > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p);
else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
free(chn.a[i].seeds);
}

View File

@ -36,6 +36,7 @@ typedef struct {
int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
int n_threads; // number of threads
int chunk_size; // process chunk_size-bp sequences in a batch
int min_HSP_score; // used in mem_test_chain(); disabled by default
float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
float mask_level_redun;

View File

@ -37,7 +37,7 @@ int main_mem(int argc, char *argv[])
opt = mem_opt_init();
opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1;
opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1;
while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:")) >= 0) {
while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:u:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg);
else if (c == 'w') opt->w = atoi(optarg);
else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
@ -56,6 +56,7 @@ int main_mem(int argc, char *argv[])
else if (c == 'v') bwa_verbose = atoi(optarg);
else if (c == 'r') opt->split_factor = atof(optarg);
else if (c == 'D') opt->chain_drop_ratio = atof(optarg);
else if (c == 'u') opt->min_HSP_score = atoi(optarg);
else if (c == 'm') opt->max_matesw = atoi(optarg);
else if (c == 's') opt->split_width = atoi(optarg);
else if (c == 'N') opt->max_chain_gap = atoi(optarg);
@ -101,6 +102,7 @@ int main_mem(int argc, char *argv[])
else return 1;
}
if (opt->n_threads < 1) opt->n_threads = 1;
if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score;
if (optind + 1 >= argc || optind + 3 < argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
@ -123,6 +125,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
fprintf(stderr, " -u INT drop a chain if local SW score below INT; 0 to disable [%d]\n", opt->min_HSP_score);
fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
@ -136,7 +139,9 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n");
fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n");
fprintf(stderr, " FR orientation only. [inferred]\n");
fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
fprintf(stderr, "\n");
fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
fprintf(stderr, " `-k18 -u200 -w200 -c1000 -r10 -A3 -O3 -E1' is recommended for PacBio reads as of early 2014.\n");
fprintf(stderr, "\n");
free(opt);
return 1;

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.8+dev-r460"
#define PACKAGE_VERSION "0.7.8+dev-r461"
#endif
int bwa_fa2pac(int argc, char *argv[]);