From f44edd4fc9e06f7e81b84d23f6bdb06ab1934a45 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 28 Jun 2012 14:51:02 -0400 Subject: [PATCH] r128: more conservative chaining filter --- bwtsw2.h | 2 +- bwtsw2_aux.c | 3 ++- bwtsw2_chain.c | 11 ++++++----- bwtsw2_main.c | 6 ++++-- main.c | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index 0a1b860..b1f6a3f 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -13,7 +13,7 @@ typedef struct { int skip_sw:16, hard_clip:16; - int a, b, q, r, t, qr, bw, max_ins; + int a, b, q, r, t, qr, bw, max_ins, max_chain_gap; int z, is, t_seeds, multi_2nd; float mask_level, coef; int n_threads, chunk_size; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 8f838f5..f6f6df8 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -54,6 +54,7 @@ bsw2opt_t *bsw2_init_opt() o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0; o->mask_level = 0.50f; o->coef = 5.5f; o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; + o->max_chain_gap = 10000; return o; } @@ -286,7 +287,7 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 } } b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" - bsw2_chain_filter(opt, l, b); + bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained for (k = 0; k < 2; ++k) { bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem); merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here diff --git a/bwtsw2_chain.c b/bwtsw2_chain.c index c734657..381d0b7 100644 --- a/bwtsw2_chain.c +++ b/bwtsw2_chain.c @@ -23,15 +23,15 @@ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t hsaip_t *q = chain + k; int x = p->qbeg - q->qbeg; // always positive int y = p->tbeg - q->tbeg; - if (y > 0 && x - y <= opt->bw && y - x <= opt->bw) { + if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && x - y <= opt->bw && y - x <= opt->bw) { // chained if (p->qend > q->qend) q->qend = p->qend; if (p->tend > q->tend) q->tend = p->tend; ++q->chain; p->chain = shift + k; break; - } + } else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains } - if (k < 0) { + if (k < 0) { // not added to any previous chains chain[m] = *p; chain[m].chain = 1; chain[m].idx = p->chain = shift + m; @@ -44,7 +44,7 @@ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) { hsaip_t *z[2], *chain[2]; - int i, j, k, n[2], m[2]; + int i, j, k, n[2], m[2], thres = opt->t_seeds * 2; char *flag; // initialization n[0] = b[0]->n; n[1] = b[1]->n; @@ -71,6 +71,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) int tmp = p->qbeg; p->qbeg = len - p->qend; p->qend = len - tmp; } + //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend); // filtering flag = calloc(m[0] + m[1], 1); ks_introsort(hsaip, m[0] + m[1], chain[0]); @@ -79,7 +80,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) for (j = 0; j < k; ++j) { hsaip_t *q = chain[0] + j; if (flag[q->idx]) continue; - if (q->qend >= p->qend && q->chain > p->chain * opt->t_seeds * 2) { + if (q->qend >= p->qend && q->chain > p->chain * thres && p->chain < thres) { flag[p->idx] = 1; break; } diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 50355fe..a802ee7 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -18,7 +18,7 @@ int bwa_bwtsw2(int argc, char *argv[]) opt = bsw2_init_opt(); srand48(11); - while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:S")) >= 0) { + while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:")) >= 0) { switch (c) { case 'q': opt->q = atoi(optarg); break; case 'r': opt->r = atoi(optarg); break; @@ -37,6 +37,7 @@ int bwa_bwtsw2(int argc, char *argv[]) case 'f': xreopen(optarg, "w", stdout); break; case 'I': opt->max_ins = atoi(optarg); break; case 'S': opt->skip_sw = 1; break; + case 'G': opt->max_chain_gap = atoi(optarg); break; } } opt->qr = opt->q + opt->r; @@ -62,7 +63,8 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); - fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds); + fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds); + fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap); fprintf(stderr, "\n"); fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n"); fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n"); diff --git a/main.c b/main.c index 0e7af77..aa5bb3b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r126" +#define PACKAGE_VERSION "0.6.2-r128" #endif static int usage()