r809: new strategy for the -a mode
This commit is contained in:
parent
bf7d1d46ca
commit
b5cba257c1
8
bwamem.c
8
bwamem.c
|
|
@ -2,6 +2,7 @@
|
|||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
#ifdef HAVE_PTHREAD
|
||||
#include <pthread.h>
|
||||
|
|
@ -57,6 +58,9 @@ mem_opt_t *mem_opt_init()
|
|||
o->zdrop = 100;
|
||||
o->pen_unpaired = 17;
|
||||
o->pen_clip5 = o->pen_clip3 = 5;
|
||||
|
||||
o->max_mem_intv = 0;
|
||||
|
||||
o->min_seed_len = 19;
|
||||
o->split_width = 10;
|
||||
o->max_occ = 500;
|
||||
|
|
@ -115,7 +119,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
|||
// first pass: find all SMEMs
|
||||
while (x < len) {
|
||||
if (seq[x] < 4) {
|
||||
x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
|
||||
x = bwt_smem1a(bwt, len, seq, x, start_width, split_len, opt->max_mem_intv, &a->mem1, a->tmpv);
|
||||
for (i = 0; i < a->mem1.n; ++i) {
|
||||
bwtintv_t *p = &a->mem1.a[i];
|
||||
int slen = (uint32_t)p->info - (p->info>>32); // seed length
|
||||
|
|
@ -125,6 +129,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
|||
} else ++x;
|
||||
}
|
||||
// second pass: find MEMs inside a long SMEM
|
||||
if (opt->max_mem_intv > 0) goto sort_intv;
|
||||
old_n = a->mem.n;
|
||||
for (k = 0; k < old_n; ++k) {
|
||||
bwtintv_t *p = &a->mem.a[k];
|
||||
|
|
@ -136,6 +141,7 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
|
|||
kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
|
||||
}
|
||||
// sort
|
||||
sort_intv:
|
||||
ks_introsort(mem_intv, a->mem.n, a->mem.a);
|
||||
}
|
||||
|
||||
|
|
|
|||
3
bwamem.h
3
bwamem.h
|
|
@ -29,6 +29,8 @@ typedef struct {
|
|||
int w; // band width
|
||||
int zdrop; // Z-dropoff
|
||||
|
||||
uint64_t max_mem_intv;
|
||||
|
||||
int T; // output score threshold; only affecting output
|
||||
int flag; // see MEM_F_* macros
|
||||
int min_seed_len; // minimum seed length
|
||||
|
|
@ -97,6 +99,7 @@ extern "C" {
|
|||
smem_i *smem_itr_init(const bwt_t *bwt);
|
||||
void smem_itr_destroy(smem_i *itr);
|
||||
void smem_set_query(smem_i *itr, int len, const uint8_t *query);
|
||||
void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv);
|
||||
const bwtintv_v *smem_next(smem_i *itr);
|
||||
|
||||
mem_opt_t *mem_opt_init(void);
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
#include <limits.h>
|
||||
#include "bwa.h"
|
||||
#include "bwamem.h"
|
||||
#include "bntseq.h"
|
||||
|
|
@ -11,6 +12,8 @@ struct __smem_i {
|
|||
const bwt_t *bwt;
|
||||
const uint8_t *query;
|
||||
int start, len;
|
||||
int min_intv, max_len;
|
||||
uint64_t max_intv;
|
||||
bwtintv_v *matches; // matches; to be returned by smem_next()
|
||||
bwtintv_v *sub; // sub-matches inside the longest match; temporary
|
||||
bwtintv_v *tmpvec[2]; // temporary arrays
|
||||
|
|
@ -25,6 +28,9 @@ smem_i *smem_itr_init(const bwt_t *bwt)
|
|||
itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
|
||||
itr->matches = calloc(1, sizeof(bwtintv_v));
|
||||
itr->sub = calloc(1, sizeof(bwtintv_v));
|
||||
itr->min_intv = 1;
|
||||
itr->max_len = INT_MAX;
|
||||
itr->max_intv = 0;
|
||||
return itr;
|
||||
}
|
||||
|
||||
|
|
@ -44,6 +50,13 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
|
|||
itr->len = len;
|
||||
}
|
||||
|
||||
void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
|
||||
{
|
||||
itr->min_intv = min_intv;
|
||||
itr->max_len = max_len;
|
||||
itr->max_intv = max_intv;
|
||||
}
|
||||
|
||||
const bwtintv_v *smem_next(smem_i *itr)
|
||||
{
|
||||
int i, max, max_i, ori_start;
|
||||
|
|
@ -52,7 +65,7 @@ const bwtintv_v *smem_next(smem_i *itr)
|
|||
while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
|
||||
if (itr->start == itr->len) return 0;
|
||||
ori_start = itr->start;
|
||||
itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
|
||||
itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_len, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
|
||||
if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
|
||||
for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
|
||||
bwtintv_t *p = &itr->matches->a[i];
|
||||
|
|
|
|||
16
bwt.c
16
bwt.c
|
|
@ -30,6 +30,7 @@
|
|||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <limits.h>
|
||||
#include "utils.h"
|
||||
#include "bwt.h"
|
||||
#include "kvec.h"
|
||||
|
|
@ -285,7 +286,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
|
|||
}
|
||||
}
|
||||
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
{
|
||||
int i, j, c, ret;
|
||||
bwtintv_t ik, ok[4];
|
||||
|
|
@ -307,6 +308,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
if (ok[c].x[2] != ik.x[2]) { // change of the interval size
|
||||
kv_push(bwtintv_t, *curr, ik);
|
||||
if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
|
||||
if (i - x > max_len && ik.x[2] < max_intv) break;
|
||||
}
|
||||
ik = ok[c]; ik.info = i + 1;
|
||||
} else { // an ambiguous base
|
||||
|
|
@ -323,8 +325,13 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
|
||||
for (j = 0, curr->n = 0; j < prev->n; ++j) {
|
||||
bwtintv_t *p = &prev->a[j];
|
||||
int max_stop = 0;
|
||||
bwt_extend(bwt, p, ok, 1);
|
||||
if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
||||
if (ok[c].x[2] != p->x[2]) { // change of interval size
|
||||
if (p->info - i - 1 > max_len && p->x[2] < max_intv)
|
||||
max_stop = 1;
|
||||
}
|
||||
if (c < 0 || ok[c].x[2] < min_intv || max_stop) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
|
||||
if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
|
||||
if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
|
||||
ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
|
||||
|
|
@ -346,6 +353,11 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
|
|||
return ret;
|
||||
}
|
||||
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
|
||||
{
|
||||
return bwt_smem1a(bwt, len, q, x, min_intv, INT_MAX, 0, mem, tmpvec);
|
||||
}
|
||||
|
||||
/*************************
|
||||
* Read/write BWT and SA *
|
||||
*************************/
|
||||
|
|
|
|||
1
bwt.h
1
bwt.h
|
|
@ -119,6 +119,7 @@ extern "C" {
|
|||
* Return the end of the longest exact match starting from _x_.
|
||||
*/
|
||||
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, int max_len, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
|
||||
|
||||
// SMEM iterator interface
|
||||
|
||||
|
|
|
|||
26
fastmap.c
26
fastmap.c
|
|
@ -3,6 +3,7 @@
|
|||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <ctype.h>
|
||||
#include <math.h>
|
||||
#include "bwa.h"
|
||||
|
|
@ -53,7 +54,7 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
opt = mem_opt_init();
|
||||
memset(&opt0, 0, sizeof(mem_opt_t));
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
|
||||
else if (c == 'x') mode = optarg;
|
||||
else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
|
||||
|
|
@ -81,6 +82,7 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
|
||||
else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
|
||||
else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
|
||||
else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
|
||||
else if (c == 'C') copy_comment = 1;
|
||||
else if (c == 'Q') {
|
||||
opt0.mapQ_coef_len = 1;
|
||||
|
|
@ -133,6 +135,7 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
|
||||
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
|
||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT [%ld]\n", (long)opt->max_mem_intv);
|
||||
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
|
||||
|
|
@ -141,12 +144,13 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -S skip mate rescue\n");
|
||||
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
|
||||
fprintf(stderr, " -e discard full-length exact matches\n");
|
||||
fprintf(stderr, "\nScoring options:\n\n");
|
||||
fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
|
||||
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
|
||||
fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
|
||||
fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
|
||||
fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
|
||||
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
|
||||
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired);
|
||||
fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n");
|
||||
fprintf(stderr, " pacbio: -k17 -W40 -r10 -A2 -B5 -O2 -E1 -L0\n");
|
||||
fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001\n");
|
||||
|
|
@ -264,7 +268,8 @@ int main_mem(int argc, char *argv[])
|
|||
|
||||
int main_fastmap(int argc, char *argv[])
|
||||
{
|
||||
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
|
||||
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX;
|
||||
uint64_t max_intv = 0;
|
||||
kseq_t *seq;
|
||||
bwtint_t k;
|
||||
gzFile fp;
|
||||
|
|
@ -272,16 +277,26 @@ int main_fastmap(int argc, char *argv[])
|
|||
const bwtintv_v *a;
|
||||
bwaidx_t *idx;
|
||||
|
||||
while ((c = getopt(argc, argv, "w:l:p")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'p': print_seq = 1; break;
|
||||
case 'w': min_iwidth = atoi(optarg); break;
|
||||
case 'l': min_len = atoi(optarg); break;
|
||||
case 'i': min_intv = atoi(optarg); break;
|
||||
case 'I': max_intv = atol(optarg); break;
|
||||
case 'L': max_len = atoi(optarg); break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (optind + 1 >= argc) {
|
||||
fprintf(stderr, "Usage: bwa fastmap [-p] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
|
||||
fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
|
||||
fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth);
|
||||
fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);
|
||||
fprintf(stderr, " -l INT max MEM length [%d]\n", max_len);
|
||||
fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv);
|
||||
fprintf(stderr, "\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -289,6 +304,7 @@ int main_fastmap(int argc, char *argv[])
|
|||
seq = kseq_init(fp);
|
||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
|
||||
itr = smem_itr_init(idx->bwt);
|
||||
smem_config(itr, min_intv, max_len, max_intv);
|
||||
while (kseq_read(seq) >= 0) {
|
||||
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
|
||||
if (print_seq) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue