From f12dfae7729f395d1eb33540f22be2d132c08d0a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Apr 2014 16:29:36 -0400 Subject: [PATCH] dev-465: a new output format for read overlap Also moved a few functions to bwamem_extra.c. File bwamem.c is becoming far too long. --- Makefile | 2 +- bwamem.c | 89 +++++----------------------------------- bwamem.h | 3 +- bwamem_extra.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++ fastmap.c | 15 ++++--- main.c | 2 +- 6 files changed, 134 insertions(+), 86 deletions(-) create mode 100644 bwamem_extra.c diff --git a/Makefile b/Makefile index 6490932..aa51937 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 -Wno-unused-function WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o malloc_wrap.o +LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ diff --git a/bwamem.c b/bwamem.c index 8d15f15..2e5104a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -76,65 +76,6 @@ mem_opt_t *mem_opt_init() return o; } -/*************************** - * SMEM iterator interface * - ***************************/ - -struct __smem_i { - const bwt_t *bwt; - const uint8_t *query; - int start, len; - bwtintv_v *matches; // matches; to be returned by smem_next() - bwtintv_v *sub; // sub-matches inside the longest match; temporary - bwtintv_v *tmpvec[2]; // temporary arrays -}; - -smem_i *smem_itr_init(const bwt_t *bwt) -{ - smem_i *itr; - itr = calloc(1, sizeof(smem_i)); - itr->bwt = bwt; - itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - itr->matches = calloc(1, sizeof(bwtintv_v)); - itr->sub = calloc(1, sizeof(bwtintv_v)); - return itr; -} - -void smem_itr_destroy(smem_i *itr) -{ - free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); - free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); - free(itr->matches->a); free(itr->matches); - free(itr->sub->a); free(itr->sub); - free(itr); -} - -void smem_set_query(smem_i *itr, int len, const uint8_t *query) -{ - itr->query = query; - itr->start = 0; - itr->len = len; -} - -const bwtintv_v *smem_next(smem_i *itr) -{ - int i, max, max_i, ori_start; - itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; - if (itr->start >= itr->len || itr->start < 0) return 0; - while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases - if (itr->start == itr->len) return 0; - ori_start = itr->start; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM - if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here - for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match - bwtintv_t *p = &itr->matches->a[i]; - int len = (uint32_t)p->info - (p->info>>32); - if (max < len) max = len, max_i = i; - } - return itr->matches; -} - /*************************** * Collection SA invervals * ***************************/ @@ -165,7 +106,7 @@ static void smem_aux_destroy(smem_aux_t *a) static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a) { int i, k, x = 0, old_n; - int start_width = (opt->flag & MEM_F_NO_EXACT)? 2 : 1; + int start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); a->mem.n = 0; // first pass: find all SMEMs @@ -357,7 +298,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) flt_aux_t *a; int i, j, n; if (n_chn <= 1) return n_chn; // no need to filter - for (i = j = 0; i < n_chn; ++i) { + for (i = j = 0; i < n_chn; ++i) { // filter out chains with small weight mem_chain_t *c = &chains[i]; int w; w = mem_chain_weight(c); @@ -482,7 +423,7 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen) { - if (!(opt->flag & MEM_F_NO_EXACT) || n == 0 || a->truesc != qlen * opt->a) return n; + if (!(opt->flag & MEM_F_SELF_OVLP) || n == 0 || a->truesc != qlen * opt->a) return n; memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t)); return n - 1; } @@ -1034,7 +975,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse } free(chn.a); regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); - if (opt->flag & MEM_F_NO_EXACT) + if (opt->flag & MEM_F_SELF_OVLP) regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); if (bwa_verbose >= 4) { err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); @@ -1046,19 +987,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse return regs; } -mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) -{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence - mem_alnreg_v ar; - char *seq; - seq = malloc(l_seq); - memcpy(seq, seq_, l_seq); // makes a copy of seq_ - ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); - mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); - free(seq); - return ar; -} -// This routine is only used for the API purpose mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; @@ -1087,7 +1016,6 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * w2 = w2 > tmp? w2 : tmp; if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; -// else w2 = opt->w; // TODO: check if we need this line on long reads. On 1-800bp reads, it does not matter and it should be. i = 0; a.cigar = 0; do { free(a.cigar); @@ -1161,11 +1089,16 @@ static void worker1(void *data, int i, int tid) static void worker2(void *data, int i, int tid) { extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); + extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); worker_t *w = (worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); - mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); - mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + if (w->opt->flag & MEM_F_ALN_REG) { + mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); + } else { + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); + mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + } free(w->regs[i].a); } else { if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name); diff --git a/bwamem.h b/bwamem.h index 27514cd..7b7a7e8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,7 +16,8 @@ typedef struct __smem_i smem_i; #define MEM_F_ALL 0x8 #define MEM_F_NO_MULTI 0x10 #define MEM_F_NO_RESCUE 0x20 -#define MEM_F_NO_EXACT 0x40 +#define MEM_F_SELF_OVLP 0x40 +#define MEM_F_ALN_REG 0x80 typedef struct { int a, b; // match score and mismatch penalty diff --git a/bwamem_extra.c b/bwamem_extra.c new file mode 100644 index 0000000..58e9f67 --- /dev/null +++ b/bwamem_extra.c @@ -0,0 +1,109 @@ +#include "bwa.h" +#include "bwamem.h" +#include "bntseq.h" +#include "kstring.h" + +/*************************** + * SMEM iterator interface * + ***************************/ + +struct __smem_i { + const bwt_t *bwt; + const uint8_t *query; + int start, len; + bwtintv_v *matches; // matches; to be returned by smem_next() + bwtintv_v *sub; // sub-matches inside the longest match; temporary + bwtintv_v *tmpvec[2]; // temporary arrays +}; + +smem_i *smem_itr_init(const bwt_t *bwt) +{ + smem_i *itr; + itr = calloc(1, sizeof(smem_i)); + itr->bwt = bwt; + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + itr->sub = calloc(1, sizeof(bwtintv_v)); + return itr; +} + +void smem_itr_destroy(smem_i *itr) +{ + free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); + free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); + free(itr->matches->a); free(itr->matches); + free(itr->sub->a); free(itr->sub); + free(itr); +} + +void smem_set_query(smem_i *itr, int len, const uint8_t *query) +{ + itr->query = query; + itr->start = 0; + itr->len = len; +} + +const bwtintv_v *smem_next(smem_i *itr) +{ + int i, max, max_i, ori_start; + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; + if (itr->start >= itr->len || itr->start < 0) return 0; + while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases + if (itr->start == itr->len) return 0; + ori_start = itr->start; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM + if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here + for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match + bwtintv_t *p = &itr->matches->a[i]; + int len = (uint32_t)p->info - (p->info>>32); + if (max < len) max = len, max_i = i; + } + return itr->matches; +} + +/*********************** + *** Extra functions *** + ***********************/ + +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) +{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence + extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); + mem_alnreg_v ar; + char *seq; + seq = malloc(l_seq); + memcpy(seq, seq_, l_seq); // makes a copy of seq_ + ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); + mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); + free(seq); + return ar; +} + +void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) +{ + int i; + kstring_t str = {0,0,0}; + for (i = 0; i < a->n; ++i) { + const mem_alnreg_t *p = &a->a[i]; + int is_rev, rid, qb = p->qb, qe = p->qe; + int64_t pos, rb = p->rb, re = p->re; + if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)s->seq, &qb, &qe, &rb, &re) < 0) { + fprintf(stderr, "[E::%s] Internal errors when processing read '%s'. Please let the developer know. Abort. Sorry.\n", __func__, s->name); + exit(1); + } + pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); + rid = bns_pos2rid(bns, pos); + pos -= bns->anns[rid].offset; + kputs(s->name, &str); kputc('\t', &str); + kputw(s->l_seq, &str); kputc('\t', &str); + if (is_rev) qb ^= qe, qe ^= qb, qb ^= qe; // swap + kputw(qb, &str); kputc('\t', &str); kputw(qe, &str); kputc('\t', &str); + kputs(bns->anns[rid].name, &str); kputc('\t', &str); + kputw(bns->anns[rid].len, &str); kputc('\t', &str); + kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str); + kputw(p->truesc, &str); kputc('\n', &str); + } + s->sam = str.s; +} + diff --git a/fastmap.c b/fastmap.c index 35797fd..7dc8001 100644 --- a/fastmap.c +++ b/fastmap.c @@ -53,7 +53,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -67,7 +67,8 @@ int main_mem(int argc, char *argv[]) else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; - else if (c == 'e') opt->flag |= MEM_F_NO_EXACT; + else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; + else if (c == 'F') opt->flag |= MEM_F_ALN_REG; else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); @@ -145,7 +146,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n"); fprintf(stderr, " pacbio: -k17 -W40 -w200 -c1000 -r10 -A2 -B7 -O2 -E1 -L0\n"); - fprintf(stderr, " pbread: -k13 -W30 -w200 -c1000 -r10 -A2 -B5 -O2 -E1\n"); + fprintf(stderr, " pbread: -k13 -W30 -w100 -c1000 -r10 -A2 -B5 -O2 -E1 -aeD.02\n"); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -174,15 +175,18 @@ int main_mem(int argc, char *argv[]) if (!opt0.e_del) opt->e_del = 1; if (!opt0.o_ins) opt->o_ins = 2; if (!opt0.e_ins) opt->e_ins = 1; - if (!opt0.w) opt->w = 200; if (!opt0.max_occ) opt->max_occ = 1000; if (opt0.split_factor == 0.) opt->split_factor = 10.; if (strcmp(mode, "pbread") == 0) { + opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG; if (!opt0.b) opt->b = 5; + if (!opt0.w) opt->w = 100; if (!opt0.min_seed_len) opt->min_seed_len = 13; if (!opt0.min_chain_weight) opt->min_chain_weight = 30; + if (opt0.chain_drop_ratio == 0.) opt->chain_drop_ratio = .02; } else { if (!opt0.b) opt->b = 7; + if (!opt0.w) opt->w = 200; if (!opt0.min_seed_len) opt->min_seed_len = 17; if (!opt0.min_chain_weight) opt->min_chain_weight = 40; if (!opt0.pen_clip5) opt->pen_clip5 = 0; @@ -220,7 +224,8 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } } - bwa_print_sam_hdr(idx->bns, rg_line); + if (!(opt->flag & MEM_F_ALN_REG)) + bwa_print_sam_hdr(idx->bns, rg_line); while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; if ((opt->flag & MEM_F_PE) && (n&1) == 1) { diff --git a/main.c b/main.c index a177e6f..0ef8307 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r464" +#define PACKAGE_VERSION "0.7.8+dev-r465" #endif int bwa_fa2pac(int argc, char *argv[]);