From 4219e586238a602b84f4d8be920514e3a736e9af Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Nov 2013 09:36:26 -0500 Subject: [PATCH] r423: bugfix - SE hits not random --- bwamem.c | 27 ++++++++++++++------------- bwamem.h | 3 ++- bwamem_pair.c | 6 +++--- fastmap.c | 4 +++- main.c | 2 +- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/bwamem.c b/bwamem.c index b1e9f4b..5089d75 100644 --- a/bwamem.c +++ b/bwamem.c @@ -372,6 +372,9 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) #define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) +#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)) +KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) + int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) { int m, i, j; @@ -415,13 +418,14 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) return m; } -void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function +void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function { // similar to the loop in mem_chain_flt() int i, k, tmp; kvec_t(int) z; if (n == 0) return; kv_init(z); - for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1; + for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); + ks_introsort(mem_ars_hash, n, a); tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; kv_push(int, z, 0); for (i = 1; i < n; ++i) { @@ -890,7 +894,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * seq = malloc(l_seq); memcpy(seq, seq_, l_seq); // makes a copy of seq_ ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); - mem_mark_primary_se(opt, ar.n, ar.a); + mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); free(seq); return ar; } @@ -967,6 +971,7 @@ typedef struct { const mem_pestat_t *pes; bseq1_t *seqs; mem_alnreg_v *regs; + int64_t n_processed; } worker_t; static void worker1(void *data, int i, int tid) @@ -985,30 +990,26 @@ static void worker2(void *data, int i, int tid) extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { - mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } else { - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } } -void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0) +void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) { extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); - int i; worker_t w; mem_alnreg_v *regs; mem_pestat_t pes[4]; regs = malloc(n * sizeof(mem_alnreg_v)); - for (i = 0; i < opt->n_threads; ++i) { - worker_t *p = &w; - p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac; - p->seqs = seqs; p->regs = regs; - p->pes = &pes[0]; - } + w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; + w.seqs = seqs; w.regs = regs; w.n_processed = n_processed; + w.pes = &pes[0]; kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 diff --git a/bwamem.h b/bwamem.h index 1beaa23..8b24c51 100644 --- a/bwamem.h +++ b/bwamem.h @@ -54,6 +54,7 @@ typedef struct { int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary + uint64_t hash; } mem_alnreg_t; typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; @@ -109,7 +110,7 @@ extern "C" { * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. */ - void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0); + void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); /** * Find the aligned regions for one query sequence diff --git a/bwamem_pair.c b/bwamem_pair.c index 648c67e..1729b8c 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -235,7 +235,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { - extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); @@ -257,8 +257,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); } - mem_mark_primary_se(opt, a[0].n, a[0].a); - mem_mark_primary_se(opt, a[1].n, a[1].a); + mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); + mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { diff --git a/fastmap.c b/fastmap.c index 05e7d7f..1884ebc 100644 --- a/fastmap.c +++ b/fastmap.c @@ -27,6 +27,7 @@ int main_mem(int argc, char *argv[]) bwaidx_t *idx; char *rg_line = 0; void *ko = 0, *ko2 = 0; + int64_t n_processed = 0; opt = mem_opt_init(); while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:")) >= 0) { @@ -137,7 +138,8 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < n; ++i) size += seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); - mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs, 0); + mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, 0); + n_processed += n; for (i = 0; i < n; ++i) { err_fputs(seqs[i].sam, stdout); free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); diff --git a/main.c b/main.c index 5acd89f..6d3e301 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r422" +#define PACKAGE_VERSION "0.7.5a-r423" #endif int bwa_fa2pac(int argc, char *argv[]);