r423: bugfix - SE hits not random

This commit is contained in:
Heng Li 2013-11-23 09:36:26 -05:00
parent 29aa855432
commit 4219e58623
5 changed files with 23 additions and 19 deletions

View File

@ -372,6 +372,9 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2)
#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))
KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun)
{
int m, i, j;
@ -415,13 +418,14 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun)
return m;
}
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
{ // similar to the loop in mem_chain_flt()
int i, k, tmp;
kvec_t(int) z;
if (n == 0) return;
kv_init(z);
for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1;
for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
ks_introsort(mem_ars_hash, n, a);
tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
kv_push(int, z, 0);
for (i = 1; i < n; ++i) {
@ -890,7 +894,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *
seq = malloc(l_seq);
memcpy(seq, seq_, l_seq); // makes a copy of seq_
ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq);
mem_mark_primary_se(opt, ar.n, ar.a);
mem_mark_primary_se(opt, ar.n, ar.a, lrand48());
free(seq);
return ar;
}
@ -967,6 +971,7 @@ typedef struct {
const mem_pestat_t *pes;
bseq1_t *seqs;
mem_alnreg_v *regs;
int64_t n_processed;
} worker_t;
static void worker1(void *data, int i, int tid)
@ -985,30 +990,26 @@ static void worker2(void *data, int i, int tid)
extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
worker_t *w = (worker_t*)data;
if (!(w->opt->flag&MEM_F_PE)) {
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a);
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
free(w->regs[i].a);
} else {
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]);
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]);
free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
}
}
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0)
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0)
{
extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);
int i;
worker_t w;
mem_alnreg_v *regs;
mem_pestat_t pes[4];
regs = malloc(n * sizeof(mem_alnreg_v));
for (i = 0; i < opt->n_threads; ++i) {
worker_t *p = &w;
p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac;
p->seqs = seqs; p->regs = regs;
p->pes = &pes[0];
}
w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
w.seqs = seqs; w.regs = regs; w.n_processed = n_processed;
w.pes = &pes[0];
kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0

View File

@ -54,6 +54,7 @@ typedef struct {
int w; // actual band width used in extension
int seedcov; // length of regions coverged by seeds
int secondary; // index of the parent hit shadowing the current hit; <0 if primary
uint64_t hash;
} mem_alnreg_t;
typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
@ -109,7 +110,7 @@ extern "C" {
* @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements,
* corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info.
*/
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
/**
* Find the aligned regions for one query sequence

View File

@ -235,7 +235,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_
int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
{
extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a);
extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m);
@ -257,8 +257,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
free(b[0].a); free(b[1].a);
}
mem_mark_primary_se(opt, a[0].n, a[0].a);
mem_mark_primary_se(opt, a[1].n, a[1].a);
mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
// pairing single-end hits
if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) {

View File

@ -27,6 +27,7 @@ int main_mem(int argc, char *argv[])
bwaidx_t *idx;
char *rg_line = 0;
void *ko = 0, *ko2 = 0;
int64_t n_processed = 0;
opt = mem_opt_init();
while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:")) >= 0) {
@ -137,7 +138,8 @@ int main_mem(int argc, char *argv[])
for (i = 0; i < n; ++i) size += seqs[i].l_seq;
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs, 0);
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, 0);
n_processed += n;
for (i = 0; i < n; ++i) {
err_fputs(seqs[i].sam, stdout);
free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam);

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.7.5a-r422"
#define PACKAGE_VERSION "0.7.5a-r423"
#endif
int bwa_fa2pac(int argc, char *argv[]);