From 4b707aac9226aaf5fbaea45ed6d78f797f8c8432 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 15 Jul 2018 10:55:00 -0400 Subject: [PATCH] working with toy examples --- align.c | 4 ++-- hit.c | 14 ++++++++++---- map.c | 24 ++++++++++++++++++++---- mmpriv.h | 4 ++-- splitidx.c | 37 +++++++++++++++---------------------- 5 files changed, 49 insertions(+), 34 deletions(-) diff --git a/align.c b/align.c index c86e4c1..b22a3f2 100644 --- a/align.c +++ b/align.c @@ -199,7 +199,7 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) // mm_extra_t *p; if (n_cigar == 0) return; if (r->p == 0) { - uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4; // TODO: should this be "n_cigar + sizeof(mm_extra_t)/4" instead? + uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4; kroundup32(capacity); r->p = (mm_extra_t*)calloc(capacity, 4); r->p->capacity = capacity; @@ -878,6 +878,6 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m kfree(km, qseq0[0]); kfree(km, ez.cigar); mm_filter_regs(opt, qlen, n_regs_, regs); - mm_hit_sort_by_dp(km, n_regs_, regs); + mm_hit_sort(km, n_regs_, regs); return regs; } diff --git a/hit.c b/hit.c index 972bdb6..9616bf5 100644 --- a/hit.c +++ b/hit.c @@ -164,9 +164,9 @@ set_parent_test: kfree(km, w); } -void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r) +void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r) { - int32_t i, n_aux, n = *n_regs; + int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0; mm128_t *aux; mm_reg1_t *t; @@ -175,14 +175,20 @@ void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r) t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t)); for (i = n_aux = 0; i < n; ++i) { if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted) - assert(r[i].p); - aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash; + if (r[i].p) { + aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash; + has_cigar = 1; + } else { + aux[n_aux].x = (uint64_t)r[i].score << 32 | r[i].hash; + no_cigar = 1; + } aux[n_aux++].y = i; } else if (r[i].p) { free(r[i].p); r[i].p = 0; } } + assert(has_cigar + no_cigar == 1); radix_sort_128x(aux, aux + n_aux); for (i = n_aux - 1; i >= 0; --i) t[n_aux - 1 - i] = r[aux[i].y]; diff --git a/map.c b/map.c index 5ce8523..be4ae74 100644 --- a/map.c +++ b/map.c @@ -399,6 +399,7 @@ typedef struct { kstring_t str; int n_parts; + uint32_t *rid_shift; FILE *fp_split, **fp_parts; } pipeline_t; @@ -486,6 +487,7 @@ static void merge_hits(step_t *s) mm_reg1_t *r = &s->reg[k][l]; uint32_t capacity; mm_err_fread(r, sizeof(mm_reg1_t), 1, fp[j]); + r->rid += s->p->rid_shift[j]; if (opt->flag & MM_F_CIGAR) { mm_err_fread(&capacity, 4, 1, fp[j]); r->p = (mm_extra_t*)calloc(capacity, 4); @@ -494,6 +496,7 @@ static void merge_hits(step_t *s) } } } + mm_hit_sort(km, &s->n_reg[k], s->reg[k]); mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b); if (!(opt->flag & MM_F_ALL_CHAINS)) { mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]); @@ -663,13 +666,26 @@ int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_sp pl.fp = open_bseqs(pl.n_fp, fn); if (pl.fp == 0) return -1; pl.opt = opt; - pl.n_parts = n_split_idx; - pl.fp_parts = mm_split_merge_prep(opt->split_prefix, n_split_idx, &mi); - if (pl.fp_parts == 0) return -1; - pl.mi = mi; pl.mini_batch_size = opt->mini_batch_size; + + pl.n_parts = n_split_idx; + pl.fp_parts = CALLOC(FILE*, pl.n_parts); + pl.rid_shift = CALLOC(uint32_t, pl.n_parts); + pl.mi = mm_split_merge_prep(opt->split_prefix, n_split_idx, pl.fp_parts, pl.rid_shift); + if (pl.mi == 0) { + free(pl.fp_parts); + free(pl.rid_shift); + return -1; + } + for (i = n_split_idx - 1; i > 0; --i) + pl.rid_shift[i] = pl.rid_shift[i - 1]; + for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i) + pl.rid_shift[i] += pl.rid_shift[i - 1]; + kt_pipeline(2, worker_pipeline, &pl, 3); + free(pl.str.s); + free(pl.rid_shift); for (i = 0; i < n_split_idx; ++i) fclose(pl.fp_parts[i]); free(pl.fp_parts); diff --git a/mmpriv.h b/mmpriv.h index 99272a0..a02a1b7 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -80,7 +80,7 @@ void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_, void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r); void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs); void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a); -void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r); +void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r); void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr); void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos); @@ -90,7 +90,7 @@ void mm_seg_free(void *km, int n_segs, mm_seg_t *segs); void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs); FILE *mm_split_init(const char *prefix, const mm_idx_t *mi); -FILE **mm_split_merge_prep(const char *prefix, int n_splits, mm_idx_t **mi_); +mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part); int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx); void mm_err_puts(const char *str); diff --git a/splitidx.c b/splitidx.c index db0217c..7d8627d 100644 --- a/splitidx.c +++ b/splitidx.c @@ -26,17 +26,13 @@ FILE *mm_split_init(const char *prefix, const mm_idx_t *mi) return fp; } -FILE **mm_split_merge_prep(const char *prefix, int n_splits, mm_idx_t **mi_) +mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part) { mm_idx_t *mi = 0; - FILE **fp; char *fn; int i, j; - uint32_t m_seq = 0; if (n_splits < 1) return 0; - *mi_ = 0; - fp = (FILE**)calloc(n_splits, sizeof(FILE*)); fn = (char*)calloc(strlen(prefix) + 10, 1); for (i = 0; i < n_splits; ++i) { sprintf(fn, "%s.%.4d.tmp", prefix, i); @@ -45,31 +41,28 @@ FILE **mm_split_merge_prep(const char *prefix, int n_splits, mm_idx_t **mi_) fprintf(stderr, "ERROR: failed to open temporary file '%s'\n", fn); for (j = 0; j < i; ++j) fclose(fp[j]); - free(fp); + free(fn); return 0; } } free(fn); + mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t)); for (i = 0; i < n_splits; ++i) { - uint32_t k, n; - mm_err_fread(&k, 4, 1, fp[i]); - mm_err_fread(&n, 4, 1, fp[i]); - mi->k = k; - if (mi->n_seq + n > m_seq) { - m_seq = mi->n_seq + n; - kroundup32(m_seq); - mi->seq = (mm_idx_seq_t*)realloc(mi->seq, sizeof(mm_idx_seq_t) * m_seq); - } - for (k = 0; k < n; ++i) { + mm_err_fread(&mi->k, 4, 1, fp[i]); // TODO: check if k is all the same + mm_err_fread(&n_seq_part[i], 4, 1, fp[i]); + mi->n_seq += n_seq_part[i]; + } + mi->seq = CALLOC(mm_idx_seq_t, mi->n_seq); + for (i = j = 0; i < n_splits; ++i) { + uint32_t k; + for (k = 0; k < n_seq_part[i]; ++k, ++j) { uint8_t l; mm_err_fread(&l, 1, 1, fp[i]); - mi->seq[mi->n_seq].name = (char*)calloc(l + 1, 1); - mm_err_fread(mi->seq[mi->n_seq].name, 1, l, fp[i]); - mm_err_fread(&mi->seq[mi->n_seq].len, 4, 1, fp[i]); - ++mi->n_seq; + mi->seq[j].name = (char*)calloc(l + 1, 1); + mm_err_fread(mi->seq[j].name, 1, l, fp[i]); + mm_err_fread(&mi->seq[j].len, 4, 1, fp[i]); } } - *mi_ = mi; - return fp; + return mi; }