working with toy examples

This commit is contained in:
Heng Li 2018-07-15 10:55:00 -04:00
parent 951c0d1d35
commit 4b707aac92
5 changed files with 49 additions and 34 deletions

View File

@ -199,7 +199,7 @@ static void mm_append_cigar(mm_reg1_t *r, uint32_t n_cigar, uint32_t *cigar) //
mm_extra_t *p;
if (n_cigar == 0) return;
if (r->p == 0) {
uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4; // TODO: should this be "n_cigar + sizeof(mm_extra_t)/4" instead?
uint32_t capacity = n_cigar + sizeof(mm_extra_t)/4;
kroundup32(capacity);
r->p = (mm_extra_t*)calloc(capacity, 4);
r->p->capacity = capacity;
@ -878,6 +878,6 @@ mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *m
kfree(km, qseq0[0]);
kfree(km, ez.cigar);
mm_filter_regs(opt, qlen, n_regs_, regs);
mm_hit_sort_by_dp(km, n_regs_, regs);
mm_hit_sort(km, n_regs_, regs);
return regs;
}

14
hit.c
View File

@ -164,9 +164,9 @@ set_parent_test:
kfree(km, w);
}
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r)
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r)
{
int32_t i, n_aux, n = *n_regs;
int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0;
mm128_t *aux;
mm_reg1_t *t;
@ -175,14 +175,20 @@ void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r)
t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
for (i = n_aux = 0; i < n; ++i) {
if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
assert(r[i].p);
aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash;
if (r[i].p) {
aux[n_aux].x = (uint64_t)r[i].p->dp_max << 32 | r[i].hash;
has_cigar = 1;
} else {
aux[n_aux].x = (uint64_t)r[i].score << 32 | r[i].hash;
no_cigar = 1;
}
aux[n_aux++].y = i;
} else if (r[i].p) {
free(r[i].p);
r[i].p = 0;
}
}
assert(has_cigar + no_cigar == 1);
radix_sort_128x(aux, aux + n_aux);
for (i = n_aux - 1; i >= 0; --i)
t[n_aux - 1 - i] = r[aux[i].y];

24
map.c
View File

@ -399,6 +399,7 @@ typedef struct {
kstring_t str;
int n_parts;
uint32_t *rid_shift;
FILE *fp_split, **fp_parts;
} pipeline_t;
@ -486,6 +487,7 @@ static void merge_hits(step_t *s)
mm_reg1_t *r = &s->reg[k][l];
uint32_t capacity;
mm_err_fread(r, sizeof(mm_reg1_t), 1, fp[j]);
r->rid += s->p->rid_shift[j];
if (opt->flag & MM_F_CIGAR) {
mm_err_fread(&capacity, 4, 1, fp[j]);
r->p = (mm_extra_t*)calloc(capacity, 4);
@ -494,6 +496,7 @@ static void merge_hits(step_t *s)
}
}
}
mm_hit_sort(km, &s->n_reg[k], s->reg[k]);
mm_set_parent(km, opt->mask_level, s->n_reg[k], s->reg[k], opt->a * 2 + opt->b);
if (!(opt->flag & MM_F_ALL_CHAINS)) {
mm_select_sub(km, opt->pri_ratio, s->p->mi->k*2, opt->best_n, &s->n_reg[k], s->reg[k]);
@ -663,13 +666,26 @@ int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_sp
pl.fp = open_bseqs(pl.n_fp, fn);
if (pl.fp == 0) return -1;
pl.opt = opt;
pl.n_parts = n_split_idx;
pl.fp_parts = mm_split_merge_prep(opt->split_prefix, n_split_idx, &mi);
if (pl.fp_parts == 0) return -1;
pl.mi = mi;
pl.mini_batch_size = opt->mini_batch_size;
pl.n_parts = n_split_idx;
pl.fp_parts = CALLOC(FILE*, pl.n_parts);
pl.rid_shift = CALLOC(uint32_t, pl.n_parts);
pl.mi = mm_split_merge_prep(opt->split_prefix, n_split_idx, pl.fp_parts, pl.rid_shift);
if (pl.mi == 0) {
free(pl.fp_parts);
free(pl.rid_shift);
return -1;
}
for (i = n_split_idx - 1; i > 0; --i)
pl.rid_shift[i] = pl.rid_shift[i - 1];
for (pl.rid_shift[0] = 0, i = 1; i < n_split_idx; ++i)
pl.rid_shift[i] += pl.rid_shift[i - 1];
kt_pipeline(2, worker_pipeline, &pl, 3);
free(pl.str.s);
free(pl.rid_shift);
for (i = 0; i < n_split_idx; ++i)
fclose(pl.fp_parts[i]);
free(pl.fp_parts);

View File

@ -80,7 +80,7 @@ void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int *n_,
void mm_select_sub_multi(void *km, float pri_ratio, float pri1, float pri2, int max_gap_ref, int min_diff, int best_n, int n_segs, const int *qlens, int *n_, mm_reg1_t *r);
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs);
void mm_join_long(void *km, const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs, mm128_t *a);
void mm_hit_sort_by_dp(void *km, int *n_regs, mm_reg1_t *r);
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r);
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr);
void mm_est_err(const mm_idx_t *mi, int qlen, int n_regs, mm_reg1_t *regs, const mm128_t *a, int32_t n, const uint64_t *mini_pos);
@ -90,7 +90,7 @@ void mm_seg_free(void *km, int n_segs, mm_seg_t *segs);
void mm_pair(void *km, int max_gap_ref, int dp_bonus, int sub_diff, int match_sc, const int *qlens, int *n_regs, mm_reg1_t **regs);
FILE *mm_split_init(const char *prefix, const mm_idx_t *mi);
FILE **mm_split_merge_prep(const char *prefix, int n_splits, mm_idx_t **mi_);
mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part);
int mm_split_merge(int n_segs, const char **fn, const mm_mapopt_t *opt, int n_split_idx);
void mm_err_puts(const char *str);

View File

@ -26,17 +26,13 @@ FILE *mm_split_init(const char *prefix, const mm_idx_t *mi)
return fp;
}
FILE **mm_split_merge_prep(const char *prefix, int n_splits, mm_idx_t **mi_)
mm_idx_t *mm_split_merge_prep(const char *prefix, int n_splits, FILE **fp, uint32_t *n_seq_part)
{
mm_idx_t *mi = 0;
FILE **fp;
char *fn;
int i, j;
uint32_t m_seq = 0;
if (n_splits < 1) return 0;
*mi_ = 0;
fp = (FILE**)calloc(n_splits, sizeof(FILE*));
fn = (char*)calloc(strlen(prefix) + 10, 1);
for (i = 0; i < n_splits; ++i) {
sprintf(fn, "%s.%.4d.tmp", prefix, i);
@ -45,31 +41,28 @@ FILE **mm_split_merge_prep(const char *prefix, int n_splits, mm_idx_t **mi_)
fprintf(stderr, "ERROR: failed to open temporary file '%s'\n", fn);
for (j = 0; j < i; ++j)
fclose(fp[j]);
free(fp);
free(fn);
return 0;
}
}
free(fn);
mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t));
for (i = 0; i < n_splits; ++i) {
uint32_t k, n;
mm_err_fread(&k, 4, 1, fp[i]);
mm_err_fread(&n, 4, 1, fp[i]);
mi->k = k;
if (mi->n_seq + n > m_seq) {
m_seq = mi->n_seq + n;
kroundup32(m_seq);
mi->seq = (mm_idx_seq_t*)realloc(mi->seq, sizeof(mm_idx_seq_t) * m_seq);
}
for (k = 0; k < n; ++i) {
mm_err_fread(&mi->k, 4, 1, fp[i]); // TODO: check if k is all the same
mm_err_fread(&n_seq_part[i], 4, 1, fp[i]);
mi->n_seq += n_seq_part[i];
}
mi->seq = CALLOC(mm_idx_seq_t, mi->n_seq);
for (i = j = 0; i < n_splits; ++i) {
uint32_t k;
for (k = 0; k < n_seq_part[i]; ++k, ++j) {
uint8_t l;
mm_err_fread(&l, 1, 1, fp[i]);
mi->seq[mi->n_seq].name = (char*)calloc(l + 1, 1);
mm_err_fread(mi->seq[mi->n_seq].name, 1, l, fp[i]);
mm_err_fread(&mi->seq[mi->n_seq].len, 4, 1, fp[i]);
++mi->n_seq;
mi->seq[j].name = (char*)calloc(l + 1, 1);
mm_err_fread(mi->seq[j].name, 1, l, fp[i]);
mm_err_fread(&mi->seq[j].len, 4, 1, fp[i]);
}
}
*mi_ = mi;
return fp;
return mi;
}