dp chaining mostly works, but fails sometimes

which means there are bugs that need to be fixed
This commit is contained in:
Heng Li 2017-06-06 14:19:50 -04:00
parent 1a9fc04cf0
commit 6d4348db44
6 changed files with 60 additions and 25 deletions

29
chain.c
View File

@ -19,11 +19,13 @@ static inline int ilog2_32(uint32_t v)
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
}
int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, int32_t **_v, void *km)
int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, void *km)
{
int32_t st = 0, i, j, k, *p, *f, *t, *idx, *v, n_u, n_v;
int32_t st = 0, i, j, k, *p, *f, *t, *v, n_u, n_v;
uint64_t *u;
mm128_t *b;
if (_u) *_u = 0;
f = (int32_t*)kmalloc(km, n * 4);
p = (int32_t*)kmalloc(km, n * 4);
t = (int32_t*)kmalloc(km, n * 4);
@ -54,12 +56,6 @@ int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, i
else f[i] = match_len, p[i] = -1;
}
// free the a[] array
for (i = 0; i < n; ++i) t[i] = a[i].y>>32;
kfree(km, a);
idx = (int32_t*)kmalloc(km, n * 4);
for (i = 0; i < n; ++i) idx[i] = t[i];
// find the ending positions of chains
memset(t, 0, n * 4);
for (i = 0; i < n; ++i)
@ -67,6 +63,7 @@ int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, i
for (i = n_u = 0; i < n; ++i)
if (t[i] == 0 && f[i] >= min_sc)
++n_u;
if (n_u == 0) return 0;
u = (uint64_t*)kmalloc(km, n_u * 8);
for (i = n_u = 0; i < n; ++i)
if (t[i] == 0 && f[i] >= min_sc)
@ -80,22 +77,30 @@ int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, i
int32_t n_v0 = n_v;
j = (int32_t)u[i];
do {
v[n_v++] = idx[j];
v[n_v++] = j;
t[j] = 1;
j = p[j];
} while (j >= 0 && t[j] == 0);
if (j < 0) u[k++] = u[i]>>32<<32 | (n_v - n_v0);
else if ((u[i]>>32) - f[j] >= min_sc) u[k++] = ((u[i]>>32) - f[j]) << 32 | (n_v - n_v0);
else n_v0 = n_v;
for (j = 0; j < (n_v - n_v0) >> 1; ++j) {
for (j = 0; j < (n_v - n_v0) >> 1; ++j) { // reverse v[] such that the smallest index comes the first
int32_t t = v[n_v0 + j];
v[n_v0 + j] = v[n_v - 1 - j];
v[n_v - 1 - j] = t;
}
}
n_u = k, *_u = u, *_v = v;
n_u = k, *_u = u;
// free
kfree(km, f); kfree(km, p); kfree(km, t); kfree(km, idx);
kfree(km, f); kfree(km, p); kfree(km, t);
// write the result to _a_
b = kmalloc(km, n_v * sizeof(mm128_t));
for (i = 0, k = 0; i < n_u; ++i)
for (j = 0; j < (int32_t)u[i]; ++j)
b[k] = a[v[k]], ++k;
memcpy(a, b, n_v * sizeof(mm128_t));
kfree(km, b);
return n_u;
}

View File

@ -299,8 +299,6 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads)
* index I/O *
*************/
#define MM_IDX_MAGIC "MMI\2"
void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
{
uint64_t sum_len = 0;

31
main.c
View File

@ -4,6 +4,7 @@
#include <string.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <fcntl.h>
#include "bseq.h"
#include "minimap.h"
#include "mmpriv.h"
@ -20,10 +21,29 @@ void liftrlimit()
#endif
}
static int test_idx(const char *fn)
{
int fd, is_idx = 0;
off_t ret;
char magic[4];
if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index
fd = open(fn, O_RDONLY);
if (fd < 0) return -1; // error
if ((ret = lseek(fd, 0, SEEK_END)) >= 4) {
lseek(fd, 0, SEEK_SET);
read(fd, magic, 4);
if (strncmp(magic, MM_IDX_MAGIC, 4) == 0)
is_idx = 1;
}
close(fd);
return is_idx;
}
int main(int argc, char *argv[])
{
mm_mapopt_t opt;
int i, c, k = 17, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0, is_hpc = 0;
int i, c, k = 17, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0;
int mini_batch_size = 100000000;
uint64_t batch_size = 4000000000ULL;
bseq_file_t *fp = 0;
@ -34,12 +54,11 @@ int main(int argc, char *argv[])
mm_realtime0 = realtime();
mm_mapopt_init(&opt);
while ((c = getopt(argc, argv, "w:k:B:b:t:r:f:Vv:NOg:I:d:lST:s:Dx:H")) >= 0) {
while ((c = getopt(argc, argv, "w:k:B:b:t:r:f:Vv:NOg:I:d:ST:s:Dx:H")) >= 0) {
if (c == 'w') w = atoi(optarg);
else if (c == 'k') k = atoi(optarg);
else if (c == 'b') b = atoi(optarg);
else if (c == 'H') is_hpc = 1;
else if (c == 'l') is_idx = 1;
else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I
else if (c == 'r') opt.bw = atoi(optarg);
else if (c == 'f') opt.mid_occ_frac = atof(optarg);
@ -83,7 +102,6 @@ int main(int argc, char *argv[])
fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n");
fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n");
fprintf(stderr, " -d FILE dump index to FILE []\n");
fprintf(stderr, " -l the 1st argument is a index file (overriding -k, -w and -I)\n");
// fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users wouldn't care about this
fprintf(stderr, " Mapping:\n");
fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", opt.mid_occ_frac);
@ -106,6 +124,11 @@ int main(int argc, char *argv[])
return 1;
}
is_idx = test_idx(argv[optind]);
if (is_idx < 0) {
fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]);
return 1;
}
if (is_idx) fpr = fopen(argv[optind], "rb");
else fp = bseq_open(argv[optind]);
if (fnw) fpw = fopen(fnw, "wb");

19
map.c
View File

@ -155,8 +155,7 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2)
void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen)
{
int i, n = m_en - m_st, last = -1, last2 = -1, j, n_a, n_u;
int32_t *v;
int i, n = m_en - m_st, j, n_a, n_u;
uint64_t *u;
mm_match_t *m;
mm128_t *a;
@ -178,6 +177,7 @@ void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint3
}
if (mm_verbose >= 5) printf("\n");
#if 0
int last = -1, last2 = -1;
// pair k-mer thinning
for (i = 0; i < n; ++i) {
if (m[i].n >= opt->mid_occ && m[i].n < opt->max_occ) {
@ -214,10 +214,10 @@ void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint3
p = &a[j++];
if ((r[k]&1) == (q->qpos&1)) { // forward strand
p->x = (r[k]&0xffffffff00000000ULL) | (uint32_t)r[k]>>1;
p->y = (uint64_t)i << 32 | q->qpos >> 1;
p->y = (uint64_t)q_span << 32 | q->qpos >> 1;
} else { // reverse strand
p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | (uint32_t)r[k]>>1;
p->y = (uint64_t)i << 32 | (qlen - ((q->qpos>>1) + 1 - q_span) - 1);
p->y = (uint64_t)q_span << 32 | (qlen - ((q->qpos>>1) + 1 - q_span) - 1);
}
}
}
@ -227,11 +227,18 @@ void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint3
kfree(b->km_fixed, m);
//for (i = 0; i < n_a; ++i) printf("%c\t%s\t%d\t%d\n", "+-"[a[i].x>>63], mi->seq[a[i].x<<1>>33].name, (uint32_t)a[i].x>>1, (uint32_t)a[i].y);
n_u = mm_chain_dp(mi->k, opt->max_gap, opt->bw, opt->max_skip, opt->min_score, n_a, a, &u, &v, b->km_fixed);
n_u = mm_chain_dp(mi->k, opt->max_gap, opt->bw, opt->max_skip, opt->min_score, n_a, a, &u, b->km_fixed);
printf("%s\t%d", qname, n_u);
for (i = j = 0; i < n_u; ++i) {
int n = (uint32_t)u[i];
printf("\t%d@%s:%d-%d", (uint32_t)(u[i]>>32), mi->seq[a[j].x<<1>>33].name, (uint32_t)a[j].x, (uint32_t)a[j+n-1].x);
j += n;
}
printf("\n");
// free
kfree(b->km_fixed, a);
kfree(b->km_fixed, u);
kfree(b->km_fixed, v);
}
const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)

View File

@ -12,6 +12,8 @@
#define MM_F_NO_ISO 0x4
#define MM_F_AVA 0x8
#define MM_IDX_MAGIC "MMI\2"
typedef struct {
uint64_t x, y;
} mm128_t;

View File

@ -14,7 +14,7 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
void radix_sort_64(uint64_t *beg, uint64_t *end);
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, int32_t **_v, void *km);
int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, void *km);
#ifdef __cplusplus
}