dp chaining mostly works, but fails sometimes
which means there are bugs that need to be fixed
This commit is contained in:
parent
1a9fc04cf0
commit
6d4348db44
29
chain.c
29
chain.c
|
|
@ -19,11 +19,13 @@ static inline int ilog2_32(uint32_t v)
|
|||
return (t = v>>8) ? 8 + LogTable256[t] : LogTable256[v];
|
||||
}
|
||||
|
||||
int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, int32_t **_v, void *km)
|
||||
int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, void *km)
|
||||
{
|
||||
int32_t st = 0, i, j, k, *p, *f, *t, *idx, *v, n_u, n_v;
|
||||
int32_t st = 0, i, j, k, *p, *f, *t, *v, n_u, n_v;
|
||||
uint64_t *u;
|
||||
mm128_t *b;
|
||||
|
||||
if (_u) *_u = 0;
|
||||
f = (int32_t*)kmalloc(km, n * 4);
|
||||
p = (int32_t*)kmalloc(km, n * 4);
|
||||
t = (int32_t*)kmalloc(km, n * 4);
|
||||
|
|
@ -54,12 +56,6 @@ int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, i
|
|||
else f[i] = match_len, p[i] = -1;
|
||||
}
|
||||
|
||||
// free the a[] array
|
||||
for (i = 0; i < n; ++i) t[i] = a[i].y>>32;
|
||||
kfree(km, a);
|
||||
idx = (int32_t*)kmalloc(km, n * 4);
|
||||
for (i = 0; i < n; ++i) idx[i] = t[i];
|
||||
|
||||
// find the ending positions of chains
|
||||
memset(t, 0, n * 4);
|
||||
for (i = 0; i < n; ++i)
|
||||
|
|
@ -67,6 +63,7 @@ int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, i
|
|||
for (i = n_u = 0; i < n; ++i)
|
||||
if (t[i] == 0 && f[i] >= min_sc)
|
||||
++n_u;
|
||||
if (n_u == 0) return 0;
|
||||
u = (uint64_t*)kmalloc(km, n_u * 8);
|
||||
for (i = n_u = 0; i < n; ++i)
|
||||
if (t[i] == 0 && f[i] >= min_sc)
|
||||
|
|
@ -80,22 +77,30 @@ int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, i
|
|||
int32_t n_v0 = n_v;
|
||||
j = (int32_t)u[i];
|
||||
do {
|
||||
v[n_v++] = idx[j];
|
||||
v[n_v++] = j;
|
||||
t[j] = 1;
|
||||
j = p[j];
|
||||
} while (j >= 0 && t[j] == 0);
|
||||
if (j < 0) u[k++] = u[i]>>32<<32 | (n_v - n_v0);
|
||||
else if ((u[i]>>32) - f[j] >= min_sc) u[k++] = ((u[i]>>32) - f[j]) << 32 | (n_v - n_v0);
|
||||
else n_v0 = n_v;
|
||||
for (j = 0; j < (n_v - n_v0) >> 1; ++j) {
|
||||
for (j = 0; j < (n_v - n_v0) >> 1; ++j) { // reverse v[] such that the smallest index comes the first
|
||||
int32_t t = v[n_v0 + j];
|
||||
v[n_v0 + j] = v[n_v - 1 - j];
|
||||
v[n_v - 1 - j] = t;
|
||||
}
|
||||
}
|
||||
n_u = k, *_u = u, *_v = v;
|
||||
n_u = k, *_u = u;
|
||||
|
||||
// free
|
||||
kfree(km, f); kfree(km, p); kfree(km, t); kfree(km, idx);
|
||||
kfree(km, f); kfree(km, p); kfree(km, t);
|
||||
|
||||
// write the result to _a_
|
||||
b = kmalloc(km, n_v * sizeof(mm128_t));
|
||||
for (i = 0, k = 0; i < n_u; ++i)
|
||||
for (j = 0; j < (int32_t)u[i]; ++j)
|
||||
b[k] = a[v[k]], ++k;
|
||||
memcpy(a, b, n_v * sizeof(mm128_t));
|
||||
kfree(km, b);
|
||||
return n_u;
|
||||
}
|
||||
|
|
|
|||
2
index.c
2
index.c
|
|
@ -299,8 +299,6 @@ mm_idx_t *mm_idx_build(const char *fn, int w, int k, int is_hpc, int n_threads)
|
|||
* index I/O *
|
||||
*************/
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
void mm_idx_dump(FILE *fp, const mm_idx_t *mi)
|
||||
{
|
||||
uint64_t sum_len = 0;
|
||||
|
|
|
|||
31
main.c
31
main.c
|
|
@ -4,6 +4,7 @@
|
|||
#include <string.h>
|
||||
#include <sys/resource.h>
|
||||
#include <sys/time.h>
|
||||
#include <fcntl.h>
|
||||
#include "bseq.h"
|
||||
#include "minimap.h"
|
||||
#include "mmpriv.h"
|
||||
|
|
@ -20,10 +21,29 @@ void liftrlimit()
|
|||
#endif
|
||||
}
|
||||
|
||||
static int test_idx(const char *fn)
|
||||
{
|
||||
int fd, is_idx = 0;
|
||||
off_t ret;
|
||||
char magic[4];
|
||||
|
||||
if (strcmp(fn, "-") == 0) return 0; // read from pipe; not an index
|
||||
fd = open(fn, O_RDONLY);
|
||||
if (fd < 0) return -1; // error
|
||||
if ((ret = lseek(fd, 0, SEEK_END)) >= 4) {
|
||||
lseek(fd, 0, SEEK_SET);
|
||||
read(fd, magic, 4);
|
||||
if (strncmp(magic, MM_IDX_MAGIC, 4) == 0)
|
||||
is_idx = 1;
|
||||
}
|
||||
close(fd);
|
||||
return is_idx;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
mm_mapopt_t opt;
|
||||
int i, c, k = 17, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0, is_hpc = 0;
|
||||
int i, c, k = 17, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx, is_hpc = 0;
|
||||
int mini_batch_size = 100000000;
|
||||
uint64_t batch_size = 4000000000ULL;
|
||||
bseq_file_t *fp = 0;
|
||||
|
|
@ -34,12 +54,11 @@ int main(int argc, char *argv[])
|
|||
mm_realtime0 = realtime();
|
||||
mm_mapopt_init(&opt);
|
||||
|
||||
while ((c = getopt(argc, argv, "w:k:B:b:t:r:f:Vv:NOg:I:d:lST:s:Dx:H")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "w:k:B:b:t:r:f:Vv:NOg:I:d:ST:s:Dx:H")) >= 0) {
|
||||
if (c == 'w') w = atoi(optarg);
|
||||
else if (c == 'k') k = atoi(optarg);
|
||||
else if (c == 'b') b = atoi(optarg);
|
||||
else if (c == 'H') is_hpc = 1;
|
||||
else if (c == 'l') is_idx = 1;
|
||||
else if (c == 'd') fnw = optarg; // the above are indexing related options, except -I
|
||||
else if (c == 'r') opt.bw = atoi(optarg);
|
||||
else if (c == 'f') opt.mid_occ_frac = atof(optarg);
|
||||
|
|
@ -83,7 +102,6 @@ int main(int argc, char *argv[])
|
|||
fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n");
|
||||
fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n");
|
||||
fprintf(stderr, " -d FILE dump index to FILE []\n");
|
||||
fprintf(stderr, " -l the 1st argument is a index file (overriding -k, -w and -I)\n");
|
||||
// fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users wouldn't care about this
|
||||
fprintf(stderr, " Mapping:\n");
|
||||
fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", opt.mid_occ_frac);
|
||||
|
|
@ -106,6 +124,11 @@ int main(int argc, char *argv[])
|
|||
return 1;
|
||||
}
|
||||
|
||||
is_idx = test_idx(argv[optind]);
|
||||
if (is_idx < 0) {
|
||||
fprintf(stderr, "[E::%s] failed to open file '%s'\n", __func__, argv[optind]);
|
||||
return 1;
|
||||
}
|
||||
if (is_idx) fpr = fopen(argv[optind], "rb");
|
||||
else fp = bseq_open(argv[optind]);
|
||||
if (fnw) fpw = fopen(fnw, "wb");
|
||||
|
|
|
|||
19
map.c
19
map.c
|
|
@ -155,8 +155,7 @@ void mm_pair_thin(mm_tbuf_t *b, int radius, mm_match_t *m1, mm_match_t *m2)
|
|||
|
||||
void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint32_t m_st, uint32_t m_en, const char *qname, int qlen)
|
||||
{
|
||||
int i, n = m_en - m_st, last = -1, last2 = -1, j, n_a, n_u;
|
||||
int32_t *v;
|
||||
int i, n = m_en - m_st, j, n_a, n_u;
|
||||
uint64_t *u;
|
||||
mm_match_t *m;
|
||||
mm128_t *a;
|
||||
|
|
@ -178,6 +177,7 @@ void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint3
|
|||
}
|
||||
if (mm_verbose >= 5) printf("\n");
|
||||
#if 0
|
||||
int last = -1, last2 = -1;
|
||||
// pair k-mer thinning
|
||||
for (i = 0; i < n; ++i) {
|
||||
if (m[i].n >= opt->mid_occ && m[i].n < opt->max_occ) {
|
||||
|
|
@ -214,10 +214,10 @@ void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint3
|
|||
p = &a[j++];
|
||||
if ((r[k]&1) == (q->qpos&1)) { // forward strand
|
||||
p->x = (r[k]&0xffffffff00000000ULL) | (uint32_t)r[k]>>1;
|
||||
p->y = (uint64_t)i << 32 | q->qpos >> 1;
|
||||
p->y = (uint64_t)q_span << 32 | q->qpos >> 1;
|
||||
} else { // reverse strand
|
||||
p->x = 1ULL<<63 | (r[k]&0xffffffff00000000ULL) | (uint32_t)r[k]>>1;
|
||||
p->y = (uint64_t)i << 32 | (qlen - ((q->qpos>>1) + 1 - q_span) - 1);
|
||||
p->y = (uint64_t)q_span << 32 | (qlen - ((q->qpos>>1) + 1 - q_span) - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -227,11 +227,18 @@ void mm_map_frag(const mm_mapopt_t *opt, const mm_idx_t *mi, mm_tbuf_t *b, uint3
|
|||
kfree(b->km_fixed, m);
|
||||
//for (i = 0; i < n_a; ++i) printf("%c\t%s\t%d\t%d\n", "+-"[a[i].x>>63], mi->seq[a[i].x<<1>>33].name, (uint32_t)a[i].x>>1, (uint32_t)a[i].y);
|
||||
|
||||
n_u = mm_chain_dp(mi->k, opt->max_gap, opt->bw, opt->max_skip, opt->min_score, n_a, a, &u, &v, b->km_fixed);
|
||||
n_u = mm_chain_dp(mi->k, opt->max_gap, opt->bw, opt->max_skip, opt->min_score, n_a, a, &u, b->km_fixed);
|
||||
printf("%s\t%d", qname, n_u);
|
||||
for (i = j = 0; i < n_u; ++i) {
|
||||
int n = (uint32_t)u[i];
|
||||
printf("\t%d@%s:%d-%d", (uint32_t)(u[i]>>32), mi->seq[a[j].x<<1>>33].name, (uint32_t)a[j].x, (uint32_t)a[j+n-1].x);
|
||||
j += n;
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
// free
|
||||
kfree(b->km_fixed, a);
|
||||
kfree(b->km_fixed, u);
|
||||
kfree(b->km_fixed, v);
|
||||
}
|
||||
|
||||
const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@
|
|||
#define MM_F_NO_ISO 0x4
|
||||
#define MM_F_AVA 0x8
|
||||
|
||||
#define MM_IDX_MAGIC "MMI\2"
|
||||
|
||||
typedef struct {
|
||||
uint64_t x, y;
|
||||
} mm128_t;
|
||||
|
|
|
|||
2
mmpriv.h
2
mmpriv.h
|
|
@ -14,7 +14,7 @@ void radix_sort_128x(mm128_t *beg, mm128_t *end);
|
|||
void radix_sort_64(uint64_t *beg, uint64_t *end);
|
||||
uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk);
|
||||
|
||||
int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, int32_t **_v, void *km);
|
||||
int mm_chain_dp(int match_len, int max_dist, int bw, int max_skip, int min_sc, int n, mm128_t *a, uint64_t **_u, void *km);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue