BED I/O
This commit is contained in:
parent
371bc9513a
commit
6420acca6d
107
index.c
107
index.c
|
|
@ -585,3 +585,110 @@ int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases
|
|||
{
|
||||
return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq);
|
||||
}
|
||||
|
||||
#include <ctype.h>
|
||||
#include <zlib.h>
|
||||
#include "ksort.h"
|
||||
#include "kseq.h"
|
||||
KSTREAM_DECLARE(gzFile, gzread)
|
||||
|
||||
typedef struct {
|
||||
int32_t st, en, max; // max is not used for now
|
||||
int32_t score:30, strand:2;
|
||||
} mm_idx_intv1_t;
|
||||
|
||||
typedef struct mm_idx_intv_s {
|
||||
int32_t n, m;
|
||||
mm_idx_intv1_t *a;
|
||||
} mm_idx_intv_t;
|
||||
|
||||
#define sort_key_bed(a) ((a).st)
|
||||
KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
|
||||
|
||||
mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn)
|
||||
{
|
||||
gzFile fp;
|
||||
kstream_t *ks;
|
||||
kstring_t str = {0,0,0};
|
||||
mm_idx_intv_t *I;
|
||||
|
||||
fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
|
||||
if (fp == 0) return 0;
|
||||
I = (mm_idx_intv_t*)calloc(mi->n_seq, sizeof(*I));
|
||||
ks = ks_init(fp);
|
||||
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
|
||||
mm_idx_intv_t *r;
|
||||
mm_idx_intv1_t t = {-1,-1,-1,-1,0};
|
||||
char *p, *q;
|
||||
int32_t i, id = -1;
|
||||
for (p = q = str.s, i = 0;; ++p) {
|
||||
if (*p == 0 || isspace(*p)) {
|
||||
int32_t c = *p;
|
||||
*p = 0;
|
||||
if (i == 0) { // chr
|
||||
id = mm_idx_name2id(mi, q);
|
||||
if (id < 0) break; // unknown name; TODO: throw a warning
|
||||
} else if (i == 1) { // start
|
||||
t.st = atol(q); // TODO: watch out integer overflow!
|
||||
if (t.st < 0) break;
|
||||
} else if (i == 2) { // end
|
||||
t.en = atol(q);
|
||||
if (t.en < 0) break;
|
||||
} else if (i == 3) { // name; do nothing
|
||||
} else if (i == 4) { // BED score
|
||||
t.score = atol(q);
|
||||
} else if (i == 5) { // strand
|
||||
t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
|
||||
} else break;
|
||||
if (c == 0) break;
|
||||
++i, q = p + 1;
|
||||
}
|
||||
}
|
||||
if (id < 0 || t.st < 0 || t.st >= t.en) continue;
|
||||
r = &I[id];
|
||||
if (r->n == r->m) {
|
||||
r->m = r->m? r->m + (r->m>>1) : 16;
|
||||
r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a));
|
||||
}
|
||||
r->a[r->n++] = t;
|
||||
}
|
||||
ks_destroy(ks);
|
||||
gzclose(fp);
|
||||
return I;
|
||||
}
|
||||
|
||||
int mm_idx_bed_read(mm_idx_t *mi, const char *fn)
|
||||
{
|
||||
int32_t i;
|
||||
if (mi->h == 0) mm_idx_index_name(mi);
|
||||
mi->I = mm_idx_read_bed(mi, fn);
|
||||
if (mi->I == 0) return -1;
|
||||
for (i = 0; i < mi->n_seq; ++i)
|
||||
radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);
|
||||
return mi->I? 0 : -1;
|
||||
}
|
||||
|
||||
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, int32_t strand, int8_t *s)
|
||||
{
|
||||
int32_t i, left, right;
|
||||
mm_idx_intv_t *r;
|
||||
memset(s, 0, en - st);
|
||||
if (mi->I == 0 || ctg < 0 || ctg >= mi->n_seq) return -1;
|
||||
r = &mi->I[ctg];
|
||||
left = 0, right = r->n;
|
||||
while (right > left) {
|
||||
int32_t mid = left + ((right - left) >> 1);
|
||||
if (r->a[mid].st >= st) right = mid;
|
||||
else left = mid + 1;
|
||||
}
|
||||
for (i = left; i < r->n; ++i) {
|
||||
if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0 && strand * r->a[i].strand >= 0) {
|
||||
if (r->a[i].strand > 0) {
|
||||
s[r->a[i].st] |= 1, s[r->a[i].en - 1] |= 2;
|
||||
} else {
|
||||
s[r->a[i].st] |= 2, s[r->a[i].en - 1] |= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return left;
|
||||
}
|
||||
|
|
|
|||
10
kseq.h
10
kseq.h
|
|
@ -37,6 +37,14 @@
|
|||
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
|
||||
#define KS_SEP_MAX 2
|
||||
|
||||
#ifndef klib_unused
|
||||
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
|
||||
#define klib_unused __attribute__ ((__unused__))
|
||||
#else
|
||||
#define klib_unused
|
||||
#endif
|
||||
#endif /* klib_unused */
|
||||
|
||||
#define __KS_TYPE(type_t) \
|
||||
typedef struct __kstream_t { \
|
||||
int begin, end; \
|
||||
|
|
@ -64,7 +72,7 @@
|
|||
}
|
||||
|
||||
#define __KS_INLINED(__read) \
|
||||
static inline int ks_getc(kstream_t *ks) \
|
||||
static inline klib_unused int ks_getc(kstream_t *ks) \
|
||||
{ \
|
||||
if (ks->is_eof && ks->begin >= ks->end) return -1; \
|
||||
if (ks->begin >= ks->end) { \
|
||||
|
|
|
|||
6
main.c
6
main.c
|
|
@ -63,6 +63,8 @@ static ko_longopt_t long_options[] = {
|
|||
{ "cap-sw-mem", ko_required_argument, 337 },
|
||||
{ "max-qlen", ko_required_argument, 338 },
|
||||
{ "max-chain-iter", ko_required_argument, 339 },
|
||||
{ "junc-bed", ko_required_argument, 340 },
|
||||
{ "junc-bonus", ko_required_argument, 341 },
|
||||
{ "help", ko_no_argument, 'h' },
|
||||
{ "max-intron-len", ko_required_argument, 'G' },
|
||||
{ "version", ko_no_argument, 'V' },
|
||||
|
|
@ -105,7 +107,7 @@ int main(int argc, char *argv[])
|
|||
mm_mapopt_t opt;
|
||||
mm_idxopt_t ipt;
|
||||
int i, c, n_threads = 3, n_parts, old_best_n = -1;
|
||||
char *fnw = 0, *rg = 0, *s;
|
||||
char *fnw = 0, *rg = 0, *junc_bed = 0, *s;
|
||||
FILE *fp_help = stderr;
|
||||
mm_idx_reader_t *idx_rdr;
|
||||
mm_idx_t *mi;
|
||||
|
|
@ -204,6 +206,7 @@ int main(int argc, char *argv[])
|
|||
else if (c == 336) opt.flag |= MM_F_HARD_MLEVEL; // --hard-mask-level
|
||||
else if (c == 337) opt.max_sw_mat = mm_parse_num(o.arg); // --cap-sw-mat
|
||||
else if (c == 338) opt.max_qlen = mm_parse_num(o.arg); // --max-qlen
|
||||
else if (c == 339) junc_bed = o.arg; // --junc-bed
|
||||
else if (c == 314) { // --frag
|
||||
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
|
||||
} else if (c == 315) { // --secondary
|
||||
|
|
@ -343,6 +346,7 @@ int main(int argc, char *argv[])
|
|||
if (opt.best_n == 0 && (opt.flag&MM_F_CIGAR) && mm_verbose >= 2)
|
||||
fprintf(stderr, "[WARNING]\033[1;31m `-N 0' reduces alignment accuracy. Please use --secondary=no to suppress secondary alignments.\033[0m\n");
|
||||
while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) {
|
||||
if (junc_bed) mm_idx_bed_read(mi, junc_bed);
|
||||
if ((opt.flag & MM_F_CIGAR) && (mi->flag & MM_I_NO_SEQ)) {
|
||||
fprintf(stderr, "[ERROR] the prebuilt index doesn't contain sequences.\n");
|
||||
mm_idx_destroy(mi);
|
||||
|
|
|
|||
|
|
@ -66,6 +66,7 @@ typedef struct {
|
|||
mm_idx_seq_t *seq; // sequence name, length and offset
|
||||
uint32_t *S; // 4-bit packed sequence
|
||||
struct mm_idx_bucket_s *B; // index (hidden)
|
||||
struct mm_idx_intv_s *I; // intervals (hidden)
|
||||
void *km, *h;
|
||||
} mm_idx_t;
|
||||
|
||||
|
|
@ -365,6 +366,9 @@ int mm_idx_index_name(mm_idx_t *mi);
|
|||
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
|
||||
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
|
||||
|
||||
int mm_idx_bed_read(mm_idx_t *mi, const char *fn);
|
||||
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, int32_t strand, int8_t *s);
|
||||
|
||||
// deprecated APIs for backward compatibility
|
||||
void mm_mapopt_init(mm_mapopt_t *opt);
|
||||
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
|
||||
|
|
|
|||
Loading…
Reference in New Issue