diff --git a/index.c b/index.c index 05420d8..d842ec4 100644 --- a/index.c +++ b/index.c @@ -585,3 +585,110 @@ int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases { return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq); } + +#include +#include +#include "ksort.h" +#include "kseq.h" +KSTREAM_DECLARE(gzFile, gzread) + +typedef struct { + int32_t st, en, max; // max is not used for now + int32_t score:30, strand:2; +} mm_idx_intv1_t; + +typedef struct mm_idx_intv_s { + int32_t n, m; + mm_idx_intv1_t *a; +} mm_idx_intv_t; + +#define sort_key_bed(a) ((a).st) +KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4) + +mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn) +{ + gzFile fp; + kstream_t *ks; + kstring_t str = {0,0,0}; + mm_idx_intv_t *I; + + fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) return 0; + I = (mm_idx_intv_t*)calloc(mi->n_seq, sizeof(*I)); + ks = ks_init(fp); + while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) { + mm_idx_intv_t *r; + mm_idx_intv1_t t = {-1,-1,-1,-1,0}; + char *p, *q; + int32_t i, id = -1; + for (p = q = str.s, i = 0;; ++p) { + if (*p == 0 || isspace(*p)) { + int32_t c = *p; + *p = 0; + if (i == 0) { // chr + id = mm_idx_name2id(mi, q); + if (id < 0) break; // unknown name; TODO: throw a warning + } else if (i == 1) { // start + t.st = atol(q); // TODO: watch out integer overflow! + if (t.st < 0) break; + } else if (i == 2) { // end + t.en = atol(q); + if (t.en < 0) break; + } else if (i == 3) { // name; do nothing + } else if (i == 4) { // BED score + t.score = atol(q); + } else if (i == 5) { // strand + t.strand = *q == '+'? 1 : *q == '-'? -1 : 0; + } else break; + if (c == 0) break; + ++i, q = p + 1; + } + } + if (id < 0 || t.st < 0 || t.st >= t.en) continue; + r = &I[id]; + if (r->n == r->m) { + r->m = r->m? r->m + (r->m>>1) : 16; + r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a)); + } + r->a[r->n++] = t; + } + ks_destroy(ks); + gzclose(fp); + return I; +} + +int mm_idx_bed_read(mm_idx_t *mi, const char *fn) +{ + int32_t i; + if (mi->h == 0) mm_idx_index_name(mi); + mi->I = mm_idx_read_bed(mi, fn); + if (mi->I == 0) return -1; + for (i = 0; i < mi->n_seq; ++i) + radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n); + return mi->I? 0 : -1; +} + +int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, int32_t strand, int8_t *s) +{ + int32_t i, left, right; + mm_idx_intv_t *r; + memset(s, 0, en - st); + if (mi->I == 0 || ctg < 0 || ctg >= mi->n_seq) return -1; + r = &mi->I[ctg]; + left = 0, right = r->n; + while (right > left) { + int32_t mid = left + ((right - left) >> 1); + if (r->a[mid].st >= st) right = mid; + else left = mid + 1; + } + for (i = left; i < r->n; ++i) { + if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0 && strand * r->a[i].strand >= 0) { + if (r->a[i].strand > 0) { + s[r->a[i].st] |= 1, s[r->a[i].en - 1] |= 2; + } else { + s[r->a[i].st] |= 2, s[r->a[i].en - 1] |= 1; + } + } + } + return left; +} diff --git a/kseq.h b/kseq.h index d301ddc..8021e56 100644 --- a/kseq.h +++ b/kseq.h @@ -37,6 +37,14 @@ #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) #define KS_SEP_MAX 2 +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ int begin, end; \ @@ -64,7 +72,7 @@ } #define __KS_INLINED(__read) \ - static inline int ks_getc(kstream_t *ks) \ + static inline klib_unused int ks_getc(kstream_t *ks) \ { \ if (ks->is_eof && ks->begin >= ks->end) return -1; \ if (ks->begin >= ks->end) { \ diff --git a/main.c b/main.c index aa0cf91..f7b529b 100644 --- a/main.c +++ b/main.c @@ -63,6 +63,8 @@ static ko_longopt_t long_options[] = { { "cap-sw-mem", ko_required_argument, 337 }, { "max-qlen", ko_required_argument, 338 }, { "max-chain-iter", ko_required_argument, 339 }, + { "junc-bed", ko_required_argument, 340 }, + { "junc-bonus", ko_required_argument, 341 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -105,7 +107,7 @@ int main(int argc, char *argv[]) mm_mapopt_t opt; mm_idxopt_t ipt; int i, c, n_threads = 3, n_parts, old_best_n = -1; - char *fnw = 0, *rg = 0, *s; + char *fnw = 0, *rg = 0, *junc_bed = 0, *s; FILE *fp_help = stderr; mm_idx_reader_t *idx_rdr; mm_idx_t *mi; @@ -204,6 +206,7 @@ int main(int argc, char *argv[]) else if (c == 336) opt.flag |= MM_F_HARD_MLEVEL; // --hard-mask-level else if (c == 337) opt.max_sw_mat = mm_parse_num(o.arg); // --cap-sw-mat else if (c == 338) opt.max_qlen = mm_parse_num(o.arg); // --max-qlen + else if (c == 339) junc_bed = o.arg; // --junc-bed else if (c == 314) { // --frag yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1); } else if (c == 315) { // --secondary @@ -343,6 +346,7 @@ int main(int argc, char *argv[]) if (opt.best_n == 0 && (opt.flag&MM_F_CIGAR) && mm_verbose >= 2) fprintf(stderr, "[WARNING]\033[1;31m `-N 0' reduces alignment accuracy. Please use --secondary=no to suppress secondary alignments.\033[0m\n"); while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) { + if (junc_bed) mm_idx_bed_read(mi, junc_bed); if ((opt.flag & MM_F_CIGAR) && (mi->flag & MM_I_NO_SEQ)) { fprintf(stderr, "[ERROR] the prebuilt index doesn't contain sequences.\n"); mm_idx_destroy(mi); diff --git a/minimap.h b/minimap.h index 176342b..947efe1 100644 --- a/minimap.h +++ b/minimap.h @@ -66,6 +66,7 @@ typedef struct { mm_idx_seq_t *seq; // sequence name, length and offset uint32_t *S; // 4-bit packed sequence struct mm_idx_bucket_s *B; // index (hidden) + struct mm_idx_intv_s *I; // intervals (hidden) void *km, *h; } mm_idx_t; @@ -365,6 +366,9 @@ int mm_idx_index_name(mm_idx_t *mi); int mm_idx_name2id(const mm_idx_t *mi, const char *name); int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); +int mm_idx_bed_read(mm_idx_t *mi, const char *fn); +int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, int32_t strand, int8_t *s); + // deprecated APIs for backward compatibility void mm_mapopt_init(mm_mapopt_t *opt); mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);