This commit is contained in:
Heng Li 2019-04-27 21:50:02 -04:00
parent 371bc9513a
commit 6420acca6d
4 changed files with 125 additions and 2 deletions

107
index.c
View File

@ -585,3 +585,110 @@ int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases
{
return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq);
}
#include <ctype.h>
#include <zlib.h>
#include "ksort.h"
#include "kseq.h"
KSTREAM_DECLARE(gzFile, gzread)
typedef struct {
int32_t st, en, max; // max is not used for now
int32_t score:30, strand:2;
} mm_idx_intv1_t;
typedef struct mm_idx_intv_s {
int32_t n, m;
mm_idx_intv1_t *a;
} mm_idx_intv_t;
#define sort_key_bed(a) ((a).st)
KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn)
{
gzFile fp;
kstream_t *ks;
kstring_t str = {0,0,0};
mm_idx_intv_t *I;
fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
if (fp == 0) return 0;
I = (mm_idx_intv_t*)calloc(mi->n_seq, sizeof(*I));
ks = ks_init(fp);
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
mm_idx_intv_t *r;
mm_idx_intv1_t t = {-1,-1,-1,-1,0};
char *p, *q;
int32_t i, id = -1;
for (p = q = str.s, i = 0;; ++p) {
if (*p == 0 || isspace(*p)) {
int32_t c = *p;
*p = 0;
if (i == 0) { // chr
id = mm_idx_name2id(mi, q);
if (id < 0) break; // unknown name; TODO: throw a warning
} else if (i == 1) { // start
t.st = atol(q); // TODO: watch out integer overflow!
if (t.st < 0) break;
} else if (i == 2) { // end
t.en = atol(q);
if (t.en < 0) break;
} else if (i == 3) { // name; do nothing
} else if (i == 4) { // BED score
t.score = atol(q);
} else if (i == 5) { // strand
t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
} else break;
if (c == 0) break;
++i, q = p + 1;
}
}
if (id < 0 || t.st < 0 || t.st >= t.en) continue;
r = &I[id];
if (r->n == r->m) {
r->m = r->m? r->m + (r->m>>1) : 16;
r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a));
}
r->a[r->n++] = t;
}
ks_destroy(ks);
gzclose(fp);
return I;
}
int mm_idx_bed_read(mm_idx_t *mi, const char *fn)
{
int32_t i;
if (mi->h == 0) mm_idx_index_name(mi);
mi->I = mm_idx_read_bed(mi, fn);
if (mi->I == 0) return -1;
for (i = 0; i < mi->n_seq; ++i)
radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);
return mi->I? 0 : -1;
}
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, int32_t strand, int8_t *s)
{
int32_t i, left, right;
mm_idx_intv_t *r;
memset(s, 0, en - st);
if (mi->I == 0 || ctg < 0 || ctg >= mi->n_seq) return -1;
r = &mi->I[ctg];
left = 0, right = r->n;
while (right > left) {
int32_t mid = left + ((right - left) >> 1);
if (r->a[mid].st >= st) right = mid;
else left = mid + 1;
}
for (i = left; i < r->n; ++i) {
if (st <= r->a[i].st && en >= r->a[i].en && r->a[i].strand != 0 && strand * r->a[i].strand >= 0) {
if (r->a[i].strand > 0) {
s[r->a[i].st] |= 1, s[r->a[i].en - 1] |= 2;
} else {
s[r->a[i].st] |= 2, s[r->a[i].en - 1] |= 1;
}
}
}
return left;
}

10
kseq.h
View File

@ -37,6 +37,14 @@
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX 2
#ifndef klib_unused
#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
#define klib_unused __attribute__ ((__unused__))
#else
#define klib_unused
#endif
#endif /* klib_unused */
#define __KS_TYPE(type_t) \
typedef struct __kstream_t { \
int begin, end; \
@ -64,7 +72,7 @@
}
#define __KS_INLINED(__read) \
static inline int ks_getc(kstream_t *ks) \
static inline klib_unused int ks_getc(kstream_t *ks) \
{ \
if (ks->is_eof && ks->begin >= ks->end) return -1; \
if (ks->begin >= ks->end) { \

6
main.c
View File

@ -63,6 +63,8 @@ static ko_longopt_t long_options[] = {
{ "cap-sw-mem", ko_required_argument, 337 },
{ "max-qlen", ko_required_argument, 338 },
{ "max-chain-iter", ko_required_argument, 339 },
{ "junc-bed", ko_required_argument, 340 },
{ "junc-bonus", ko_required_argument, 341 },
{ "help", ko_no_argument, 'h' },
{ "max-intron-len", ko_required_argument, 'G' },
{ "version", ko_no_argument, 'V' },
@ -105,7 +107,7 @@ int main(int argc, char *argv[])
mm_mapopt_t opt;
mm_idxopt_t ipt;
int i, c, n_threads = 3, n_parts, old_best_n = -1;
char *fnw = 0, *rg = 0, *s;
char *fnw = 0, *rg = 0, *junc_bed = 0, *s;
FILE *fp_help = stderr;
mm_idx_reader_t *idx_rdr;
mm_idx_t *mi;
@ -204,6 +206,7 @@ int main(int argc, char *argv[])
else if (c == 336) opt.flag |= MM_F_HARD_MLEVEL; // --hard-mask-level
else if (c == 337) opt.max_sw_mat = mm_parse_num(o.arg); // --cap-sw-mat
else if (c == 338) opt.max_qlen = mm_parse_num(o.arg); // --max-qlen
else if (c == 339) junc_bed = o.arg; // --junc-bed
else if (c == 314) { // --frag
yes_or_no(&opt, MM_F_FRAG_MODE, o.longidx, o.arg, 1);
} else if (c == 315) { // --secondary
@ -343,6 +346,7 @@ int main(int argc, char *argv[])
if (opt.best_n == 0 && (opt.flag&MM_F_CIGAR) && mm_verbose >= 2)
fprintf(stderr, "[WARNING]\033[1;31m `-N 0' reduces alignment accuracy. Please use --secondary=no to suppress secondary alignments.\033[0m\n");
while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) {
if (junc_bed) mm_idx_bed_read(mi, junc_bed);
if ((opt.flag & MM_F_CIGAR) && (mi->flag & MM_I_NO_SEQ)) {
fprintf(stderr, "[ERROR] the prebuilt index doesn't contain sequences.\n");
mm_idx_destroy(mi);

View File

@ -66,6 +66,7 @@ typedef struct {
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
struct mm_idx_bucket_s *B; // index (hidden)
struct mm_idx_intv_s *I; // intervals (hidden)
void *km, *h;
} mm_idx_t;
@ -365,6 +366,9 @@ int mm_idx_index_name(mm_idx_t *mi);
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
int mm_idx_bed_read(mm_idx_t *mi, const char *fn);
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, int32_t strand, int8_t *s);
// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);