From 49c6d83a8ec6cb0f37b7e5c55be70c9377c4eac3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 28 Apr 2019 20:12:28 -0400 Subject: [PATCH] r934: --junc-bed to read BED12 --- index.c | 48 +++++++++++++++++++++++++++++++++++++----------- main.c | 4 ++-- minimap.h | 2 +- minimap2.1 | 10 +++++----- 4 files changed, 45 insertions(+), 19 deletions(-) diff --git a/index.c b/index.c index 0ba6fde..164e8e5 100644 --- a/index.c +++ b/index.c @@ -610,7 +610,7 @@ KSTREAM_DECLARE(gzFile, gzread) #define sort_key_bed(a) ((a).st) KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4) -mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn) +mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc) { gzFile fp; kstream_t *ks; @@ -624,8 +624,8 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn) while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) { mm_idx_intv_t *r; mm_idx_intv1_t t = {-1,-1,-1,-1,0}; - char *p, *q; - int32_t i, id = -1; + char *p, *q, *bl, *bs; + int32_t i, id = -1, n_blk = 0; for (p = q = str.s, i = 0;; ++p) { if (*p == 0 || isspace(*p)) { int32_t c = *p; @@ -639,23 +639,49 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn) } else if (i == 2) { // end t.en = atol(q); if (t.en < 0) break; - } else if (i == 3) { // name; do nothing } else if (i == 4) { // BED score t.score = atol(q); } else if (i == 5) { // strand t.strand = *q == '+'? 1 : *q == '-'? -1 : 0; - } else break; + } else if (i == 9) { + if (!isdigit(*q)) break; + n_blk = atol(q); + } else if (i == 10) { + bl = q; + } else if (i == 11) { + bs = q; + break; + } if (c == 0) break; ++i, q = p + 1; } } if (id < 0 || t.st < 0 || t.st >= t.en) continue; r = &I[id]; - if (r->n == r->m) { - r->m = r->m? r->m + (r->m>>1) : 16; - r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m); + if (i >= 11 && read_junc) { // BED12 + int32_t st, sz, en; + st = strtol(bs, &bs, 10); ++bs; + sz = strtol(bl, &bl, 10); ++bl; + en = t.st + st + sz; + for (i = 1; i < n_blk; ++i) { + mm_idx_intv1_t s = t; + if (r->n == r->m) { + r->m = r->m? r->m + (r->m>>1) : 16; + r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m); + } + st = strtol(bs, &bs, 10); ++bs; + sz = strtol(bl, &bl, 10); ++bl; + s.st = en, s.en = t.st + st; + en = t.st + st + sz; + if (s.en > s.st) r->a[r->n++] = s; + } + } else { + if (r->n == r->m) { + r->m = r->m? r->m + (r->m>>1) : 16; + r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m); + } + r->a[r->n++] = t; } - r->a[r->n++] = t; } free(str.s); ks_destroy(ks); @@ -663,11 +689,11 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn) return I; } -int mm_idx_bed_read(mm_idx_t *mi, const char *fn) +int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc) { int32_t i; if (mi->h == 0) mm_idx_index_name(mi); - mi->I = mm_idx_read_bed(mi, fn); + mi->I = mm_idx_read_bed(mi, fn, read_junc); if (mi->I == 0) return -1; for (i = 0; i < mi->n_seq; ++i) // TODO: eliminate redundant intervals radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n); diff --git a/main.c b/main.c index bf179c0..9d11102 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.16-r933-dirty" +#define MM_VERSION "2.16-r934-dirty" #ifdef __linux__ #include @@ -366,7 +366,7 @@ int main(int argc, char *argv[]) __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq); if (argc != o.ind + 1) mm_mapopt_update(&opt, mi); if (mm_verbose >= 3) mm_idx_stat(mi); - if (junc_bed) mm_idx_bed_read(mi, junc_bed); + if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1); if (!(opt.flag & MM_F_FRAG_MODE)) { for (i = o.ind + 1; i < argc; ++i) mm_map_file(mi, argv[i], &opt, n_threads); diff --git a/minimap.h b/minimap.h index 358cad7..993b008 100644 --- a/minimap.h +++ b/minimap.h @@ -367,7 +367,7 @@ int mm_idx_index_name(mm_idx_t *mi); int mm_idx_name2id(const mm_idx_t *mi, const char *name); int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); -int mm_idx_bed_read(mm_idx_t *mi, const char *fn); +int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc); int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s); // deprecated APIs for backward compatibility diff --git a/minimap2.1 b/minimap2.1 index d13e3ee..99cab5c 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "28 Feburary 2019" "minimap2-2.16-dirty (r933)" "Bioinformatics tools" +.TH minimap2 1 "28 Feburary 2019" "minimap2-2.16-dirty (r934)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA sequences @@ -365,10 +365,10 @@ on SIRV data, please add to the command line. .TP .BR --junc-bed \ FILE -BED file consisting of annotated introns and their strands. With this option, -minimap2 prefers splicing in annotations. -.I FILE -can be generated with `paftools.js gff2bed -j ann.gtf' []. +Gene annotations in the BED12 format (aka 12-column BED), or intron positions +in 5-column BED. With this option, minimap2 prefers splicing in annotations. +BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf' +[]. .TP .BR --junc-bonus \ INT Score bonus for a splice donor or acceptor found in annotation (effective with