r934: --junc-bed to read BED12

This commit is contained in:
Heng Li 2019-04-28 20:12:28 -04:00
parent f64e426a5a
commit 49c6d83a8e
4 changed files with 45 additions and 19 deletions

48
index.c
View File

@ -610,7 +610,7 @@ KSTREAM_DECLARE(gzFile, gzread)
#define sort_key_bed(a) ((a).st)
KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn)
mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc)
{
gzFile fp;
kstream_t *ks;
@ -624,8 +624,8 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn)
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
mm_idx_intv_t *r;
mm_idx_intv1_t t = {-1,-1,-1,-1,0};
char *p, *q;
int32_t i, id = -1;
char *p, *q, *bl, *bs;
int32_t i, id = -1, n_blk = 0;
for (p = q = str.s, i = 0;; ++p) {
if (*p == 0 || isspace(*p)) {
int32_t c = *p;
@ -639,23 +639,49 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn)
} else if (i == 2) { // end
t.en = atol(q);
if (t.en < 0) break;
} else if (i == 3) { // name; do nothing
} else if (i == 4) { // BED score
t.score = atol(q);
} else if (i == 5) { // strand
t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
} else break;
} else if (i == 9) {
if (!isdigit(*q)) break;
n_blk = atol(q);
} else if (i == 10) {
bl = q;
} else if (i == 11) {
bs = q;
break;
}
if (c == 0) break;
++i, q = p + 1;
}
}
if (id < 0 || t.st < 0 || t.st >= t.en) continue;
r = &I[id];
if (r->n == r->m) {
r->m = r->m? r->m + (r->m>>1) : 16;
r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
if (i >= 11 && read_junc) { // BED12
int32_t st, sz, en;
st = strtol(bs, &bs, 10); ++bs;
sz = strtol(bl, &bl, 10); ++bl;
en = t.st + st + sz;
for (i = 1; i < n_blk; ++i) {
mm_idx_intv1_t s = t;
if (r->n == r->m) {
r->m = r->m? r->m + (r->m>>1) : 16;
r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
}
st = strtol(bs, &bs, 10); ++bs;
sz = strtol(bl, &bl, 10); ++bl;
s.st = en, s.en = t.st + st;
en = t.st + st + sz;
if (s.en > s.st) r->a[r->n++] = s;
}
} else {
if (r->n == r->m) {
r->m = r->m? r->m + (r->m>>1) : 16;
r->a = (mm_idx_intv1_t*)realloc(r->a, sizeof(*r->a) * r->m);
}
r->a[r->n++] = t;
}
r->a[r->n++] = t;
}
free(str.s);
ks_destroy(ks);
@ -663,11 +689,11 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn)
return I;
}
int mm_idx_bed_read(mm_idx_t *mi, const char *fn)
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
{
int32_t i;
if (mi->h == 0) mm_idx_index_name(mi);
mi->I = mm_idx_read_bed(mi, fn);
mi->I = mm_idx_read_bed(mi, fn, read_junc);
if (mi->I == 0) return -1;
for (i = 0; i < mi->n_seq; ++i) // TODO: eliminate redundant intervals
radix_sort_bed(mi->I[i].a, mi->I[i].a + mi->I[i].n);

4
main.c
View File

@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "ketopt.h"
#define MM_VERSION "2.16-r933-dirty"
#define MM_VERSION "2.16-r934-dirty"
#ifdef __linux__
#include <sys/resource.h>
@ -366,7 +366,7 @@ int main(int argc, char *argv[])
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
if (argc != o.ind + 1) mm_mapopt_update(&opt, mi);
if (mm_verbose >= 3) mm_idx_stat(mi);
if (junc_bed) mm_idx_bed_read(mi, junc_bed);
if (junc_bed) mm_idx_bed_read(mi, junc_bed, 1);
if (!(opt.flag & MM_F_FRAG_MODE)) {
for (i = o.ind + 1; i < argc; ++i)
mm_map_file(mi, argv[i], &opt, n_threads);

View File

@ -367,7 +367,7 @@ int mm_idx_index_name(mm_idx_t *mi);
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
int mm_idx_bed_read(mm_idx_t *mi, const char *fn);
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
// deprecated APIs for backward compatibility

View File

@ -1,4 +1,4 @@
.TH minimap2 1 "28 Feburary 2019" "minimap2-2.16-dirty (r933)" "Bioinformatics tools"
.TH minimap2 1 "28 Feburary 2019" "minimap2-2.16-dirty (r934)" "Bioinformatics tools"
.SH NAME
.PP
minimap2 - mapping and alignment between collections of DNA sequences
@ -365,10 +365,10 @@ on SIRV data, please add
to the command line.
.TP
.BR --junc-bed \ FILE
BED file consisting of annotated introns and their strands. With this option,
minimap2 prefers splicing in annotations.
.I FILE
can be generated with `paftools.js gff2bed -j ann.gtf' [].
Gene annotations in the BED12 format (aka 12-column BED), or intron positions
in 5-column BED. With this option, minimap2 prefers splicing in annotations.
BED12 file can be converted from GTF/GFF3 with `paftools.js gff2bed anno.gtf'
[].
.TP
.BR --junc-bonus \ INT
Score bonus for a splice donor or acceptor found in annotation (effective with