drop smem based mapping algorithm
While we can compute smems very efficiently, there is still a long way to get the alignment. On simulated data, this smem-based algorithm is 4X faster than bwasw and twice as fast as bowtie2, but the accuracy is far lower than bwasw and even lower than bowtie2 in the high-mapQ range. I am kind of sure that if we continue to increase the mapping accuracy, the speed will approach to bwasw, if not slower. Smem-based mapping algorithm is still interesting, but given that I am short of time, I will not explore it further.
This commit is contained in:
parent
7467671c30
commit
7babb54e4c
145
fastmap.c
145
fastmap.c
|
|
@ -8,125 +8,26 @@
|
||||||
#include "kseq.h"
|
#include "kseq.h"
|
||||||
KSEQ_INIT(gzFile, gzread)
|
KSEQ_INIT(gzFile, gzread)
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
int qbeg, len;
|
|
||||||
int64_t tbeg;
|
|
||||||
} fmhit_t;
|
|
||||||
|
|
||||||
#define hit_qlt(a, b) ((a).qbeg < (b).qbeg)
|
|
||||||
#define hit_tlt(a, b) ((a).tbeg < (b).tbeg)
|
|
||||||
|
|
||||||
#include "ksort.h"
|
|
||||||
KSORT_INIT(hitq, fmhit_t, hit_qlt)
|
|
||||||
KSORT_INIT(hitt, fmhit_t, hit_tlt)
|
|
||||||
|
|
||||||
extern unsigned char nst_nt4_table[256];
|
extern unsigned char nst_nt4_table[256];
|
||||||
|
|
||||||
typedef struct { size_t n, m; fmhit_t *a; } fmhit_v;
|
|
||||||
|
|
||||||
static uint64_t cluster_qhits(fmhit_v *hits, int beg, int end, int max_dist, int *_score)
|
|
||||||
{
|
|
||||||
int qend, score, max = 0, cbeg = beg;
|
|
||||||
size_t i;
|
|
||||||
uint64_t max_cluster = 0;
|
|
||||||
ks_introsort(hitq, end - beg, hits->a + beg);
|
|
||||||
qend = hits->a[beg].qbeg + hits->a[beg].len;
|
|
||||||
score = hits->a[beg].len;
|
|
||||||
for (i = beg + 1; i < end; ++i) {
|
|
||||||
fmhit_t *p = &hits->a[i];
|
|
||||||
if (p->qbeg - qend > max_dist) {
|
|
||||||
if (score > max) max = score, max_cluster = (uint64_t)cbeg<<32 | i;
|
|
||||||
score = 0; cbeg = i;
|
|
||||||
}
|
|
||||||
score += p->len;
|
|
||||||
qend = p->qbeg + p->len;
|
|
||||||
}
|
|
||||||
if (score > max) max = score, max_cluster = (uint64_t)cbeg<<32 | i;
|
|
||||||
*_score = score;
|
|
||||||
return max_cluster;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int cluster_hits(fmhit_v *hits, int max_dist)
|
|
||||||
{
|
|
||||||
size_t i;
|
|
||||||
int64_t tend;
|
|
||||||
uint64_t cluster, max_cluster = 0;
|
|
||||||
int j, score, max = 0, max2 = 0, cbeg = 0, cend;
|
|
||||||
ks_introsort(hitt, hits->n, hits->a);
|
|
||||||
tend = hits->a[0].tbeg + hits->a[0].len;
|
|
||||||
for (i = 1; i < hits->n; ++i) {
|
|
||||||
fmhit_t *p = &hits->a[i];
|
|
||||||
if (p->tbeg - tend > max_dist) {
|
|
||||||
cluster = cluster_qhits(hits, cbeg, i, max_dist, &score);
|
|
||||||
if (score > max) max2 = max, max = score, max_cluster = cluster;
|
|
||||||
else if (score > max2) max2 = score;
|
|
||||||
cbeg = i;
|
|
||||||
}
|
|
||||||
tend = p->tbeg + p->len;
|
|
||||||
}
|
|
||||||
cluster = cluster_qhits(hits, cbeg, i, max_dist, &score);
|
|
||||||
if (score > max) max2 = max, max = score, max_cluster = cluster;
|
|
||||||
else if (score > max2) max2 = score;
|
|
||||||
cbeg = max_cluster>>32; cend = (uint32_t)max_cluster;
|
|
||||||
for (i = 0, j = cbeg; j < cend; ++j) hits->a[i++] = hits->a[j];
|
|
||||||
hits->n = i;
|
|
||||||
return (int)(200.0 * (max - max2) / max + .499);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int fake_cigar(const bntseq_t *bns, fmhit_v *hits, int beg, int end, int len, uint32_t *cigar, int64_t *pos, int *is_rev)
|
|
||||||
{
|
|
||||||
size_t i;
|
|
||||||
int qbeg, qend, n_cigar = 0;
|
|
||||||
int64_t tbeg, tend, tmp;
|
|
||||||
qbeg = len; qend = 0;
|
|
||||||
tbeg = bns->l_pac<<1; tend = 0;
|
|
||||||
for (i = beg; i < end; ++i) {
|
|
||||||
fmhit_t *p = &hits->a[i];
|
|
||||||
if (p->qbeg < qbeg) qbeg = p->qbeg;
|
|
||||||
if (p->qbeg + p->len > qend) qend = p->qbeg + p->len;
|
|
||||||
if (p->tbeg < tbeg) tbeg = p->tbeg;
|
|
||||||
if (p->tbeg + p->len > qend) tend = p->tbeg + p->len;
|
|
||||||
}
|
|
||||||
if (tbeg >= bns->l_pac) {
|
|
||||||
tmp = tend; tend = bns->l_pac*2 - tbeg; tbeg = bns->l_pac*2 - tmp;
|
|
||||||
tmp = qend; qend = len - qbeg; qbeg = len - tmp;
|
|
||||||
*is_rev = 1;
|
|
||||||
} else *is_rev = 0;
|
|
||||||
*pos = tbeg;
|
|
||||||
if (qbeg) cigar[n_cigar++] = qbeg<<4|4;
|
|
||||||
if (tend - tbeg < qend - qbeg) { // reference is shorter
|
|
||||||
cigar[n_cigar++] = (uint32_t)(tend - tbeg)<<4 | 0;
|
|
||||||
cigar[n_cigar++] = (uint32_t)((qend - qbeg) - (tend - tbeg))<<4 | 2;
|
|
||||||
} else if (tend - tbeg > qend - qbeg) { // query is shorter
|
|
||||||
cigar[n_cigar++] = (uint32_t)(qend - qbeg)<<4 | 0;
|
|
||||||
cigar[n_cigar++] = (uint32_t)((tend - tbeg) - (qend - qbeg))<<4 | 1;
|
|
||||||
} else cigar[n_cigar++] = (uint32_t)(qend - qbeg)<<4 | 0;
|
|
||||||
if (len > qend) cigar[n_cigar++] = (uint32_t)(len - qend)<<4|4;
|
|
||||||
return n_cigar;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main_fastmap(int argc, char *argv[])
|
int main_fastmap(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
int c, i, min_iwidth = 20, min_len = 17, max_dist = 50, mem_only = 0;
|
int c, i, min_iwidth = 20, min_len = 17;
|
||||||
kseq_t *seq;
|
kseq_t *seq;
|
||||||
bwtint_t k;
|
bwtint_t k;
|
||||||
gzFile fp;
|
gzFile fp;
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
bntseq_t *bns;
|
bntseq_t *bns;
|
||||||
bwtintv_v a[3], mem, *tvec[3];
|
bwtintv_v a[3], mem, *tvec[3];
|
||||||
fmhit_v hits;
|
|
||||||
uint32_t cigar[1024];
|
|
||||||
|
|
||||||
while ((c = getopt(argc, argv, "w:l:d:p")) >= 0) {
|
while ((c = getopt(argc, argv, "w:l:")) >= 0) {
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 'w': min_iwidth = atoi(optarg); break;
|
case 'w': min_iwidth = atoi(optarg); break;
|
||||||
case 'l': min_len = atoi(optarg); break;
|
case 'l': min_len = atoi(optarg); break;
|
||||||
case 'd': max_dist = atoi(optarg); break;
|
|
||||||
case 'p': mem_only = 1; break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (optind + 1 >= argc) {
|
if (optind + 1 >= argc) {
|
||||||
fprintf(stderr, "bwa fastmap <idxbase> <in.fq>\n");
|
fprintf(stderr, "Usage: bwa fastmap [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -145,46 +46,30 @@ int main_fastmap(int argc, char *argv[])
|
||||||
kv_init(a[i]);
|
kv_init(a[i]);
|
||||||
tvec[i] = &a[i];
|
tvec[i] = &a[i];
|
||||||
}
|
}
|
||||||
kv_init(mem); kv_init(hits);
|
kv_init(mem);
|
||||||
while (kseq_read(seq) >= 0) {
|
while (kseq_read(seq) >= 0) {
|
||||||
for (i = 0; i < seq->seq.l; ++i)
|
for (i = 0; i < seq->seq.l; ++i)
|
||||||
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
||||||
bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec);
|
bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec);
|
||||||
if (mem_only) printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l);
|
printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l);
|
||||||
for (i = 0, hits.n = 0; i < mem.n; ++i) {
|
for (i = 0; i < mem.n; ++i) {
|
||||||
bwtintv_t *p = &mem.a[i];
|
bwtintv_t *p = &mem.a[i];
|
||||||
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
|
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
|
||||||
if (mem_only) printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
|
printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
|
||||||
if (p->x[2] <= min_iwidth) {
|
if (p->x[2] <= min_iwidth) {
|
||||||
for (k = 0; k < p->x[2]; ++k) {
|
for (k = 0; k < p->x[2]; ++k) {
|
||||||
fmhit_t z;
|
bwtint_t pos;
|
||||||
z.tbeg = bwt_sa(bwt, p->x[0] + k);
|
int len, is_rev, ref_id;
|
||||||
z.qbeg = p->info>>32;
|
len = (uint32_t)p->info - (p->info>>32);
|
||||||
z.len = (uint32_t)p->info - z.qbeg;
|
pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
|
||||||
kv_push(fmhit_t, hits, z);
|
if (is_rev) pos -= len - 1;
|
||||||
if (mem_only) {
|
bns_cnt_ambi(bns, pos, len, &ref_id);
|
||||||
int is_rev, ref_id;
|
|
||||||
int64_t pos = bns_depos(bns, z.tbeg, &is_rev);
|
|
||||||
if (is_rev) pos -= z.len - 1;
|
|
||||||
bns_cnt_ambi(bns, pos, z.len, &ref_id);
|
|
||||||
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
|
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
putchar('\n');
|
||||||
}
|
}
|
||||||
if (mem_only) putchar('\n');
|
puts("//");
|
||||||
}
|
|
||||||
if (!mem_only) {
|
|
||||||
int64_t pos;
|
|
||||||
int n_cigar, is_rev, ref_id, mapq;
|
|
||||||
if (hits.n == 0) continue;
|
|
||||||
mapq = cluster_hits(&hits, max_dist);
|
|
||||||
n_cigar = fake_cigar(bns, &hits, 0, hits.n, seq->seq.l, cigar, &pos, &is_rev);
|
|
||||||
bns_cnt_ambi(bns, pos, 1, &ref_id);
|
|
||||||
printf("%s\t%d\t%s\t%ld\t%d\t", seq->name.s, is_rev?16:0, bns->anns[ref_id].name, (long)(pos - bns->anns[ref_id].offset) + 1, mapq);
|
|
||||||
for (i = 0; i < n_cigar; ++i)
|
|
||||||
printf("%d%c", cigar[i]>>4, "MIDNSHP"[cigar[i]&0xf]);
|
|
||||||
printf("\t*\t0\t0\t*\t*\n");
|
|
||||||
} else puts("//");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
free(mem.a);
|
free(mem.a);
|
||||||
|
|
|
||||||
3
main.c
3
main.c
|
|
@ -4,7 +4,7 @@
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
#ifndef PACKAGE_VERSION
|
#ifndef PACKAGE_VERSION
|
||||||
#define PACKAGE_VERSION "0.6.0-r64-dev"
|
#define PACKAGE_VERSION "0.6.0-r68-dev"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int usage()
|
static int usage()
|
||||||
|
|
@ -19,6 +19,7 @@ static int usage()
|
||||||
fprintf(stderr, " samse generate alignment (single ended)\n");
|
fprintf(stderr, " samse generate alignment (single ended)\n");
|
||||||
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
fprintf(stderr, " sampe generate alignment (paired ended)\n");
|
||||||
fprintf(stderr, " bwasw BWA-SW for long queries\n");
|
fprintf(stderr, " bwasw BWA-SW for long queries\n");
|
||||||
|
fprintf(stderr, " fastmap identify super-maximal exact matches\n");
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
|
||||||
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
fprintf(stderr, " pac2bwt generate BWT from PAC\n");
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue