fast-bwa/fastmap.c

157 lines
4.3 KiB
C
Raw Normal View History

2011-10-25 12:22:28 +08:00
#include <zlib.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include "bntseq.h"
2011-10-25 12:22:28 +08:00
#include "bwt.h"
2013-02-01 02:59:48 +08:00
#include "bwamem.h"
2011-10-25 12:22:28 +08:00
#include "kvec.h"
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
extern unsigned char nst_nt4_table[256];
2013-02-01 02:59:48 +08:00
int main_mem(int argc, char *argv[])
{
mem_opt_t *opt;
2013-02-01 02:59:48 +08:00
bwt_t *bwt;
bntseq_t *bns;
int i, j, c;
2013-02-05 04:40:26 +08:00
gzFile fp;
kseq_t *seq;
uint8_t *pac = 0;
2013-02-01 02:59:48 +08:00
opt = mem_opt_init();
while ((c = getopt(argc, argv, "")) >= 0) {
}
if (optind + 1 >= argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n");
fprintf(stderr, "\n");
free(opt);
return 1;
}
mem_fill_scmat(opt->a, opt->b, opt->mat);
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA
char *tmp = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt");
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, argv[optind]), ".sa");
bwt_restore_sa(tmp, bwt);
free(tmp);
bns = bns_restore(argv[optind]);
pac = calloc(bns->l_pac/4+1, 1);
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
}
while (kseq_read(seq) >= 0) {
mem_chain_t chain;
printf(">%s\n", seq->name.s);
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s);
2013-02-05 13:17:20 +08:00
mem_chain_flt(opt, &chain);
for (i = 0; i < chain.n; ++i) {
mem_chain1_t *p = &chain.chains[i];
2013-02-05 06:23:06 +08:00
mem_aln_t a;
mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p, &a);
printf("%d\t%d", i, p->n);
for (j = 0; j < p->n; ++j) {
bwtint_t pos;
int is_rev, ref_id;
pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev);
if (is_rev) pos -= p->seeds[j].len - 1;
bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id);
printf("\t%d,%d,%s:%c%ld", p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
}
putchar('\n');
}
puts("//");
2013-02-05 04:40:26 +08:00
for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds);
free(chain.chains);
}
2013-02-01 02:59:48 +08:00
2013-02-05 04:40:26 +08:00
free(pac); free(opt);
bns_destroy(bns);
bwt_destroy(bwt);
kseq_destroy(seq);
gzclose(fp);
2013-02-01 02:59:48 +08:00
return 0;
}
2011-10-25 12:22:28 +08:00
int main_fastmap(int argc, char *argv[])
{
2013-02-02 03:20:38 +08:00
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_long = 0;
2011-10-25 12:22:28 +08:00
kseq_t *seq;
bwtint_t k;
2011-10-25 12:22:28 +08:00
gzFile fp;
bwt_t *bwt;
bntseq_t *bns;
2013-02-01 01:34:05 +08:00
smem_i *itr;
2013-02-02 03:38:44 +08:00
const bwtintv_v *a;
2013-02-02 03:20:38 +08:00
while ((c = getopt(argc, argv, "w:l:ps")) >= 0) {
2011-10-25 12:22:28 +08:00
switch (c) {
2013-02-02 03:20:38 +08:00
case 's': split_long = 1; break;
case 'p': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
2011-10-25 12:22:28 +08:00
}
}
if (optind + 1 >= argc) {
2013-02-02 03:20:38 +08:00
fprintf(stderr, "Usage: bwa fastmap [-ps] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
2011-10-25 12:22:28 +08:00
return 1;
}
2011-10-25 12:22:28 +08:00
fp = gzopen(argv[optind + 1], "r");
seq = kseq_init(fp);
{ // load the packed sequences, BWT and SA
2011-10-25 12:22:28 +08:00
char *tmp = calloc(strlen(argv[optind]) + 5, 1);
strcat(strcpy(tmp, argv[optind]), ".bwt");
bwt = bwt_restore_bwt(tmp);
strcat(strcpy(tmp, argv[optind]), ".sa");
bwt_restore_sa(tmp, bwt);
2011-10-25 12:22:28 +08:00
free(tmp);
bns = bns_restore(argv[optind]);
2011-10-25 12:22:28 +08:00
}
2013-02-01 01:34:05 +08:00
itr = smem_itr_init(bwt);
2011-10-25 12:22:28 +08:00
while (kseq_read(seq) >= 0) {
printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) {
putchar('\t');
puts(seq->seq.s);
} else putchar('\n');
2011-10-25 12:22:28 +08:00
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
2013-02-02 03:20:38 +08:00
smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
2013-02-02 03:38:44 +08:00
while ((a = smem_next(itr, split_long? min_len<<1 : 0)) != 0) {
for (i = 0; i < a->n; ++i) {
bwtintv_t *p = &a->a[i];
2011-11-25 08:15:14 +08:00
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
if (p->x[2] <= min_iwidth) {
for (k = 0; k < p->x[2]; ++k) {
bwtint_t pos;
int len, is_rev, ref_id;
len = (uint32_t)p->info - (p->info>>32);
pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
if (is_rev) pos -= len - 1;
bns_cnt_ambi(bns, pos, len, &ref_id);
printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
}
} else fputs("\t*", stdout);
putchar('\n');
}
2011-10-25 12:22:28 +08:00
}
puts("//");
2011-10-25 12:22:28 +08:00
}
2013-02-01 01:34:05 +08:00
smem_itr_destroy(itr);
bns_destroy(bns);
2011-10-25 12:22:28 +08:00
bwt_destroy(bwt);
kseq_destroy(seq);
gzclose(fp);
return 0;
}