fast-bwa/fastmap.c

215 lines
8.1 KiB
C
Raw Normal View History

2011-10-25 12:22:28 +08:00
#include <zlib.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
2013-02-24 04:55:55 +08:00
#include "bwa.h"
2013-02-01 02:59:48 +08:00
#include "bwamem.h"
2011-10-25 12:22:28 +08:00
#include "kvec.h"
#include "utils.h"
2011-10-25 12:22:28 +08:00
#include "kseq.h"
#include "utils.h"
2013-02-07 03:38:40 +08:00
KSEQ_DECLARE(gzFile)
2011-10-25 12:22:28 +08:00
extern unsigned char nst_nt4_table[256];
2013-02-24 06:09:23 +08:00
void *kopen(const char *fn, int *_fd);
int kclose(void *a);
2011-11-25 08:15:14 +08:00
2013-02-01 02:59:48 +08:00
int main_mem(int argc, char *argv[])
2011-11-25 08:15:14 +08:00
{
mem_opt_t *opt;
2013-02-24 06:09:23 +08:00
int fd, fd2, i, c, n, copy_comment = 0;
2013-02-08 02:13:43 +08:00
gzFile fp, fp2 = 0;
kseq_t *ks, *ks2 = 0;
bseq1_t *seqs;
2013-02-24 04:55:55 +08:00
bwaidx_t *idx;
2013-02-24 05:41:44 +08:00
char *rg_line = 0;
2013-02-24 06:09:23 +08:00
void *ko = 0, *ko2 = 0;
2011-11-25 08:15:14 +08:00
2013-02-01 02:59:48 +08:00
opt = mem_opt_init();
2013-04-10 04:13:55 +08:00
while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) {
2013-02-08 05:27:11 +08:00
if (c == 'k') opt->min_seed_len = atoi(optarg);
2013-02-26 00:56:02 +08:00
else if (c == 'w') opt->w = atoi(optarg);
2013-02-26 00:24:21 +08:00
else if (c == 'A') opt->a = atoi(optarg);
else if (c == 'B') opt->b = atoi(optarg);
else if (c == 'O') opt->q = atoi(optarg);
else if (c == 'E') opt->r = atoi(optarg);
else if (c == 'T') opt->T = atoi(optarg);
else if (c == 'L') opt->pen_clip = atoi(optarg);
2013-02-28 02:16:22 +08:00
else if (c == 'U') opt->pen_unpaired = atoi(optarg);
2013-02-24 03:48:54 +08:00
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
2013-02-19 05:33:06 +08:00
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
2013-02-25 02:23:43 +08:00
else if (c == 'a') opt->flag |= MEM_F_ALL;
2013-02-25 13:13:32 +08:00
else if (c == 'p') opt->flag |= MEM_F_PE;
2013-02-26 00:18:35 +08:00
else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
2013-04-10 04:13:55 +08:00
else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
else if (c == 'c') opt->max_occ = atoi(optarg);
else if (c == 'd') opt->zdrop = atoi(optarg);
2013-02-24 05:57:34 +08:00
else if (c == 'v') bwa_verbose = atoi(optarg);
2013-02-22 01:52:00 +08:00
else if (c == 'r') opt->split_factor = atof(optarg);
2013-02-24 05:57:34 +08:00
else if (c == 'C') copy_comment = 1;
2013-02-24 05:41:44 +08:00
else if (c == 'R') {
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
} else if (c == 's') opt->split_width = atoi(optarg);
else return 1;
2013-02-01 02:59:48 +08:00
}
2013-02-27 02:36:01 +08:00
if (opt->n_threads < 1) opt->n_threads = 1;
2013-02-01 02:59:48 +08:00
if (optind + 1 >= argc) {
fprintf(stderr, "\n");
2013-02-26 00:18:35 +08:00
fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
fprintf(stderr, "Algorithm options:\n\n");
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
2013-02-26 00:56:02 +08:00
fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
2013-02-26 00:18:35 +08:00
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
2013-02-28 02:16:22 +08:00
// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
2013-02-26 00:18:35 +08:00
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
2013-04-10 04:13:55 +08:00
fprintf(stderr, " -S skip mate rescue\n");
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
2013-02-26 00:24:21 +08:00
fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q);
fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip);
2013-02-28 02:16:22 +08:00
fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
2013-02-26 00:18:35 +08:00
fprintf(stderr, "\nInput/output options:\n\n");
fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n");
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
fprintf(stderr, "\n");
fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
2013-02-26 00:18:35 +08:00
fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
2013-02-28 02:16:22 +08:00
fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
2013-02-01 02:59:48 +08:00
fprintf(stderr, "\n");
free(opt);
return 1;
}
2011-11-25 08:15:14 +08:00
2013-03-05 22:38:12 +08:00
bwa_fill_scmat(opt->a, opt->b, opt->mat);
2013-02-24 05:41:44 +08:00
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
2011-11-25 08:15:14 +08:00
2013-02-24 06:09:23 +08:00
ko = kopen(argv[optind + 1], &fd);
2013-04-27 22:08:01 +08:00
if (ko == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]);
return 1;
}
2013-02-24 06:09:23 +08:00
fp = gzdopen(fd, "r");
2013-02-08 02:13:43 +08:00
ks = kseq_init(fp);
if (optind + 2 < argc) {
2013-02-25 13:13:32 +08:00
if (opt->flag&MEM_F_PE) {
if (bwa_verbose >= 2)
fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
} else {
ko2 = kopen(argv[optind + 2], &fd2);
2013-04-27 22:08:01 +08:00
if (ko2 == 0) {
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]);
return 1;
}
2013-02-25 13:13:32 +08:00
fp2 = gzdopen(fd2, "r");
ks2 = kseq_init(fp2);
opt->flag |= MEM_F_PE;
}
2013-02-08 02:13:43 +08:00
}
2013-04-27 22:08:01 +08:00
bwa_print_sam_hdr(idx->bns, rg_line);
2013-02-27 02:36:01 +08:00
while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
2013-02-24 05:57:34 +08:00
int64_t size = 0;
if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
if (bwa_verbose >= 2)
fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__);
n = n>>1<<1;
}
2013-02-24 05:57:34 +08:00
if (!copy_comment)
for (i = 0; i < n; ++i) {
free(seqs[i].comment); seqs[i].comment = 0;
}
for (i = 0; i < n; ++i) size += seqs[i].l_seq;
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs, 0);
for (i = 0; i < n; ++i) {
err_fputs(seqs[i].sam, stdout);
free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam);
}
2013-02-08 02:13:43 +08:00
free(seqs);
}
2013-02-01 02:59:48 +08:00
2013-02-24 04:55:55 +08:00
free(opt);
bwa_idx_destroy(idx);
2013-02-08 02:13:43 +08:00
kseq_destroy(ks);
err_gzclose(fp); kclose(ko);
2013-02-08 02:13:43 +08:00
if (ks2) {
kseq_destroy(ks2);
err_gzclose(fp2); kclose(ko2);
2013-02-08 02:13:43 +08:00
}
2013-02-01 02:59:48 +08:00
return 0;
2011-11-25 08:15:14 +08:00
}
2011-10-25 12:22:28 +08:00
int main_fastmap(int argc, char *argv[])
{
int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0;
2011-10-25 12:22:28 +08:00
kseq_t *seq;
bwtint_t k;
2011-10-25 12:22:28 +08:00
gzFile fp;
2013-02-01 01:34:05 +08:00
smem_i *itr;
2013-02-02 03:38:44 +08:00
const bwtintv_v *a;
2013-02-24 04:55:55 +08:00
bwaidx_t *idx;
while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) {
2011-10-25 12:22:28 +08:00
switch (c) {
case 's': split_width = atoi(optarg); break;
2013-02-02 03:20:38 +08:00
case 'p': print_seq = 1; break;
case 'w': min_iwidth = atoi(optarg); break;
case 'l': min_len = atoi(optarg); break;
default: return 1;
2011-10-25 12:22:28 +08:00
}
}
if (optind + 1 >= argc) {
fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", split_width, min_len, min_iwidth);
2011-10-25 12:22:28 +08:00
return 1;
}
fp = xzopen(argv[optind + 1], "r");
2011-10-25 12:22:28 +08:00
seq = kseq_init(fp);
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
2013-02-24 04:55:55 +08:00
itr = smem_itr_init(idx->bwt);
2011-10-25 12:22:28 +08:00
while (kseq_read(seq) >= 0) {
err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
if (print_seq) {
err_putchar('\t');
err_puts(seq->seq.s);
} else err_putchar('\n');
2011-10-25 12:22:28 +08:00
for (i = 0; i < seq->seq.l; ++i)
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
2013-02-02 03:20:38 +08:00
smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
while ((a = smem_next(itr, min_len<<1, split_width)) != 0) {
2013-02-02 03:38:44 +08:00
for (i = 0; i < a->n; ++i) {
bwtintv_t *p = &a->a[i];
2011-11-25 08:15:14 +08:00
if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
2011-11-25 08:15:14 +08:00
if (p->x[2] <= min_iwidth) {
for (k = 0; k < p->x[2]; ++k) {
bwtint_t pos;
int len, is_rev, ref_id;
len = (uint32_t)p->info - (p->info>>32);
2013-02-24 04:55:55 +08:00
pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
2011-11-25 08:15:14 +08:00
if (is_rev) pos -= len - 1;
2013-02-24 04:55:55 +08:00
bns_cnt_ambi(idx->bns, pos, len, &ref_id);
err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
2011-11-25 08:15:14 +08:00
}
} else err_puts("\t*");
err_putchar('\n');
}
2011-10-25 12:22:28 +08:00
}
err_puts("//");
2011-10-25 12:22:28 +08:00
}
2013-02-01 01:34:05 +08:00
smem_itr_destroy(itr);
2013-02-24 04:55:55 +08:00
bwa_idx_destroy(idx);
2011-10-25 12:22:28 +08:00
kseq_destroy(seq);
err_gzclose(fp);
2011-10-25 12:22:28 +08:00
return 0;
}