#include #include #include #include #include #include #include #include #include "bwa.h" #include "bwamem.h" #include "kvec.h" #include "utils.h" #include "kseq.h" #include "utils.h" KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; void *kopen(const char *fn, int *_fd); int kclose(void *a); static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) { if (opt0->a) { // matching score is changed if (!opt0->b) opt->b *= opt->a; if (!opt0->T) opt->T *= opt->a; if (!opt0->o_del) opt->o_del *= opt->a; if (!opt0->e_del) opt->e_del *= opt->a; if (!opt0->o_ins) opt->o_ins *= opt->a; if (!opt0->e_ins) opt->e_ins *= opt->a; if (!opt0->zdrop) opt->zdrop *= opt->a; if (!opt0->pen_clip5) opt->pen_clip5 *= opt->a; if (!opt0->pen_clip3) opt->pen_clip3 *= opt->a; if (!opt0->pen_unpaired) opt->pen_unpaired *= opt->a; } } int main_mem(int argc, char *argv[]) { mem_opt_t *opt, opt0; int fd, fd2, i, c, n, copy_comment = 0; int fixed_chunk_size = -1, actual_chunk_size; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; bseq1_t *seqs; bwaidx_t *idx; char *p, *rg_line = 0; const char *mode = 0; void *ko = 0, *ko2 = 0; int64_t n_processed = 0; mem_pestat_t pes[4], *pes0 = 0; memset(pes, 0, 4 * sizeof(mem_pestat_t)); for (i = 0; i < 4; ++i) pes[i].failed = 1; opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); while ((c = getopt(argc, argv, "epaFMCSPHVYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1; else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1; else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1; else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; else if (c == 'F') opt->flag |= MEM_F_ALN_REG; else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP; else if (c == 'V') opt->flag |= MEM_F_REF_HDR; else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.; else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.; else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1; else if (c == 'h') opt->max_hits = atoi(optarg), opt0.max_hits = 1; else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1; else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1; else if (c == 'C') copy_comment = 1; else if (c == 'K') fixed_chunk_size = atoi(optarg); else if (c == 'Q') { opt0.mapQ_coef_len = 1; opt->mapQ_coef_len = atoi(optarg); opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0; } else if (c == 'O') { opt0.o_del = opt0.o_ins = 1; opt->o_del = opt->o_ins = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->o_ins = strtol(p+1, &p, 10); } else if (c == 'E') { opt0.e_del = opt0.e_ins = 1; opt->e_del = opt->e_ins = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->e_ins = strtol(p+1, &p, 10); } else if (c == 'L') { opt0.pen_clip5 = opt0.pen_clip3 = 1; opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->pen_clip3 = strtol(p+1, &p, 10); } else if (c == 'R') { if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 'I') { // specify the insert size distribution pes0 = pes; pes[1].failed = 0; pes[1].avg = strtod(optarg, &p); pes[1].std = pes[1].avg * .1; if (*p != 0 && ispunct(*p) && isdigit(p[1])) pes[1].std = strtod(p+1, &p); pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499); pes[1].low = (int)(pes[1].avg - 4. * pes[1].std + .499); if (pes[1].low < 1) pes[1].low = 1; if (*p != 0 && ispunct(*p) && isdigit(p[1])) pes[1].high = (int)(strtod(p+1, &p) + .499); if (*p != 0 && ispunct(*p) && isdigit(p[1])) pes[1].low = (int)(strtod(p+1, &p) + .499); if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n", __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); } else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); fprintf(stderr, "Algorithm options:\n\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -y INT find MEMs longer than {-k} * {-r} with size less than INT [%ld]\n", (long)opt->max_mem_intv); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n"); fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -e discard full-length exact matches\n"); fprintf(stderr, "\nScoring options:\n\n"); fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired); fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n"); fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref)\n"); fprintf(stderr, " ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref)\n"); fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n"); // fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n"); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -h INT if there are 80%% of the max score, output all in XA [%d]\n", opt->max_hits); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -V output the reference FASTA header in the XR tag\n"); fprintf(stderr, " -Y use soft clipping for supplementary alignments\n"); fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); fprintf(stderr, " FR orientation only. [inferred]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); free(opt); return 1; } if (mode) { if (strcmp(mode, "intractg") == 0) { if (!opt0.o_del) opt->o_del = 16; if (!opt0.o_ins) opt->o_ins = 16; if (!opt0.b) opt->b = 9; if (!opt0.pen_clip5) opt->pen_clip5 = 5; if (!opt0.pen_clip3) opt->pen_clip3 = 5; } else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread") == 0 || strcmp(mode, "ont2d") == 0) { if (!opt0.o_del) opt->o_del = 1; if (!opt0.e_del) opt->e_del = 1; if (!opt0.o_ins) opt->o_ins = 1; if (!opt0.e_ins) opt->e_ins = 1; if (!opt0.b) opt->b = 1; if (opt0.split_factor == 0.) opt->split_factor = 10.; if (strcmp(mode, "pbread") == 0) { // pacbio read-to-read setting; NOT working well! opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG; if (!opt0.min_chain_weight) opt->min_chain_weight = 40; if (!opt0.max_occ) opt->max_occ = 1000; if (!opt0.min_seed_len) opt->min_seed_len = 13; if (!opt0.max_chain_extend) opt->max_chain_extend = 25; if (opt0.drop_ratio == 0.) opt->drop_ratio = .001; } else if (strcmp(mode, "ont2d") == 0) { if (!opt0.min_chain_weight) opt->min_chain_weight = 20; if (!opt0.min_seed_len) opt->min_seed_len = 14; if (!opt0.pen_clip5) opt->pen_clip5 = 0; if (!opt0.pen_clip3) opt->pen_clip3 = 0; } else { if (!opt0.min_chain_weight) opt->min_chain_weight = 40; if (!opt0.min_seed_len) opt->min_seed_len = 17; if (!opt0.pen_clip5) opt->pen_clip5 = 0; if (!opt0.pen_clip3) opt->pen_clip3 = 0; } } else { fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode); return 1; // FIXME memory leak } } else update_a(opt, &opt0); bwa_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak ko = kopen(argv[optind + 1], &fd); if (ko == 0) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]); return 1; } fp = gzdopen(fd, "r"); ks = kseq_init(fp); if (optind + 2 < argc) { if (opt->flag&MEM_F_PE) { if (bwa_verbose >= 2) fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__); } else { ko2 = kopen(argv[optind + 2], &fd2); if (ko2 == 0) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]); return 1; } fp2 = gzdopen(fd2, "r"); ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } } if (!(opt->flag & MEM_F_ALN_REG)) bwa_print_sam_hdr(idx->bns, rg_line); actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; while ((seqs = bseq_read(actual_chunk_size, &n, ks, ks2)) != 0) { int64_t size = 0; if ((opt->flag & MEM_F_PE) && (n&1) == 1) { if (bwa_verbose >= 2) fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__); n = n>>1<<1; } if (!copy_comment) for (i = 0; i < n; ++i) { free(seqs[i].comment); seqs[i].comment = 0; } for (i = 0; i < n; ++i) size += seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0); n_processed += n; for (i = 0; i < n; ++i) { if (seqs[i].sam) err_fputs(seqs[i].sam, stdout); free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); } free(seqs); } free(opt); bwa_idx_destroy(idx); kseq_destroy(ks); err_gzclose(fp); kclose(ko); if (ks2) { kseq_destroy(ks2); err_gzclose(fp2); kclose(ko2); } return 0; } int main_fastmap(int argc, char *argv[]) { int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX; uint64_t max_intv = 0; kseq_t *seq; bwtint_t k; gzFile fp; smem_i *itr; const bwtintv_v *a; bwaidx_t *idx; while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) { switch (c) { case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; case 'i': min_intv = atoi(optarg); break; case 'I': max_intv = atol(optarg); break; case 'L': max_len = atoi(optarg); break; default: return 1; } } if (optind + 1 >= argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa fastmap [options] \n\n"); fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len); fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth); fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv); fprintf(stderr, " -l INT max MEM length [%d]\n", max_len); fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv); fprintf(stderr, "\n"); return 1; } fp = xzopen(argv[optind + 1], "r"); seq = kseq_init(fp); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; itr = smem_itr_init(idx->bwt); smem_config(itr, min_intv, max_len, max_intv); while (kseq_read(seq) >= 0) { err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { err_putchar('\t'); err_puts(seq->seq.s); } else err_putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); while ((a = smem_next(itr)) != 0) { for (i = 0; i < a->n; ++i) { bwtintv_t *p = &a->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); if (p->x[2] <= min_iwidth) { for (k = 0; k < p->x[2]; ++k) { bwtint_t pos; int len, is_rev, ref_id; len = (uint32_t)p->info - (p->info>>32); pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev); if (is_rev) pos -= len - 1; bns_cnt_ambi(idx->bns, pos, len, &ref_id); err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1); } } else err_puts("\t*"); err_putchar('\n'); } } err_puts("//"); } smem_itr_destroy(itr); bwa_idx_destroy(idx); kseq_destroy(seq); err_gzclose(fp); return 0; }