diff --git a/bwa.c b/bwa.c index fac0db7..f5e8692 100644 --- a/bwa.c +++ b/bwa.c @@ -1,10 +1,13 @@ +#include #include #include #include "bntseq.h" #include "bwa.h" #include "ksw.h" +#include "utils.h" int bwa_verbose = 3; +char bwa_rg_id[256]; /************************ * Batch FASTA/Q reader * @@ -132,8 +135,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint) bwt_t *bwt; prefix = bwa_idx_infer_prefix(hint); if (prefix == 0) { - if (bwa_verbose >= 1) - fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); return 0; } tmp = calloc(strlen(prefix) + 5, 1); @@ -150,7 +152,10 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) bwaidx_t *idx; char *prefix; prefix = bwa_idx_infer_prefix(hint); - if (prefix == 0) return 0; + if (prefix == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } idx = calloc(1, sizeof(bwaidx_t)); if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BNS) { @@ -174,3 +179,58 @@ void bwa_idx_destroy(bwaidx_t *idx) if (idx->pac) free(idx->pac); free(idx); } + +/*********************** + * SAM header routines * + ***********************/ + +void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) +{ + int i; + for (i = 0; i < bns->n_seqs; ++i) + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + if (rg_line) err_printf("%s\n", rg_line); +} + +static char *bwa_escape(char *s) +{ + char *p, *q; + for (p = q = s; *p; ++p) { + if (*p == '\\') { + ++p; + if (*p == 't') *q++ = '\t'; + else if (*p == 'n') *q++ = '\n'; + else if (*p == 'r') *q++ = '\r'; + else if (*p == '\\') *q++ = '\\'; + } else *q++ = *p; + } + *q = '\0'; + return s; +} + +char *bwa_set_rg(const char *s) +{ + char *p, *q, *r, *rg_line = 0; + memset(bwa_rg_id, 0, 256); + if (strstr(s, "@RG") != s) return 0; + rg_line = strdup(s); + bwa_escape(rg_line); + if ((p = strstr(rg_line, "\tID:")) == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID in the @RG line\n", __func__); + goto err_set_rg; + } + p += 4; + for (q = p; *q && *q != '\t' && *q != '\n'; ++q); + if (q - p + 1 > 256) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] RG:ID is longer than 255 characters\n", __func__); + goto err_set_rg; + } + for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) + *r++ = *q; + return rg_line; + +err_set_rg: + free(rg_line); + return 0; +} + diff --git a/bwa.h b/bwa.h index b5eda13..208db6a 100644 --- a/bwa.h +++ b/bwa.h @@ -22,6 +22,7 @@ typedef struct { } bseq1_t; extern int bwa_verbose; +extern char bwa_rg_id[256]; #ifdef __cplusplus extern "C" { @@ -36,6 +37,9 @@ extern "C" { bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); + void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); + char *bwa_set_rg(const char *s); + #ifdef __cplusplus } #endif diff --git a/bwamem.c b/bwamem.c index 6b219cf..ce55cad 100644 --- a/bwamem.c +++ b/bwamem.c @@ -571,6 +571,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } + if (bwa_rg_id) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } kputc('\n', str); free(cigar); #undef is_mapped diff --git a/bwape.c b/bwape.c index 87393b1..0b2b8d6 100644 --- a/bwape.c +++ b/bwape.c @@ -611,7 +611,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, return pacseq; } -void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) +void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, j, n_seqs, tot_seqs = 0; @@ -654,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f } // core loop - bwa_print_sam_SQ(bns); + bwa_print_sam_hdr(bns, rg_line); bwa_print_sam_PG(); while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; @@ -715,20 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f int bwa_sai2sam_pe(int argc, char *argv[]) { - extern char *bwa_rg_line, *bwa_rg_id; - extern int bwa_set_rg(const char *s); int c; pe_opt_t *popt; - char *prefix; + char *prefix, *rg_line = 0; popt = bwa_init_pe_opt(); while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { switch (c) { case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'a': popt->max_isize = atoi(optarg); break; case 'o': popt->max_occ = atoi(optarg); break; @@ -764,11 +759,9 @@ int bwa_sai2sam_pe(int argc, char *argv[]) } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - free(bwa_rg_line); free(bwa_rg_id); return 0; } - bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt); - free(bwa_rg_line); free(bwa_rg_id); free(prefix); - free(popt); + bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); + free(prefix); free(popt); return 0; } diff --git a/bwase.c b/bwase.c index 8f50c7a..27da794 100644 --- a/bwase.c +++ b/bwase.c @@ -13,7 +13,6 @@ #include "bwa.h" int g_log_n[256]; -char *bwa_rg_line, *bwa_rg_id; void bwa_print_sam_PG(); @@ -490,56 +489,13 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } } -void bwa_print_sam_SQ(const bntseq_t *bns) -{ - int i; - for (i = 0; i < bns->n_seqs; ++i) - err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); - if (bwa_rg_line) err_printf("%s\n", bwa_rg_line); -} - void bwase_initialize() { int i; for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); } -char *bwa_escape(char *s) -{ - char *p, *q; - for (p = q = s; *p; ++p) { - if (*p == '\\') { - ++p; - if (*p == 't') *q++ = '\t'; - else if (*p == 'n') *q++ = '\n'; - else if (*p == 'r') *q++ = '\r'; - else if (*p == '\\') *q++ = '\\'; - } else *q++ = *p; - } - *q = '\0'; - return s; -} - -int bwa_set_rg(const char *s) -{ - char *p, *q, *r; - if (strstr(s, "@RG") != s) return -1; - if (bwa_rg_line) free(bwa_rg_line); - if (bwa_rg_id) free(bwa_rg_id); - bwa_rg_line = strdup(s); - bwa_rg_id = 0; - bwa_escape(bwa_rg_line); - p = strstr(bwa_rg_line, "\tID:"); - if (p == 0) return -1; - p += 4; - for (q = p; *q && *q != '\t' && *q != '\n'; ++q); - bwa_rg_id = calloc(q - p + 1, 1); - for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) - *r++ = *q; - return 0; -} - -void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) +void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, n_seqs, tot_seqs = 0, m_aln; @@ -559,7 +515,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f m_aln = 0; fread(&opt, sizeof(gap_opt_t), 1, fp_sa); - bwa_print_sam_SQ(bns); + bwa_print_sam_hdr(bns, rg_line); //bwa_print_sam_PG(); // set ks ks = bwa_open_reads(opt.mode, fn_fa); @@ -608,15 +564,12 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f int bwa_sai2sam_se(int argc, char *argv[]) { int c, n_occ = 3; - char *prefix; + char *prefix, *rg_line = 0; while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { switch (c) { case 'h': break; case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'n': n_occ = atoi(optarg); break; case 'f': xreopen(optarg, "w", stdout); break; @@ -630,10 +583,8 @@ int bwa_sai2sam_se(int argc, char *argv[]) } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - free(bwa_rg_line); free(bwa_rg_id); return 0; } - bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ); - free(bwa_rg_line); free(bwa_rg_id); + bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); return 0; } diff --git a/fastmap.c b/fastmap.c index 2800821..adbe04c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -14,14 +14,15 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - int c, n, l; + int c, n; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; bseq1_t *seqs; bwaidx_t *idx; + char *rg_line = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) { + while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; @@ -29,7 +30,9 @@ int main_mem(int argc, char *argv[]) else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') mem_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); - else if (c == 's') opt->split_width = atoi(optarg); + else if (c == 'R') { + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak + } else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { fprintf(stderr, "\n"); @@ -39,15 +42,15 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); + fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n"); fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); fprintf(stderr, "\n"); free(opt); return 1; } mem_fill_scmat(opt->a, opt->b, opt->mat); - idx = bwa_idx_load(argv[optind], BWA_IDX_ALL); - for (l = 0; l < idx->bns->n_seqs; ++l) - printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + bwa_print_sam_hdr(idx->bns, rg_line); fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); ks = kseq_init(fp);