support read group in bwa-mem
This commit is contained in:
parent
cfa7165036
commit
ee4540c394
66
bwa.c
66
bwa.c
|
|
@ -1,10 +1,13 @@
|
|||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <zlib.h>
|
||||
#include "bntseq.h"
|
||||
#include "bwa.h"
|
||||
#include "ksw.h"
|
||||
#include "utils.h"
|
||||
|
||||
int bwa_verbose = 3;
|
||||
char bwa_rg_id[256];
|
||||
|
||||
/************************
|
||||
* Batch FASTA/Q reader *
|
||||
|
|
@ -132,8 +135,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
|
|||
bwt_t *bwt;
|
||||
prefix = bwa_idx_infer_prefix(hint);
|
||||
if (prefix == 0) {
|
||||
if (bwa_verbose >= 1)
|
||||
fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
tmp = calloc(strlen(prefix) + 5, 1);
|
||||
|
|
@ -150,7 +152,10 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
|
|||
bwaidx_t *idx;
|
||||
char *prefix;
|
||||
prefix = bwa_idx_infer_prefix(hint);
|
||||
if (prefix == 0) return 0;
|
||||
if (prefix == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
idx = calloc(1, sizeof(bwaidx_t));
|
||||
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
||||
if (which & BWA_IDX_BNS) {
|
||||
|
|
@ -174,3 +179,58 @@ void bwa_idx_destroy(bwaidx_t *idx)
|
|||
if (idx->pac) free(idx->pac);
|
||||
free(idx);
|
||||
}
|
||||
|
||||
/***********************
|
||||
* SAM header routines *
|
||||
***********************/
|
||||
|
||||
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < bns->n_seqs; ++i)
|
||||
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
||||
if (rg_line) err_printf("%s\n", rg_line);
|
||||
}
|
||||
|
||||
static char *bwa_escape(char *s)
|
||||
{
|
||||
char *p, *q;
|
||||
for (p = q = s; *p; ++p) {
|
||||
if (*p == '\\') {
|
||||
++p;
|
||||
if (*p == 't') *q++ = '\t';
|
||||
else if (*p == 'n') *q++ = '\n';
|
||||
else if (*p == 'r') *q++ = '\r';
|
||||
else if (*p == '\\') *q++ = '\\';
|
||||
} else *q++ = *p;
|
||||
}
|
||||
*q = '\0';
|
||||
return s;
|
||||
}
|
||||
|
||||
char *bwa_set_rg(const char *s)
|
||||
{
|
||||
char *p, *q, *r, *rg_line = 0;
|
||||
memset(bwa_rg_id, 0, 256);
|
||||
if (strstr(s, "@RG") != s) return 0;
|
||||
rg_line = strdup(s);
|
||||
bwa_escape(rg_line);
|
||||
if ((p = strstr(rg_line, "\tID:")) == 0) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID in the @RG line\n", __func__);
|
||||
goto err_set_rg;
|
||||
}
|
||||
p += 4;
|
||||
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
||||
if (q - p + 1 > 256) {
|
||||
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] RG:ID is longer than 255 characters\n", __func__);
|
||||
goto err_set_rg;
|
||||
}
|
||||
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
||||
*r++ = *q;
|
||||
return rg_line;
|
||||
|
||||
err_set_rg:
|
||||
free(rg_line);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
4
bwa.h
4
bwa.h
|
|
@ -22,6 +22,7 @@ typedef struct {
|
|||
} bseq1_t;
|
||||
|
||||
extern int bwa_verbose;
|
||||
extern char bwa_rg_id[256];
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
|
@ -36,6 +37,9 @@ extern "C" {
|
|||
bwaidx_t *bwa_idx_load(const char *hint, int which);
|
||||
void bwa_idx_destroy(bwaidx_t *idx);
|
||||
|
||||
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
|
||||
char *bwa_set_rg(const char *s);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
1
bwamem.c
1
bwamem.c
|
|
@ -571,6 +571,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons
|
|||
}
|
||||
if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
|
||||
if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
|
||||
if (bwa_rg_id) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); }
|
||||
kputc('\n', str);
|
||||
free(cigar);
|
||||
#undef is_mapped
|
||||
|
|
|
|||
19
bwape.c
19
bwape.c
|
|
@ -611,7 +611,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
|
|||
return pacseq;
|
||||
}
|
||||
|
||||
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
|
||||
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
|
||||
{
|
||||
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
||||
int i, j, n_seqs, tot_seqs = 0;
|
||||
|
|
@ -654,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
}
|
||||
|
||||
// core loop
|
||||
bwa_print_sam_SQ(bns);
|
||||
bwa_print_sam_hdr(bns, rg_line);
|
||||
bwa_print_sam_PG();
|
||||
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
|
||||
int cnt_chg;
|
||||
|
|
@ -715,20 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
|
||||
int bwa_sai2sam_pe(int argc, char *argv[])
|
||||
{
|
||||
extern char *bwa_rg_line, *bwa_rg_id;
|
||||
extern int bwa_set_rg(const char *s);
|
||||
int c;
|
||||
pe_opt_t *popt;
|
||||
char *prefix;
|
||||
char *prefix, *rg_line = 0;
|
||||
|
||||
popt = bwa_init_pe_opt();
|
||||
while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'r':
|
||||
if (bwa_set_rg(optarg) < 0) {
|
||||
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
|
||||
break;
|
||||
case 'a': popt->max_isize = atoi(optarg); break;
|
||||
case 'o': popt->max_occ = atoi(optarg); break;
|
||||
|
|
@ -764,11 +759,9 @@ int bwa_sai2sam_pe(int argc, char *argv[])
|
|||
}
|
||||
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
|
||||
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
|
||||
free(bwa_rg_line); free(bwa_rg_id);
|
||||
return 0;
|
||||
}
|
||||
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt);
|
||||
free(bwa_rg_line); free(bwa_rg_id); free(prefix);
|
||||
free(popt);
|
||||
bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
|
||||
free(prefix); free(popt);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
59
bwase.c
59
bwase.c
|
|
@ -13,7 +13,6 @@
|
|||
#include "bwa.h"
|
||||
|
||||
int g_log_n[256];
|
||||
char *bwa_rg_line, *bwa_rg_id;
|
||||
|
||||
void bwa_print_sam_PG();
|
||||
|
||||
|
|
@ -490,56 +489,13 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
|
|||
}
|
||||
}
|
||||
|
||||
void bwa_print_sam_SQ(const bntseq_t *bns)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < bns->n_seqs; ++i)
|
||||
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
||||
if (bwa_rg_line) err_printf("%s\n", bwa_rg_line);
|
||||
}
|
||||
|
||||
void bwase_initialize()
|
||||
{
|
||||
int i;
|
||||
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
||||
}
|
||||
|
||||
char *bwa_escape(char *s)
|
||||
{
|
||||
char *p, *q;
|
||||
for (p = q = s; *p; ++p) {
|
||||
if (*p == '\\') {
|
||||
++p;
|
||||
if (*p == 't') *q++ = '\t';
|
||||
else if (*p == 'n') *q++ = '\n';
|
||||
else if (*p == 'r') *q++ = '\r';
|
||||
else if (*p == '\\') *q++ = '\\';
|
||||
} else *q++ = *p;
|
||||
}
|
||||
*q = '\0';
|
||||
return s;
|
||||
}
|
||||
|
||||
int bwa_set_rg(const char *s)
|
||||
{
|
||||
char *p, *q, *r;
|
||||
if (strstr(s, "@RG") != s) return -1;
|
||||
if (bwa_rg_line) free(bwa_rg_line);
|
||||
if (bwa_rg_id) free(bwa_rg_id);
|
||||
bwa_rg_line = strdup(s);
|
||||
bwa_rg_id = 0;
|
||||
bwa_escape(bwa_rg_line);
|
||||
p = strstr(bwa_rg_line, "\tID:");
|
||||
if (p == 0) return -1;
|
||||
p += 4;
|
||||
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
||||
bwa_rg_id = calloc(q - p + 1, 1);
|
||||
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
||||
*r++ = *q;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
|
||||
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
|
||||
{
|
||||
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
||||
int i, n_seqs, tot_seqs = 0, m_aln;
|
||||
|
|
@ -559,7 +515,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
|
||||
m_aln = 0;
|
||||
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
||||
bwa_print_sam_SQ(bns);
|
||||
bwa_print_sam_hdr(bns, rg_line);
|
||||
//bwa_print_sam_PG();
|
||||
// set ks
|
||||
ks = bwa_open_reads(opt.mode, fn_fa);
|
||||
|
|
@ -608,15 +564,12 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
int bwa_sai2sam_se(int argc, char *argv[])
|
||||
{
|
||||
int c, n_occ = 3;
|
||||
char *prefix;
|
||||
char *prefix, *rg_line = 0;
|
||||
while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'h': break;
|
||||
case 'r':
|
||||
if (bwa_set_rg(optarg) < 0) {
|
||||
fprintf(stderr, "[%s] malformated @RG line\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
|
||||
break;
|
||||
case 'n': n_occ = atoi(optarg); break;
|
||||
case 'f': xreopen(optarg, "w", stdout); break;
|
||||
|
|
@ -630,10 +583,8 @@ int bwa_sai2sam_se(int argc, char *argv[])
|
|||
}
|
||||
if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
|
||||
fprintf(stderr, "[%s] fail to locate the index\n", __func__);
|
||||
free(bwa_rg_line); free(bwa_rg_id);
|
||||
return 0;
|
||||
}
|
||||
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ);
|
||||
free(bwa_rg_line); free(bwa_rg_id);
|
||||
bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
15
fastmap.c
15
fastmap.c
|
|
@ -14,14 +14,15 @@ extern unsigned char nst_nt4_table[256];
|
|||
int main_mem(int argc, char *argv[])
|
||||
{
|
||||
mem_opt_t *opt;
|
||||
int c, n, l;
|
||||
int c, n;
|
||||
gzFile fp, fp2 = 0;
|
||||
kseq_t *ks, *ks2 = 0;
|
||||
bseq1_t *seqs;
|
||||
bwaidx_t *idx;
|
||||
char *rg_line = 0;
|
||||
|
||||
opt = mem_opt_init();
|
||||
while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) {
|
||||
while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:R:")) >= 0) {
|
||||
if (c == 'k') opt->min_seed_len = atoi(optarg);
|
||||
else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
|
||||
else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
|
||||
|
|
@ -29,7 +30,9 @@ int main_mem(int argc, char *argv[])
|
|||
else if (c == 'c') opt->max_occ = atoi(optarg);
|
||||
else if (c == 'v') mem_verbose = atoi(optarg);
|
||||
else if (c == 'r') opt->split_factor = atof(optarg);
|
||||
else if (c == 's') opt->split_width = atoi(optarg);
|
||||
else if (c == 'R') {
|
||||
if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
|
||||
} else if (c == 's') opt->split_width = atoi(optarg);
|
||||
}
|
||||
if (optind + 1 >= argc) {
|
||||
fprintf(stderr, "\n");
|
||||
|
|
@ -39,15 +42,15 @@ int main_mem(int argc, char *argv[])
|
|||
fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
|
||||
fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
|
||||
fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
|
||||
fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n");
|
||||
fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose);
|
||||
fprintf(stderr, "\n");
|
||||
free(opt);
|
||||
return 1;
|
||||
}
|
||||
mem_fill_scmat(opt->a, opt->b, opt->mat);
|
||||
idx = bwa_idx_load(argv[optind], BWA_IDX_ALL);
|
||||
for (l = 0; l < idx->bns->n_seqs; ++l)
|
||||
printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len);
|
||||
if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
|
||||
bwa_print_sam_hdr(idx->bns, rg_line);
|
||||
|
||||
fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r");
|
||||
ks = kseq_init(fp);
|
||||
|
|
|
|||
Loading…
Reference in New Issue