From 51d354cd289e377aaa3af20e9b739e1819ba2ac3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 15 Jan 2011 15:35:39 -0500 Subject: [PATCH] Added barcode support --- .gitignore | 1 + bwape.c | 11 +++++------ bwase.c | 8 ++++---- bwaseqio.c | 28 +++++++++++++++++++++++++--- bwtaln.c | 10 +++++----- bwtaln.h | 6 ++++-- main.c | 2 +- 7 files changed, 45 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 1e78ec2..16d123a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.[oa] bwa test +.*.swp diff --git a/bwape.c b/bwape.c index dec52c3..4e94373 100644 --- a/bwape.c +++ b/bwape.c @@ -645,13 +645,13 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); - int i, j, n_seqs, tot_seqs = 0, read_flag = 0; + int i, j, n_seqs, tot_seqs = 0; bwa_seq_t *seqs[2]; bwa_seqio_t *ks[2]; clock_t t; bntseq_t *bns, *ntbns = 0; FILE *fp_sa[2]; - gap_opt_t opt; + gap_opt_t opt, opt0; khint_t iter; isize_info_t last_ii; // this is for the last batch of reads char str[1024]; @@ -671,6 +671,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); + opt0 = opt; fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); if (!(opt.mode & BWA_MODE_COMPREAD)) { @@ -691,14 +692,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // core loop bwa_print_sam_SQ(bns); bwa_print_sam_PG(); - read_flag |= (opt.mode & BWA_MODE_COMPREAD)? 1 : 0; - read_flag |= ((opt.mode & BWA_MODE_IL13)? 1 : 0)<<1; - while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, read_flag, opt.trim_qual)) != 0) { + while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; isize_info_t ii; ubyte_t *pacseq; - seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, read_flag, opt.trim_qual); + seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual); tot_seqs += n_seqs; t = clock(); diff --git a/bwase.c b/bwase.c index 12f01fb..e9e164d 100644 --- a/bwase.c +++ b/bwase.c @@ -472,6 +472,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } else printf("*"); if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id); + if (p->bc[0]) printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); if (p->type != BWA_TYPE_NO_MATCH) { int i; @@ -519,6 +520,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in printf("%s", p->qual); } else printf("*"); if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id); + if (p->bc[0]) printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); putchar('\n'); } @@ -587,7 +589,7 @@ int bwa_set_rg(const char *s) void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); - int i, n_seqs, tot_seqs = 0, m_aln, read_flag = 0; + int i, n_seqs, tot_seqs = 0, m_aln; bwt_aln1_t *aln = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; @@ -611,9 +613,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f // set ks ks = bwa_open_reads(opt.mode, fn_fa); // core loop - read_flag |= (opt.mode & BWA_MODE_COMPREAD)? 1 : 0; - read_flag |= ((opt.mode & BWA_MODE_IL13)? 1 : 0)<<1; - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, read_flag, opt.trim_qual)) != 0) { + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); diff --git a/bwaseqio.c b/bwaseqio.c index 10ff83b..94271b9 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -1,4 +1,5 @@ #include +#include #include "bwtaln.h" #include "utils.h" #include "bamlite.h" @@ -139,20 +140,41 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com return seqs; } -bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int flag, int trim_qual) +#define BARCODE_LOW_QUAL 13 + +bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual) { bwa_seq_t *seqs, *p; kseq_t *seq = bs->ks; - int n_seqs, l, i, is_comp = flag&1, is_64 = flag&2; + int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; - if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); + if (l_bc > 15) { + fprintf(stderr, "[%s] the maximum barcode length is 15.\n", __func__); + return 0; + } + if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; + if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length p = &seqs[n_seqs++]; + if (l_bc) { // then trim barcode + for (i = 0; i < l_bc; ++i) + p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]); + p->bc[i] = 0; + for (; i < seq->seq.l; ++i) + seq->seq.s[i - l_bc] = seq->seq.s[i]; + seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0; + if (seq->qual.l) { + for (i = l_bc; i < seq->qual.l; ++i) + seq->qual.s[i - l_bc] = seq->qual.s[i]; + seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0; + } + l = seq->seq.l; + } else p->bc[0] = 0; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; diff --git a/bwtaln.c b/bwtaln.c index 3aa9a5c..bd2aad2 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -172,7 +172,7 @@ bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa) void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) { - int i, n_seqs, tot_seqs = 0, read_flag = 0; + int i, n_seqs, tot_seqs = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; @@ -190,9 +190,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) // core loop fwrite(opt, sizeof(gap_opt_t), 1, stdout); - read_flag |= (opt->mode & BWA_MODE_COMPREAD)? 1 : 0; - read_flag |= ((opt->mode & BWA_MODE_IL13)? 1 : 0)<<1; - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, read_flag, opt->trim_qual)) != 0) { + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); @@ -248,7 +246,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012I")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -276,6 +274,7 @@ int bwa_aln(int argc, char *argv[]) case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } } @@ -303,6 +302,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); + fprintf(stderr, " -B INT length of barcode\n"); fprintf(stderr, " -c input sequences are in the color space\n"); fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); diff --git a/bwtaln.h b/bwtaln.h index 9e841f8..f659ac8 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -74,6 +74,8 @@ typedef struct { bwa_cigar_t *cigar; // for multi-threading only int tid; + // barcode + char bc[16]; // null terminated; up to 15 bases // NM and MD tags uint32_t full_len:20, nm:12; char *md; @@ -91,7 +93,7 @@ typedef struct { typedef struct { int s_mm, s_gapo, s_gape; - int mode; + int mode; // bit 24-31 are the barcode length int indel_end_skip, max_del_occ, max_entries; float fnr; int max_diff, max_gapo, max_gape; @@ -126,7 +128,7 @@ extern "C" { bwa_seqio_t *bwa_bam_open(const char *fn, int which); void bwa_seq_close(bwa_seqio_t *bs); void seq_reverse(int len, ubyte_t *seq, int is_comp); - bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int is_comp, int trim_qual); + bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int mode, int trim_qual); void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs); int bwa_cal_maxdiff(int l, double err, double thres); diff --git a/main.c b/main.c index 25e32e8..1b9d7f2 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "main.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9rc1-r10" +#define PACKAGE_VERSION "0.5.9rc1-r11" #endif static int usage()