Added barcode support
This commit is contained in:
parent
10721ca602
commit
51d354cd28
|
|
@ -1,3 +1,4 @@
|
||||||
*.[oa]
|
*.[oa]
|
||||||
bwa
|
bwa
|
||||||
test
|
test
|
||||||
|
.*.swp
|
||||||
|
|
|
||||||
11
bwape.c
11
bwape.c
|
|
@ -645,13 +645,13 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
|
||||||
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
|
void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
|
||||||
{
|
{
|
||||||
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
||||||
int i, j, n_seqs, tot_seqs = 0, read_flag = 0;
|
int i, j, n_seqs, tot_seqs = 0;
|
||||||
bwa_seq_t *seqs[2];
|
bwa_seq_t *seqs[2];
|
||||||
bwa_seqio_t *ks[2];
|
bwa_seqio_t *ks[2];
|
||||||
clock_t t;
|
clock_t t;
|
||||||
bntseq_t *bns, *ntbns = 0;
|
bntseq_t *bns, *ntbns = 0;
|
||||||
FILE *fp_sa[2];
|
FILE *fp_sa[2];
|
||||||
gap_opt_t opt;
|
gap_opt_t opt, opt0;
|
||||||
khint_t iter;
|
khint_t iter;
|
||||||
isize_info_t last_ii; // this is for the last batch of reads
|
isize_info_t last_ii; // this is for the last batch of reads
|
||||||
char str[1024];
|
char str[1024];
|
||||||
|
|
@ -671,6 +671,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
||||||
|
|
||||||
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
|
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
|
||||||
ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
|
ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
|
||||||
|
opt0 = opt;
|
||||||
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
|
fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
|
||||||
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
|
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
|
||||||
if (!(opt.mode & BWA_MODE_COMPREAD)) {
|
if (!(opt.mode & BWA_MODE_COMPREAD)) {
|
||||||
|
|
@ -691,14 +692,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
||||||
// core loop
|
// core loop
|
||||||
bwa_print_sam_SQ(bns);
|
bwa_print_sam_SQ(bns);
|
||||||
bwa_print_sam_PG();
|
bwa_print_sam_PG();
|
||||||
read_flag |= (opt.mode & BWA_MODE_COMPREAD)? 1 : 0;
|
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
|
||||||
read_flag |= ((opt.mode & BWA_MODE_IL13)? 1 : 0)<<1;
|
|
||||||
while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, read_flag, opt.trim_qual)) != 0) {
|
|
||||||
int cnt_chg;
|
int cnt_chg;
|
||||||
isize_info_t ii;
|
isize_info_t ii;
|
||||||
ubyte_t *pacseq;
|
ubyte_t *pacseq;
|
||||||
|
|
||||||
seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, read_flag, opt.trim_qual);
|
seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual);
|
||||||
tot_seqs += n_seqs;
|
tot_seqs += n_seqs;
|
||||||
t = clock();
|
t = clock();
|
||||||
|
|
||||||
|
|
|
||||||
8
bwase.c
8
bwase.c
|
|
@ -472,6 +472,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
|
||||||
} else printf("*");
|
} else printf("*");
|
||||||
|
|
||||||
if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
|
if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
|
||||||
|
if (p->bc[0]) printf("\tBC:Z:%s", p->bc);
|
||||||
if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
|
if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
|
||||||
if (p->type != BWA_TYPE_NO_MATCH) {
|
if (p->type != BWA_TYPE_NO_MATCH) {
|
||||||
int i;
|
int i;
|
||||||
|
|
@ -519,6 +520,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
|
||||||
printf("%s", p->qual);
|
printf("%s", p->qual);
|
||||||
} else printf("*");
|
} else printf("*");
|
||||||
if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
|
if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id);
|
||||||
|
if (p->bc[0]) printf("\tBC:Z:%s", p->bc);
|
||||||
if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
|
if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len);
|
||||||
putchar('\n');
|
putchar('\n');
|
||||||
}
|
}
|
||||||
|
|
@ -587,7 +589,7 @@ int bwa_set_rg(const char *s)
|
||||||
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
|
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
|
||||||
{
|
{
|
||||||
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
|
||||||
int i, n_seqs, tot_seqs = 0, m_aln, read_flag = 0;
|
int i, n_seqs, tot_seqs = 0, m_aln;
|
||||||
bwt_aln1_t *aln = 0;
|
bwt_aln1_t *aln = 0;
|
||||||
bwa_seq_t *seqs;
|
bwa_seq_t *seqs;
|
||||||
bwa_seqio_t *ks;
|
bwa_seqio_t *ks;
|
||||||
|
|
@ -611,9 +613,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
||||||
// set ks
|
// set ks
|
||||||
ks = bwa_open_reads(opt.mode, fn_fa);
|
ks = bwa_open_reads(opt.mode, fn_fa);
|
||||||
// core loop
|
// core loop
|
||||||
read_flag |= (opt.mode & BWA_MODE_COMPREAD)? 1 : 0;
|
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
|
||||||
read_flag |= ((opt.mode & BWA_MODE_IL13)? 1 : 0)<<1;
|
|
||||||
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, read_flag, opt.trim_qual)) != 0) {
|
|
||||||
tot_seqs += n_seqs;
|
tot_seqs += n_seqs;
|
||||||
t = clock();
|
t = clock();
|
||||||
|
|
||||||
|
|
|
||||||
28
bwaseqio.c
28
bwaseqio.c
|
|
@ -1,4 +1,5 @@
|
||||||
#include <zlib.h>
|
#include <zlib.h>
|
||||||
|
#include <ctype.h>
|
||||||
#include "bwtaln.h"
|
#include "bwtaln.h"
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "bamlite.h"
|
#include "bamlite.h"
|
||||||
|
|
@ -139,20 +140,41 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com
|
||||||
return seqs;
|
return seqs;
|
||||||
}
|
}
|
||||||
|
|
||||||
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int flag, int trim_qual)
|
#define BARCODE_LOW_QUAL 13
|
||||||
|
|
||||||
|
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
|
||||||
{
|
{
|
||||||
bwa_seq_t *seqs, *p;
|
bwa_seq_t *seqs, *p;
|
||||||
kseq_t *seq = bs->ks;
|
kseq_t *seq = bs->ks;
|
||||||
int n_seqs, l, i, is_comp = flag&1, is_64 = flag&2;
|
int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
|
||||||
long n_trimmed = 0, n_tot = 0;
|
long n_trimmed = 0, n_tot = 0;
|
||||||
|
|
||||||
if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual);
|
if (l_bc > 15) {
|
||||||
|
fprintf(stderr, "[%s] the maximum barcode length is 15.\n", __func__);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
|
||||||
n_seqs = 0;
|
n_seqs = 0;
|
||||||
seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
|
seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
|
||||||
while ((l = kseq_read(seq)) >= 0) {
|
while ((l = kseq_read(seq)) >= 0) {
|
||||||
if (is_64 && seq->qual.l)
|
if (is_64 && seq->qual.l)
|
||||||
for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
|
for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
|
||||||
|
if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
|
||||||
p = &seqs[n_seqs++];
|
p = &seqs[n_seqs++];
|
||||||
|
if (l_bc) { // then trim barcode
|
||||||
|
for (i = 0; i < l_bc; ++i)
|
||||||
|
p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]);
|
||||||
|
p->bc[i] = 0;
|
||||||
|
for (; i < seq->seq.l; ++i)
|
||||||
|
seq->seq.s[i - l_bc] = seq->seq.s[i];
|
||||||
|
seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
|
||||||
|
if (seq->qual.l) {
|
||||||
|
for (i = l_bc; i < seq->qual.l; ++i)
|
||||||
|
seq->qual.s[i - l_bc] = seq->qual.s[i];
|
||||||
|
seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
|
||||||
|
}
|
||||||
|
l = seq->seq.l;
|
||||||
|
} else p->bc[0] = 0;
|
||||||
p->tid = -1; // no assigned to a thread
|
p->tid = -1; // no assigned to a thread
|
||||||
p->qual = 0;
|
p->qual = 0;
|
||||||
p->full_len = p->clip_len = p->len = l;
|
p->full_len = p->clip_len = p->len = l;
|
||||||
|
|
|
||||||
10
bwtaln.c
10
bwtaln.c
|
|
@ -172,7 +172,7 @@ bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
|
||||||
|
|
||||||
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
||||||
{
|
{
|
||||||
int i, n_seqs, tot_seqs = 0, read_flag = 0;
|
int i, n_seqs, tot_seqs = 0;
|
||||||
bwa_seq_t *seqs;
|
bwa_seq_t *seqs;
|
||||||
bwa_seqio_t *ks;
|
bwa_seqio_t *ks;
|
||||||
clock_t t;
|
clock_t t;
|
||||||
|
|
@ -190,9 +190,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
||||||
|
|
||||||
// core loop
|
// core loop
|
||||||
fwrite(opt, sizeof(gap_opt_t), 1, stdout);
|
fwrite(opt, sizeof(gap_opt_t), 1, stdout);
|
||||||
read_flag |= (opt->mode & BWA_MODE_COMPREAD)? 1 : 0;
|
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) {
|
||||||
read_flag |= ((opt->mode & BWA_MODE_IL13)? 1 : 0)<<1;
|
|
||||||
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, read_flag, opt->trim_qual)) != 0) {
|
|
||||||
tot_seqs += n_seqs;
|
tot_seqs += n_seqs;
|
||||||
t = clock();
|
t = clock();
|
||||||
|
|
||||||
|
|
@ -248,7 +246,7 @@ int bwa_aln(int argc, char *argv[])
|
||||||
gap_opt_t *opt;
|
gap_opt_t *opt;
|
||||||
|
|
||||||
opt = gap_init_opt();
|
opt = gap_init_opt();
|
||||||
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012I")) >= 0) {
|
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) {
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 'n':
|
case 'n':
|
||||||
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
|
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
|
||||||
|
|
@ -276,6 +274,7 @@ int bwa_aln(int argc, char *argv[])
|
||||||
case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
|
case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
|
||||||
case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
|
case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
|
||||||
case 'I': opt->mode |= BWA_MODE_IL13; break;
|
case 'I': opt->mode |= BWA_MODE_IL13; break;
|
||||||
|
case 'B': opt->mode |= atoi(optarg) << 24; break;
|
||||||
default: return 1;
|
default: return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -303,6 +302,7 @@ int bwa_aln(int argc, char *argv[])
|
||||||
fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
|
fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
|
||||||
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
|
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
|
||||||
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
|
fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
|
||||||
|
fprintf(stderr, " -B INT length of barcode\n");
|
||||||
fprintf(stderr, " -c input sequences are in the color space\n");
|
fprintf(stderr, " -c input sequences are in the color space\n");
|
||||||
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
|
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
|
||||||
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
|
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
|
||||||
|
|
|
||||||
6
bwtaln.h
6
bwtaln.h
|
|
@ -74,6 +74,8 @@ typedef struct {
|
||||||
bwa_cigar_t *cigar;
|
bwa_cigar_t *cigar;
|
||||||
// for multi-threading only
|
// for multi-threading only
|
||||||
int tid;
|
int tid;
|
||||||
|
// barcode
|
||||||
|
char bc[16]; // null terminated; up to 15 bases
|
||||||
// NM and MD tags
|
// NM and MD tags
|
||||||
uint32_t full_len:20, nm:12;
|
uint32_t full_len:20, nm:12;
|
||||||
char *md;
|
char *md;
|
||||||
|
|
@ -91,7 +93,7 @@ typedef struct {
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int s_mm, s_gapo, s_gape;
|
int s_mm, s_gapo, s_gape;
|
||||||
int mode;
|
int mode; // bit 24-31 are the barcode length
|
||||||
int indel_end_skip, max_del_occ, max_entries;
|
int indel_end_skip, max_del_occ, max_entries;
|
||||||
float fnr;
|
float fnr;
|
||||||
int max_diff, max_gapo, max_gape;
|
int max_diff, max_gapo, max_gape;
|
||||||
|
|
@ -126,7 +128,7 @@ extern "C" {
|
||||||
bwa_seqio_t *bwa_bam_open(const char *fn, int which);
|
bwa_seqio_t *bwa_bam_open(const char *fn, int which);
|
||||||
void bwa_seq_close(bwa_seqio_t *bs);
|
void bwa_seq_close(bwa_seqio_t *bs);
|
||||||
void seq_reverse(int len, ubyte_t *seq, int is_comp);
|
void seq_reverse(int len, ubyte_t *seq, int is_comp);
|
||||||
bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int is_comp, int trim_qual);
|
bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int mode, int trim_qual);
|
||||||
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
|
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
|
||||||
|
|
||||||
int bwa_cal_maxdiff(int l, double err, double thres);
|
int bwa_cal_maxdiff(int l, double err, double thres);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue