From 36cd4f9882cec5b35836cc1ee09ced2aa9887a9d Mon Sep 17 00:00:00 2001 From: RoelKluin Date: Fri, 8 Jul 2011 16:49:09 +0200 Subject: [PATCH] In Casava 1.8 the fastq output changed, the name had a space which bwa wasn't parsing correctly. This patch fixes that and enables bwa to filter sequences marked by Casava, removing this tag from the output. Signed-off-by: RoelKluin --- bwaseqio.c | 19 +++++++++++++++++++ bwtaln.c | 4 +++- bwtaln.h | 1 + kseq.h | 2 +- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/bwaseqio.c b/bwaseqio.c index 12ac765..ac29ba7 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -157,6 +157,25 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { + // skip reads that are marked to be filtered by Casava + if (mode & BWA_MODE_CFY) { + char *s = rindex(seq->name.s, ' '); + if (s) { + *s = '\0'; + for(++s; *s != '\0'; ++s) { + if (*s == ':') { + ++s; + break; + } + } + if (*s == 'Y') + continue; + } + if (!s || *s != 'N') { + fprintf(stderr, "No Casava filter character found.\n"); + return 0; + } + } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length diff --git a/bwtaln.c b/bwtaln.c index 905c2d2..de6b001 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -233,7 +233,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -261,6 +261,7 @@ int bwa_aln(int argc, char *argv[]) case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } @@ -298,6 +299,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); + fprintf(stderr, " -Y filter Casava-filtered sequences\n"); fprintf(stderr, "\n"); return 1; } diff --git a/bwtaln.h b/bwtaln.h index 02f54f7..c2faa98 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -86,6 +86,7 @@ typedef struct { #define BWA_MODE_GAPE 0x01 #define BWA_MODE_COMPREAD 0x02 #define BWA_MODE_LOGGAP 0x04 +#define BWA_MODE_CFY 0x08 #define BWA_MODE_NONSTOP 0x10 #define BWA_MODE_BAM 0x20 #define BWA_MODE_BAM_SE 0x40 diff --git a/kseq.h b/kseq.h index ad8937c..f44ac31 100644 --- a/kseq.h +++ b/kseq.h @@ -102,7 +102,7 @@ typedef struct __kstring_t { if (ks->buf[i] == delimiter) break; \ } else { \ for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ + if (isspace(ks->buf[i]) && (ks->buf[i] != ' ')) break; \ } \ if (str->m - str->l < i - ks->begin + 1) { \ str->m = str->l + (i - ks->begin) + 1; \