From 36cd4f9882cec5b35836cc1ee09ced2aa9887a9d Mon Sep 17 00:00:00 2001 From: RoelKluin Date: Fri, 8 Jul 2011 16:49:09 +0200 Subject: [PATCH 1/3] In Casava 1.8 the fastq output changed, the name had a space which bwa wasn't parsing correctly. This patch fixes that and enables bwa to filter sequences marked by Casava, removing this tag from the output. Signed-off-by: RoelKluin --- bwaseqio.c | 19 +++++++++++++++++++ bwtaln.c | 4 +++- bwtaln.h | 1 + kseq.h | 2 +- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/bwaseqio.c b/bwaseqio.c index 12ac765..ac29ba7 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -157,6 +157,25 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { + // skip reads that are marked to be filtered by Casava + if (mode & BWA_MODE_CFY) { + char *s = rindex(seq->name.s, ' '); + if (s) { + *s = '\0'; + for(++s; *s != '\0'; ++s) { + if (*s == ':') { + ++s; + break; + } + } + if (*s == 'Y') + continue; + } + if (!s || *s != 'N') { + fprintf(stderr, "No Casava filter character found.\n"); + return 0; + } + } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length diff --git a/bwtaln.c b/bwtaln.c index 905c2d2..de6b001 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -233,7 +233,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -261,6 +261,7 @@ int bwa_aln(int argc, char *argv[]) case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } @@ -298,6 +299,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); + fprintf(stderr, " -Y filter Casava-filtered sequences\n"); fprintf(stderr, "\n"); return 1; } diff --git a/bwtaln.h b/bwtaln.h index 02f54f7..c2faa98 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -86,6 +86,7 @@ typedef struct { #define BWA_MODE_GAPE 0x01 #define BWA_MODE_COMPREAD 0x02 #define BWA_MODE_LOGGAP 0x04 +#define BWA_MODE_CFY 0x08 #define BWA_MODE_NONSTOP 0x10 #define BWA_MODE_BAM 0x20 #define BWA_MODE_BAM_SE 0x40 diff --git a/kseq.h b/kseq.h index ad8937c..f44ac31 100644 --- a/kseq.h +++ b/kseq.h @@ -102,7 +102,7 @@ typedef struct __kstring_t { if (ks->buf[i] == delimiter) break; \ } else { \ for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ + if (isspace(ks->buf[i]) && (ks->buf[i] != ' ')) break; \ } \ if (str->m - str->l < i - ks->begin + 1) { \ str->m = str->l + (i - ks->begin) + 1; \ From 8f115a8e00bd6f5c38292f04786f6ba1fd955a65 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sun, 10 Jul 2011 16:40:42 +0200 Subject: [PATCH 2/3] Revert "In Casava 1.8 the fastq output changed, the name had a space which bwa" This reverts commit 36cd4f9882cec5b35836cc1ee09ced2aa9887a9d. The comment shouldn't be included in the sequence name. --- bwaseqio.c | 19 ------------------- bwtaln.c | 4 +--- bwtaln.h | 1 - kseq.h | 2 +- 4 files changed, 2 insertions(+), 24 deletions(-) diff --git a/bwaseqio.c b/bwaseqio.c index ac29ba7..12ac765 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -157,25 +157,6 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { - // skip reads that are marked to be filtered by Casava - if (mode & BWA_MODE_CFY) { - char *s = rindex(seq->name.s, ' '); - if (s) { - *s = '\0'; - for(++s; *s != '\0'; ++s) { - if (*s == ':') { - ++s; - break; - } - } - if (*s == 'Y') - continue; - } - if (!s || *s != 'N') { - fprintf(stderr, "No Casava filter character found.\n"); - return 0; - } - } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length diff --git a/bwtaln.c b/bwtaln.c index de6b001..905c2d2 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -233,7 +233,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -261,7 +261,6 @@ int bwa_aln(int argc, char *argv[]) case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; - case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } @@ -299,7 +298,6 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); - fprintf(stderr, " -Y filter Casava-filtered sequences\n"); fprintf(stderr, "\n"); return 1; } diff --git a/bwtaln.h b/bwtaln.h index c2faa98..02f54f7 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -86,7 +86,6 @@ typedef struct { #define BWA_MODE_GAPE 0x01 #define BWA_MODE_COMPREAD 0x02 #define BWA_MODE_LOGGAP 0x04 -#define BWA_MODE_CFY 0x08 #define BWA_MODE_NONSTOP 0x10 #define BWA_MODE_BAM 0x20 #define BWA_MODE_BAM_SE 0x40 diff --git a/kseq.h b/kseq.h index f44ac31..ad8937c 100644 --- a/kseq.h +++ b/kseq.h @@ -102,7 +102,7 @@ typedef struct __kstring_t { if (ks->buf[i] == delimiter) break; \ } else { \ for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && (ks->buf[i] != ' ')) break; \ + if (isspace(ks->buf[i])) break; \ } \ if (str->m - str->l < i - ks->begin + 1) { \ str->m = str->l + (i - ks->begin) + 1; \ From db59a605d1b31ec20645676c4f58d0af21198fec Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sun, 10 Jul 2011 17:04:06 +0200 Subject: [PATCH 3/3] Remove sequences marked to be filtered by Casava-1.8 with bwa aln -Y In Casava 1.8 the fastq output changed. e.g. @EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + BBBBCCCC?::: With `Y' Casava indicates that a sequence should be filtered. This patch enables bwa, with an -Y flag, to filter these sequences. Signed-off-by: Roel Kluin --- bwaseqio.c | 7 +++++++ bwtaln.c | 4 +++- bwtaln.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bwaseqio.c b/bwaseqio.c index 12ac765..600754e 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -157,6 +157,13 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { + if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { + // skip reads that are marked to be filtered by Casava + char *s = index(seq->comment.s, ':'); + if (s && *(++s) == 'Y') { + continue; + } + } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length diff --git a/bwtaln.c b/bwtaln.c index 905c2d2..de6b001 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -233,7 +233,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -261,6 +261,7 @@ int bwa_aln(int argc, char *argv[]) case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } @@ -298,6 +299,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); + fprintf(stderr, " -Y filter Casava-filtered sequences\n"); fprintf(stderr, "\n"); return 1; } diff --git a/bwtaln.h b/bwtaln.h index 02f54f7..c2faa98 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -86,6 +86,7 @@ typedef struct { #define BWA_MODE_GAPE 0x01 #define BWA_MODE_COMPREAD 0x02 #define BWA_MODE_LOGGAP 0x04 +#define BWA_MODE_CFY 0x08 #define BWA_MODE_NONSTOP 0x10 #define BWA_MODE_BAM 0x20 #define BWA_MODE_BAM_SE 0x40