2012-04-07 12:23:01 +08:00
|
|
|
#include <string.h>
|
|
|
|
|
#include <stdio.h>
|
2013-02-24 04:30:46 +08:00
|
|
|
#include <zlib.h>
|
2013-02-26 13:03:49 +08:00
|
|
|
#include <assert.h>
|
2012-04-07 12:23:01 +08:00
|
|
|
#include "bntseq.h"
|
2013-02-24 04:30:46 +08:00
|
|
|
#include "bwa.h"
|
|
|
|
|
#include "ksw.h"
|
2013-02-24 05:41:44 +08:00
|
|
|
#include "utils.h"
|
2014-01-30 01:05:11 +08:00
|
|
|
#include "kstring.h"
|
2012-04-07 12:23:01 +08:00
|
|
|
|
2013-05-02 22:12:01 +08:00
|
|
|
#ifdef USE_MALLOC_WRAPPERS
|
|
|
|
|
# include "malloc_wrap.h"
|
|
|
|
|
#endif
|
|
|
|
|
|
2013-02-24 05:10:48 +08:00
|
|
|
int bwa_verbose = 3;
|
2013-02-24 05:41:44 +08:00
|
|
|
char bwa_rg_id[256];
|
2012-04-07 12:23:01 +08:00
|
|
|
|
2013-02-24 04:30:46 +08:00
|
|
|
/************************
|
|
|
|
|
* Batch FASTA/Q reader *
|
|
|
|
|
************************/
|
2012-04-07 12:23:01 +08:00
|
|
|
|
2013-02-24 04:30:46 +08:00
|
|
|
#include "kseq.h"
|
|
|
|
|
KSEQ_DECLARE(gzFile)
|
2012-04-07 12:23:01 +08:00
|
|
|
|
2013-02-24 04:30:46 +08:00
|
|
|
static inline void trim_readno(kstring_t *s)
|
2012-04-07 12:23:01 +08:00
|
|
|
{
|
2013-02-24 04:30:46 +08:00
|
|
|
if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
|
|
|
|
|
s->l -= 2, s->s[s->l] = 0;
|
2012-04-07 12:23:01 +08:00
|
|
|
}
|
|
|
|
|
|
2013-02-24 04:30:46 +08:00
|
|
|
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
|
|
|
|
|
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
|
2013-05-02 22:12:01 +08:00
|
|
|
s->name = strdup(ks->name.s);
|
|
|
|
|
s->comment = ks->comment.l? strdup(ks->comment.s) : 0;
|
|
|
|
|
s->seq = strdup(ks->seq.s);
|
|
|
|
|
s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
|
2013-02-24 04:30:46 +08:00
|
|
|
s->l_seq = strlen(s->seq);
|
2012-04-07 12:23:01 +08:00
|
|
|
}
|
|
|
|
|
|
2013-02-24 04:30:46 +08:00
|
|
|
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
2012-04-07 12:23:01 +08:00
|
|
|
{
|
2013-02-24 04:30:46 +08:00
|
|
|
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
|
|
|
|
|
int size = 0, m, n;
|
|
|
|
|
bseq1_t *seqs;
|
|
|
|
|
m = n = 0; seqs = 0;
|
|
|
|
|
while (kseq_read(ks) >= 0) {
|
|
|
|
|
if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
|
|
|
|
|
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
if (n >= m) {
|
|
|
|
|
m = m? m<<1 : 256;
|
2013-05-02 22:12:01 +08:00
|
|
|
seqs = realloc(seqs, m * sizeof(bseq1_t));
|
2013-02-24 04:30:46 +08:00
|
|
|
}
|
|
|
|
|
trim_readno(&ks->name);
|
|
|
|
|
kseq2bseq1(ks, &seqs[n]);
|
|
|
|
|
size += seqs[n++].l_seq;
|
|
|
|
|
if (ks2) {
|
|
|
|
|
trim_readno(&ks2->name);
|
|
|
|
|
kseq2bseq1(ks2, &seqs[n]);
|
|
|
|
|
size += seqs[n++].l_seq;
|
|
|
|
|
}
|
2013-03-08 00:00:15 +08:00
|
|
|
if (size >= chunk_size && (n&1) == 0) break;
|
2013-02-24 04:30:46 +08:00
|
|
|
}
|
|
|
|
|
if (size == 0) { // test if the 2nd file is finished
|
|
|
|
|
if (ks2 && kseq_read(ks2) >= 0)
|
|
|
|
|
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
|
|
|
|
|
}
|
|
|
|
|
*n_ = n;
|
|
|
|
|
return seqs;
|
2012-04-07 12:23:01 +08:00
|
|
|
}
|
|
|
|
|
|
2013-02-26 11:49:15 +08:00
|
|
|
/*****************
|
|
|
|
|
* CIGAR related *
|
|
|
|
|
*****************/
|
2012-04-07 12:23:01 +08:00
|
|
|
|
2013-03-05 22:38:12 +08:00
|
|
|
void bwa_fill_scmat(int a, int b, int8_t mat[25])
|
|
|
|
|
{
|
|
|
|
|
int i, j, k;
|
|
|
|
|
for (i = k = 0; i < 4; ++i) {
|
|
|
|
|
for (j = 0; j < 4; ++j)
|
|
|
|
|
mat[k++] = i == j? a : -b;
|
2013-03-10 07:03:57 +08:00
|
|
|
mat[k++] = -1; // ambiguous base
|
2013-03-05 22:38:12 +08:00
|
|
|
}
|
2013-03-10 07:03:57 +08:00
|
|
|
for (j = 0; j < 5; ++j) mat[k++] = -1;
|
2013-03-05 22:38:12 +08:00
|
|
|
}
|
|
|
|
|
|
2013-02-24 04:30:46 +08:00
|
|
|
// Generate CIGAR when the alignment end points are known
|
2013-02-27 02:36:01 +08:00
|
|
|
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
|
2012-04-07 12:23:01 +08:00
|
|
|
{
|
2013-02-24 04:30:46 +08:00
|
|
|
uint32_t *cigar = 0;
|
|
|
|
|
uint8_t tmp, *rseq;
|
2013-02-28 12:59:50 +08:00
|
|
|
int i;
|
2013-02-24 04:30:46 +08:00
|
|
|
int64_t rlen;
|
2014-01-30 01:05:11 +08:00
|
|
|
kstring_t str;
|
2014-02-18 23:32:24 +08:00
|
|
|
const char *int2base;
|
2014-01-30 01:05:11 +08:00
|
|
|
|
2013-02-27 02:36:01 +08:00
|
|
|
*n_cigar = 0; *NM = -1;
|
2013-02-24 04:30:46 +08:00
|
|
|
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
|
|
|
|
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
|
|
|
|
if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
|
|
|
|
|
if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
|
|
|
|
|
for (i = 0; i < l_query>>1; ++i)
|
|
|
|
|
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
|
|
|
|
for (i = 0; i < rlen>>1; ++i)
|
|
|
|
|
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
|
2012-04-07 12:23:01 +08:00
|
|
|
}
|
2013-02-28 12:59:50 +08:00
|
|
|
if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
|
2014-03-17 11:25:04 +08:00
|
|
|
// FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
|
2013-05-02 22:12:01 +08:00
|
|
|
cigar = malloc(4);
|
2013-02-28 12:59:50 +08:00
|
|
|
cigar[0] = l_query<<4 | 0;
|
|
|
|
|
*n_cigar = 1;
|
|
|
|
|
for (i = 0, *score = 0; i < l_query; ++i)
|
|
|
|
|
*score += mat[rseq[i]*5 + query[i]];
|
|
|
|
|
} else {
|
|
|
|
|
int w, max_gap, min_w;
|
|
|
|
|
// set the band-width
|
|
|
|
|
max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
|
|
|
|
|
max_gap = max_gap > 1? max_gap : 1;
|
|
|
|
|
w = (max_gap + abs(rlen - l_query) + 1) >> 1;
|
|
|
|
|
w = w < w_? w : w_;
|
|
|
|
|
min_w = abs(rlen - l_query) + 3;
|
|
|
|
|
w = w > min_w? w : min_w;
|
|
|
|
|
// NW alignment
|
2014-03-17 11:25:04 +08:00
|
|
|
if (bwa_verbose >= 4) {
|
|
|
|
|
printf("* Global bandwidth: %d\n", w);
|
|
|
|
|
printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
|
|
|
|
|
printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
|
|
|
|
|
}
|
2013-02-28 12:59:50 +08:00
|
|
|
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
|
|
|
|
|
}
|
2014-02-19 23:08:43 +08:00
|
|
|
{// compute NM and MD
|
2014-01-30 01:05:11 +08:00
|
|
|
int k, x, y, u, n_mm = 0, n_gap = 0;
|
|
|
|
|
str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR
|
2014-02-18 23:32:24 +08:00
|
|
|
int2base = rb < l_pac? "ACGTN" : "TGCAN";
|
2014-01-30 01:05:11 +08:00
|
|
|
for (k = 0, x = y = u = 0; k < *n_cigar; ++k) {
|
2014-02-01 01:58:21 +08:00
|
|
|
int op, len;
|
|
|
|
|
cigar = (uint32_t*)str.s;
|
|
|
|
|
op = cigar[k]&0xf, len = cigar[k]>>4;
|
2013-02-27 02:36:01 +08:00
|
|
|
if (op == 0) { // match
|
2014-01-30 01:05:11 +08:00
|
|
|
for (i = 0; i < len; ++i) {
|
|
|
|
|
if (query[x + i] != rseq[y + i]) {
|
2014-02-18 23:32:24 +08:00
|
|
|
kputw(u, &str);
|
|
|
|
|
kputc(int2base[rseq[y+i]], &str);
|
2014-01-30 01:05:11 +08:00
|
|
|
++n_mm; u = 0;
|
|
|
|
|
} else ++u;
|
|
|
|
|
}
|
2013-02-27 02:36:01 +08:00
|
|
|
x += len; y += len;
|
2014-01-30 01:05:11 +08:00
|
|
|
} else if (op == 2) { // deletion
|
2014-02-21 02:06:40 +08:00
|
|
|
if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR
|
2014-02-20 00:31:54 +08:00
|
|
|
kputw(u, &str); kputc('^', &str);
|
|
|
|
|
for (i = 0; i < len; ++i)
|
|
|
|
|
kputc(int2base[rseq[y+i]], &str);
|
|
|
|
|
u = 0; n_gap += len;
|
|
|
|
|
}
|
|
|
|
|
y += len;
|
2014-01-30 01:05:11 +08:00
|
|
|
} else if (op == 1) x += len, n_gap += len; // insertion
|
2012-04-08 10:00:03 +08:00
|
|
|
}
|
2014-01-30 01:05:11 +08:00
|
|
|
kputw(u, &str); kputc(0, &str);
|
2013-02-27 02:36:01 +08:00
|
|
|
*NM = n_mm + n_gap;
|
2014-01-30 01:05:11 +08:00
|
|
|
cigar = (uint32_t*)str.s;
|
2012-04-08 10:00:03 +08:00
|
|
|
}
|
2013-02-24 04:30:46 +08:00
|
|
|
if (rb >= l_pac) // reverse back query
|
|
|
|
|
for (i = 0; i < l_query>>1; ++i)
|
|
|
|
|
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
|
|
|
|
|
|
|
|
|
ret_gen_cigar:
|
|
|
|
|
free(rseq);
|
|
|
|
|
return cigar;
|
2012-04-08 10:00:03 +08:00
|
|
|
}
|
|
|
|
|
|
2013-02-26 13:03:49 +08:00
|
|
|
int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
|
2012-04-07 13:25:39 +08:00
|
|
|
{
|
2013-05-23 06:57:51 +08:00
|
|
|
int is_rev;
|
|
|
|
|
int64_t cb, ce, fm;
|
|
|
|
|
bntann1_t *ra;
|
2013-03-19 08:49:32 +08:00
|
|
|
if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here
|
2013-02-26 13:03:49 +08:00
|
|
|
*qb = *qe = *rb = *re = -1;
|
|
|
|
|
return -1; // unable to fix
|
2012-04-07 13:25:39 +08:00
|
|
|
}
|
2013-05-23 06:57:51 +08:00
|
|
|
fm = bns_depos(bns, (*rb + *re) >> 1, &is_rev); // coordinate of the middle point on the forward strand
|
|
|
|
|
ra = &bns->anns[bns_pos2rid(bns, fm)]; // annotation of chr corresponding to the middle point
|
|
|
|
|
cb = is_rev? (bns->l_pac<<1) - (ra->offset + ra->len) : ra->offset; // chr start on the mapping strand
|
|
|
|
|
ce = cb + ra->len; // chr end
|
|
|
|
|
if (cb > *rb || ce < *re) { // fix is needed
|
2013-02-27 02:36:01 +08:00
|
|
|
int i, score, n_cigar, y, NM;
|
2013-02-26 13:03:49 +08:00
|
|
|
uint32_t *cigar;
|
|
|
|
|
int64_t x;
|
2013-05-23 06:57:51 +08:00
|
|
|
cb = cb > *rb? cb : *rb;
|
|
|
|
|
ce = ce < *re? ce : *re;
|
2013-02-27 02:36:01 +08:00
|
|
|
cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
|
2013-02-26 13:03:49 +08:00
|
|
|
for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
|
|
|
|
|
int op = cigar[i]&0xf, len = cigar[i]>>4;
|
|
|
|
|
if (op == 0) {
|
2013-05-23 06:57:51 +08:00
|
|
|
if (x <= cb && cb < x + len)
|
|
|
|
|
*qb = y + (cb - x), *rb = cb;
|
|
|
|
|
if (x < ce && ce <= x + len) {
|
|
|
|
|
*qe = y + (ce - x), *re = ce;
|
2013-02-26 13:03:49 +08:00
|
|
|
break;
|
|
|
|
|
} else x += len, y += len;
|
2013-05-23 06:57:51 +08:00
|
|
|
} else if (op == 1) {
|
2013-02-26 13:03:49 +08:00
|
|
|
y += len;
|
2013-05-23 06:57:51 +08:00
|
|
|
} else if (op == 2) {
|
|
|
|
|
if (x <= cb && cb < x + len)
|
|
|
|
|
*qb = y, *rb = x + len;
|
|
|
|
|
if (x < ce && ce <= x + len) {
|
|
|
|
|
*qe = y, *re = x;
|
2013-02-26 13:03:49 +08:00
|
|
|
break;
|
|
|
|
|
} else x += len;
|
|
|
|
|
} else abort(); // should not be here
|
2012-04-08 10:00:03 +08:00
|
|
|
}
|
2013-02-26 13:03:49 +08:00
|
|
|
free(cigar);
|
2012-04-08 10:00:03 +08:00
|
|
|
}
|
2013-05-23 07:45:16 +08:00
|
|
|
return (*qb == *qe || *rb == *re)? -2 : 0;
|
2012-04-07 13:25:39 +08:00
|
|
|
}
|
2012-04-08 12:02:06 +08:00
|
|
|
|
2013-02-24 04:55:55 +08:00
|
|
|
/*********************
|
|
|
|
|
* Full index reader *
|
|
|
|
|
*********************/
|
2012-04-08 12:55:52 +08:00
|
|
|
|
2013-02-24 05:10:48 +08:00
|
|
|
char *bwa_idx_infer_prefix(const char *hint)
|
2012-04-08 12:02:06 +08:00
|
|
|
{
|
2013-02-24 05:10:48 +08:00
|
|
|
char *prefix;
|
|
|
|
|
int l_hint;
|
|
|
|
|
FILE *fp;
|
|
|
|
|
l_hint = strlen(hint);
|
2013-05-02 22:12:01 +08:00
|
|
|
prefix = malloc(l_hint + 3 + 4 + 1);
|
2013-02-24 05:10:48 +08:00
|
|
|
strcpy(prefix, hint);
|
|
|
|
|
strcpy(prefix + l_hint, ".64.bwt");
|
|
|
|
|
if ((fp = fopen(prefix, "rb")) != 0) {
|
|
|
|
|
fclose(fp);
|
|
|
|
|
prefix[l_hint + 3] = 0;
|
|
|
|
|
return prefix;
|
|
|
|
|
} else {
|
|
|
|
|
strcpy(prefix + l_hint, ".bwt");
|
|
|
|
|
if ((fp = fopen(prefix, "rb")) == 0) {
|
|
|
|
|
free(prefix);
|
|
|
|
|
return 0;
|
|
|
|
|
} else {
|
|
|
|
|
fclose(fp);
|
|
|
|
|
prefix[l_hint] = 0;
|
|
|
|
|
return prefix;
|
2012-04-08 12:02:06 +08:00
|
|
|
}
|
|
|
|
|
}
|
2013-02-24 05:10:48 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bwt_t *bwa_idx_load_bwt(const char *hint)
|
|
|
|
|
{
|
|
|
|
|
char *tmp, *prefix;
|
|
|
|
|
bwt_t *bwt;
|
|
|
|
|
prefix = bwa_idx_infer_prefix(hint);
|
|
|
|
|
if (prefix == 0) {
|
2013-02-24 05:41:44 +08:00
|
|
|
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
2013-02-24 05:10:48 +08:00
|
|
|
return 0;
|
|
|
|
|
}
|
2013-05-02 22:12:01 +08:00
|
|
|
tmp = calloc(strlen(prefix) + 5, 1);
|
2013-02-24 05:10:48 +08:00
|
|
|
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
|
|
|
|
|
bwt = bwt_restore_bwt(tmp);
|
|
|
|
|
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
|
|
|
|
|
bwt_restore_sa(tmp, bwt);
|
|
|
|
|
free(tmp); free(prefix);
|
|
|
|
|
return bwt;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bwaidx_t *bwa_idx_load(const char *hint, int which)
|
2013-02-24 04:55:55 +08:00
|
|
|
{
|
|
|
|
|
bwaidx_t *idx;
|
2013-02-24 05:10:48 +08:00
|
|
|
char *prefix;
|
|
|
|
|
prefix = bwa_idx_infer_prefix(hint);
|
2013-02-24 05:41:44 +08:00
|
|
|
if (prefix == 0) {
|
|
|
|
|
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
|
|
|
|
|
return 0;
|
2012-04-08 12:02:06 +08:00
|
|
|
}
|
2013-05-02 22:12:01 +08:00
|
|
|
idx = calloc(1, sizeof(bwaidx_t));
|
2013-02-24 05:10:48 +08:00
|
|
|
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
|
2013-02-24 04:55:55 +08:00
|
|
|
if (which & BWA_IDX_BNS) {
|
|
|
|
|
idx->bns = bns_restore(prefix);
|
|
|
|
|
if (which & BWA_IDX_PAC) {
|
2013-05-02 22:12:01 +08:00
|
|
|
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
|
2013-03-01 17:37:46 +08:00
|
|
|
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
|
|
|
|
|
err_fclose(idx->bns->fp_pac);
|
2013-02-24 05:10:48 +08:00
|
|
|
idx->bns->fp_pac = 0;
|
2013-02-24 04:55:55 +08:00
|
|
|
}
|
2012-04-08 12:02:06 +08:00
|
|
|
}
|
2013-02-24 05:10:48 +08:00
|
|
|
free(prefix);
|
2013-02-24 04:55:55 +08:00
|
|
|
return idx;
|
2012-04-08 12:02:06 +08:00
|
|
|
}
|
|
|
|
|
|
2013-02-24 04:55:55 +08:00
|
|
|
void bwa_idx_destroy(bwaidx_t *idx)
|
2012-04-08 12:02:06 +08:00
|
|
|
{
|
2013-02-24 04:55:55 +08:00
|
|
|
if (idx == 0) return;
|
|
|
|
|
if (idx->bwt) bwt_destroy(idx->bwt);
|
|
|
|
|
if (idx->bns) bns_destroy(idx->bns);
|
|
|
|
|
if (idx->pac) free(idx->pac);
|
|
|
|
|
free(idx);
|
2012-04-08 12:02:06 +08:00
|
|
|
}
|
2012-04-08 12:55:52 +08:00
|
|
|
|
2013-02-24 05:41:44 +08:00
|
|
|
/***********************
|
|
|
|
|
* SAM header routines *
|
|
|
|
|
***********************/
|
|
|
|
|
|
|
|
|
|
void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
|
|
|
|
|
{
|
|
|
|
|
int i;
|
2013-11-20 00:08:45 +08:00
|
|
|
extern char *bwa_pg;
|
2013-02-24 05:41:44 +08:00
|
|
|
for (i = 0; i < bns->n_seqs; ++i)
|
|
|
|
|
err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
|
|
|
|
if (rg_line) err_printf("%s\n", rg_line);
|
2013-11-20 00:08:45 +08:00
|
|
|
err_printf("%s\n", bwa_pg);
|
2013-02-24 05:41:44 +08:00
|
|
|
}
|
2012-04-08 12:55:52 +08:00
|
|
|
|
2013-02-24 05:41:44 +08:00
|
|
|
static char *bwa_escape(char *s)
|
2012-04-08 12:55:52 +08:00
|
|
|
{
|
2013-02-24 05:41:44 +08:00
|
|
|
char *p, *q;
|
|
|
|
|
for (p = q = s; *p; ++p) {
|
|
|
|
|
if (*p == '\\') {
|
|
|
|
|
++p;
|
|
|
|
|
if (*p == 't') *q++ = '\t';
|
|
|
|
|
else if (*p == 'n') *q++ = '\n';
|
|
|
|
|
else if (*p == 'r') *q++ = '\r';
|
|
|
|
|
else if (*p == '\\') *q++ = '\\';
|
|
|
|
|
} else *q++ = *p;
|
|
|
|
|
}
|
|
|
|
|
*q = '\0';
|
|
|
|
|
return s;
|
2012-04-08 12:55:52 +08:00
|
|
|
}
|
|
|
|
|
|
2013-02-24 05:41:44 +08:00
|
|
|
char *bwa_set_rg(const char *s)
|
2012-04-08 12:55:52 +08:00
|
|
|
{
|
2013-02-24 05:41:44 +08:00
|
|
|
char *p, *q, *r, *rg_line = 0;
|
|
|
|
|
memset(bwa_rg_id, 0, 256);
|
2013-02-24 05:44:02 +08:00
|
|
|
if (strstr(s, "@RG") != s) {
|
|
|
|
|
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
|
|
|
|
|
goto err_set_rg;
|
|
|
|
|
}
|
2013-05-02 22:12:01 +08:00
|
|
|
rg_line = strdup(s);
|
2013-02-24 05:41:44 +08:00
|
|
|
bwa_escape(rg_line);
|
|
|
|
|
if ((p = strstr(rg_line, "\tID:")) == 0) {
|
2013-02-24 05:44:02 +08:00
|
|
|
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__);
|
2013-02-24 05:41:44 +08:00
|
|
|
goto err_set_rg;
|
|
|
|
|
}
|
|
|
|
|
p += 4;
|
|
|
|
|
for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
|
|
|
|
|
if (q - p + 1 > 256) {
|
2013-02-24 05:44:02 +08:00
|
|
|
if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
|
2013-02-24 05:41:44 +08:00
|
|
|
goto err_set_rg;
|
|
|
|
|
}
|
|
|
|
|
for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
|
|
|
|
|
*r++ = *q;
|
|
|
|
|
return rg_line;
|
|
|
|
|
|
|
|
|
|
err_set_rg:
|
|
|
|
|
free(rg_line);
|
|
|
|
|
return 0;
|
2012-04-08 12:55:52 +08:00
|
|
|
}
|
2013-02-24 05:41:44 +08:00
|
|
|
|