2013-03-07 08:23:45 +08:00
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <zlib.h>
|
|
|
|
|
#include "ksw.h"
|
|
|
|
|
#include "kseq.h"
|
|
|
|
|
#include "bwa.h"
|
|
|
|
|
KSEQ_DECLARE(gzFile)
|
|
|
|
|
|
|
|
|
|
#define MAX_SCORE_RATIO 0.9f
|
|
|
|
|
#define MAX_ERR 8
|
|
|
|
|
|
|
|
|
|
static const char *err_msg[MAX_ERR+1] = {
|
|
|
|
|
"successful merges",
|
|
|
|
|
"low-scoring pairs",
|
|
|
|
|
"pairs where the best SW alignment is not an overlap (long left end)",
|
|
|
|
|
"pairs where the best SW alignment is not an overlap (long right end)",
|
|
|
|
|
"pairs with large 2nd best SW score",
|
|
|
|
|
"pairs with gapped overlap",
|
2013-03-07 12:38:07 +08:00
|
|
|
"pairs where the end-to-end alignment is inconsistent with SW",
|
2013-03-07 08:23:45 +08:00
|
|
|
"pairs potentially with tandem overlaps",
|
|
|
|
|
"pairs with high sum of errors"
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
|
kstring_t n, s, q;
|
|
|
|
|
} mseq_t;
|
|
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
|
int a, b, q, r, w;
|
|
|
|
|
int q_def, q_thres;
|
|
|
|
|
int T;
|
|
|
|
|
int8_t mat[25];
|
|
|
|
|
} pem_opt_t;
|
|
|
|
|
|
|
|
|
|
pem_opt_t *pem_opt_init()
|
|
|
|
|
{
|
|
|
|
|
pem_opt_t *opt;
|
|
|
|
|
opt = calloc(1, sizeof(pem_opt_t));
|
|
|
|
|
opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20;
|
|
|
|
|
opt->T = opt->a * 10;
|
|
|
|
|
opt->q_def = 20;
|
|
|
|
|
opt->q_thres = 70;
|
|
|
|
|
bwa_fill_scmat(opt->a, opt->b, opt->mat);
|
|
|
|
|
return opt;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int bwa_pemerge(const pem_opt_t *opt, mseq_t x[2], uint8_t **seq_, uint8_t **qual_)
|
|
|
|
|
{
|
|
|
|
|
uint8_t *s[2], *q[2], *seq, *qual;
|
|
|
|
|
int i, xtra, l, l_seq, sum_q;
|
|
|
|
|
kswr_t r;
|
|
|
|
|
|
|
|
|
|
*seq_ = *qual_ = 0;
|
|
|
|
|
s[0] = malloc(x[0].s.l); q[0] = malloc(x[0].s.l);
|
|
|
|
|
s[1] = malloc(x[1].s.l); q[1] = malloc(x[1].s.l);
|
|
|
|
|
for (i = 0; i < x[0].s.l; ++i) {
|
|
|
|
|
int c = x[0].s.s[i];
|
|
|
|
|
s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c];
|
|
|
|
|
q[0][i] = x[0].q.l? x[0].q.s[i] - 33 : opt->q_def;
|
|
|
|
|
}
|
|
|
|
|
for (i = 0; i < x[1].s.l; ++i) {
|
|
|
|
|
int c = x[1].s.s[x[1].s.l - 1 - i];
|
|
|
|
|
c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c];
|
|
|
|
|
s[1][i] = c < 4? 3 - c : 4;
|
|
|
|
|
q[1][i] = x[1].q.l? x[1].q.s[x[1].q.l - 1 - i] - 33 : opt->q_def;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
xtra = KSW_XSTART | KSW_XSUBO;
|
|
|
|
|
r = ksw_align(x[1].s.l, s[1], x[0].s.l, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0);
|
|
|
|
|
++r.qe; ++r.te; // change to the half-close-half-open coordinates
|
|
|
|
|
|
|
|
|
|
if (r.score < opt->T) return -1; // poor alignment
|
|
|
|
|
if (r.tb < r.qb) return -2; // no enough space for the left end
|
|
|
|
|
if (x[0].s.l - r.te > x[1].s.l - r.qe) return -3; // no enough space for the right end
|
|
|
|
|
if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) return -4; // the second best score is too large
|
|
|
|
|
if (r.qe - r.qb != r.te - r.tb) return -5; // we do not allow gaps
|
|
|
|
|
|
|
|
|
|
{ // test tandem match; O(n^2)
|
|
|
|
|
int max_m, max_m2, min_l, max_l, max_l2;
|
|
|
|
|
max_m = max_m2 = 0; max_l = max_l2 = 0;
|
|
|
|
|
min_l = x[0].s.l < x[1].s.l? x[0].s.l : x[1].s.l;
|
2013-03-07 12:38:07 +08:00
|
|
|
for (l = 1; l < min_l; ++l) {
|
2013-03-07 08:23:45 +08:00
|
|
|
int m = 0, o = x[0].s.l - l;
|
2013-03-07 12:38:07 +08:00
|
|
|
uint8_t *s0o = &s[0][o], *s1 = s[1];
|
|
|
|
|
for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck!
|
|
|
|
|
m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i]
|
2013-03-07 08:23:45 +08:00
|
|
|
if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l;
|
|
|
|
|
else if (m > max_m2) max_m2 = m, max_l2 = l;
|
|
|
|
|
}
|
2013-03-07 12:38:07 +08:00
|
|
|
if (max_m < opt->T || max_l != x[0].s.l - (r.tb - r.qb)) return -6;
|
2013-03-07 08:23:45 +08:00
|
|
|
if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO)
|
|
|
|
|
return -7;
|
2013-03-07 12:38:07 +08:00
|
|
|
if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) return -7;
|
2013-03-07 08:23:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
l = x[0].s.l - (r.tb - r.qb); // length to merge
|
|
|
|
|
l_seq = x[0].s.l + x[1].s.l - l;
|
|
|
|
|
seq = malloc(l_seq + 1);
|
|
|
|
|
qual = malloc(l_seq + 1);
|
|
|
|
|
memcpy(seq, s[0], x[0].s.l); memcpy(seq + x[0].s.l, &s[1][l], x[1].s.l - l);
|
|
|
|
|
memcpy(qual, q[0], x[0].s.l); memcpy(qual + x[0].s.l, &q[1][l], x[1].s.l - l);
|
|
|
|
|
for (i = 0, sum_q = 0; i < l; ++i) {
|
|
|
|
|
int k = x[0].s.l - l + i;
|
|
|
|
|
if (s[0][k] == 4) { // ambiguous
|
|
|
|
|
seq[k] = s[1][i];
|
|
|
|
|
qual[k] = q[1][i];
|
|
|
|
|
} else if (s[1][i] == 4) { // do nothing
|
|
|
|
|
} else if (s[0][k] == s[1][i]) {
|
|
|
|
|
qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i];
|
|
|
|
|
} else { // s[0][k] != s[1][i] and neither is N
|
|
|
|
|
int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i];
|
|
|
|
|
sum_q += qq >= 3? qq<<1 : 1;
|
|
|
|
|
seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i];
|
|
|
|
|
qual[k] = abs((int)q[0][k] - (int)q[1][i]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (sum_q>>1 > opt->q_thres) { // too many mismatches
|
|
|
|
|
free(seq); free(qual);
|
|
|
|
|
return -8;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33;
|
|
|
|
|
seq[l_seq] = qual[l_seq] = 0;
|
|
|
|
|
*seq_ = seq, *qual_ = qual;
|
|
|
|
|
return l_seq;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline void kstrcpy(kstring_t *dst, const kstring_t *src)
|
|
|
|
|
{
|
|
|
|
|
dst->l = 0;
|
|
|
|
|
if (src->l == 0) return;
|
|
|
|
|
if (dst->m < src->l + 1) {
|
|
|
|
|
dst->m = src->l + 2;
|
|
|
|
|
kroundup32(dst->m);
|
|
|
|
|
dst->s = realloc(dst->s, dst->m);
|
|
|
|
|
}
|
|
|
|
|
dst->l = src->l;
|
|
|
|
|
memcpy(dst->s, src->s, src->l + 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline void kseq2mseq(mseq_t *ms, const kseq_t *ks)
|
|
|
|
|
{
|
|
|
|
|
kstrcpy(&ms->n, &ks->name);
|
|
|
|
|
kstrcpy(&ms->s, &ks->seq);
|
|
|
|
|
kstrcpy(&ms->q, &ks->qual);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline void print_seq(const char *n, const char *s, const char *q)
|
|
|
|
|
{
|
|
|
|
|
putchar(q? '@' : '>');
|
|
|
|
|
puts(n); puts(s);
|
|
|
|
|
if (q) {
|
|
|
|
|
puts("+"); puts(q);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int main_pemerge(int argc, char *argv[])
|
|
|
|
|
{
|
|
|
|
|
int c, flag = 0, i;
|
|
|
|
|
int64_t cnt[MAX_ERR+1];
|
|
|
|
|
gzFile fp, fp2 = 0;
|
|
|
|
|
kseq_t *ks, *ks2 = 0;
|
|
|
|
|
mseq_t r[2];
|
|
|
|
|
pem_opt_t *opt;
|
|
|
|
|
|
|
|
|
|
opt = pem_opt_init();
|
|
|
|
|
while ((c = getopt(argc, argv, "muQ:")) >= 0) {
|
|
|
|
|
if (c == 'm') flag |= 1;
|
|
|
|
|
else if (c == 'u') flag |= 2;
|
|
|
|
|
else if (c == 'Q') opt->q_thres = atoi(optarg);
|
|
|
|
|
}
|
|
|
|
|
if (flag == 0) flag = 3;
|
|
|
|
|
|
|
|
|
|
if (optind == argc) {
|
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
|
fprintf(stderr, "Usage: bwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
|
|
|
|
|
fprintf(stderr, "Options: -m output merged reads only\n");
|
|
|
|
|
fprintf(stderr, " -u output unmerged reads only\n");
|
|
|
|
|
fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres);
|
|
|
|
|
fprintf(stderr, "\n");
|
|
|
|
|
free(opt);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
|
|
|
|
|
ks = kseq_init(fp);
|
|
|
|
|
if (optind + 1 < argc) {
|
|
|
|
|
fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r");
|
|
|
|
|
ks2 = kseq_init(fp2);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
memset(r, 0, sizeof(mseq_t)<<1);
|
|
|
|
|
memset(cnt, 0, 8 * (MAX_ERR+1));
|
|
|
|
|
while (kseq_read(ks) >= 0) {
|
|
|
|
|
uint8_t *seq, *qual;
|
|
|
|
|
int l_seq;
|
|
|
|
|
kseq2mseq(&r[0], ks);
|
|
|
|
|
if (ks2) {
|
|
|
|
|
if (kseq_read(ks2) < 0) break;
|
|
|
|
|
kseq2mseq(&r[1], ks2);
|
|
|
|
|
} else {
|
|
|
|
|
if (kseq_read(ks) < 0) break;
|
|
|
|
|
kseq2mseq(&r[1], ks);
|
|
|
|
|
}
|
|
|
|
|
l_seq = bwa_pemerge(opt, r, &seq, &qual);
|
|
|
|
|
if (l_seq > 0) {
|
|
|
|
|
++cnt[0];
|
|
|
|
|
if (flag & 1) {
|
|
|
|
|
if (r[0].n.l > 2 && (r[0].n.s[r[0].n.l-1] == '1' || r[0].n.s[r[0].n.l-1] == '2') && r[0].n.s[r[0].n.l-2] == '/')
|
|
|
|
|
r[0].n.s[r[0].n.l-2] = 0, r[0].n.l -= 2;
|
|
|
|
|
print_seq(r[0].n.s, (char*)seq, (char*)qual);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
++cnt[-l_seq];
|
|
|
|
|
if (flag & 2) {
|
|
|
|
|
print_seq(r[0].n.s, r[0].s.s, r[0].q.l? r[0].q.s : 0);
|
|
|
|
|
print_seq(r[1].n.s, r[1].s.s, r[1].q.l? r[1].q.s : 0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]);
|
|
|
|
|
for (i = 1; i <= MAX_ERR; ++i)
|
|
|
|
|
fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]);
|
|
|
|
|
kseq_destroy(ks);
|
|
|
|
|
gzclose(fp);
|
|
|
|
|
if (ks2) {
|
|
|
|
|
kseq_destroy(ks2);
|
|
|
|
|
gzclose(fp2);
|
|
|
|
|
}
|
|
|
|
|
free(opt);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|