diff --git a/bwtsw2.h b/bwtsw2.h index deae04b..951d2f1 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -6,7 +6,9 @@ #include "bwt_lite.h" #include "bwt.h" +#define BSW2_FLAG_MOVED 0x80 #define BSW2_FLAG_MATESW 0x100 +#define BSW2_FLAG_TANDEM 0x200 typedef struct { int a, b, q, r, t, qr, bw; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 471c450..e077580 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -431,21 +431,28 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks bsw2hit_t *p = b->hits + i; int seqid = -1; int64_t coor = -1; - int j, qual, nn = 0; + int j, nn = 0; int beg, end; if (p->l == 0) { b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]); nn = bns_cnt_ambi(bns, p->k, p->len, &seqid); coor = p->k - bns->anns[seqid].offset; } - ksprintf(&str, "%s\t%d", ks->name, (p->flag&0xff)|(is_pe?1:0)); + ksprintf(&str, "%s\t%d", ks->name, (p->flag&0x7f)|(is_pe?1:0)); ksprintf(&str, "\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1); if (p->l == 0) { - { // estimate mapping quality - qual = est_mapq(p, opt); - if ((p->flag & BSW2_FLAG_MATESW) && bmate && bmate->n == 1) { // this alignment is from Smith-Waterman rescue - int mate_qual = est_mapq(bmate->hits, opt); + int qual = est_mapq(p, opt); + if (is_pe && bmate && bmate->n == 1) { + int mate_qual = est_mapq(bmate->hits, opt); + if (p->flag & BSW2_FLAG_MATESW) { // this alignment is rescued by Smith-Waterman qual = qual < mate_qual? qual : mate_qual; + } else if (p->flag&2) { // properly paired + if (!(p->flag & BSW2_FLAG_TANDEM)) { // not around a tandem repeat + if (qual < mate_qual) { + qual += 20; + if (qual >= mate_qual) qual = mate_qual; + } + } } } ksprintf(&str, "\t%d\t", qual); diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index e4721d4..1a1d3ca 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -63,6 +63,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf) r.low = tmp > max_len? tmp : max_len; if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499); fprintf(stderr, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); + free(isize); return r; } @@ -110,30 +111,32 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b for (i = 0; i < l_mseq; ++i) // on the forward strand seq[i] = nst_nt4_table[(int)mseq[i]]; } - /* The following code can be made up to 2-fold as fast. I am just lazy... */ // forward Smith-Waterman aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0]; q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); ksw_sse2(q, end - beg, ref, &aux[0]); free(q); - if (aux[0].score == 0) { + if (aux[0].score < opt->t) { + aux[0].score = 0; free(seq); return; } + ++aux[0].qe; ++aux[0].te; // reverse Smith-Waterman - seq_reverse(l_mseq, seq, 0); - seq_reverse(end - beg, ref, 0); - q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); - ksw_sse2(q, end - beg, ref, &aux[1]); + seq_reverse(aux[0].qe, seq, 0); + seq_reverse(aux[0].te, ref, 0); + q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat); + ksw_sse2(q, aux[0].te, ref, &aux[1]); free(q); - aux[1].te = end - beg - 1 - aux[1].te; // change to the forward-strand coordinate + ++aux[1].qe; ++aux[1].te; // write output a->G = aux[0].score; a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2; - a->k = beg + aux[1].te; - a->len = aux[0].te + 1 - aux[1].te; - a->beg = l_mseq - 1 - aux[1].qe; - a->end = aux[0].qe + 1; + a->k = beg + (aux[0].te - aux[1].te); + a->len = aux[1].te; + a->beg = aux[0].qe - aux[1].qe; + a->end = aux[0].qe; + if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i; free(seq); } @@ -141,7 +144,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b { extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); bsw2pestat_t pes; - int i, j, k, n_rescued = 0; + int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0; pes = bsw2_stat(n, hits); for (i = k = 0; i < 5; ++i) { for (j = 0; j < 4; ++j) @@ -165,7 +168,6 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1]); if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0]); // the following enumerate all possibilities. It is tedious but necessary... - //if (strstr(seq[i].name, "22_49258265_49258755_4")) fprintf(stderr, "%lld\t%lld\t(%d,%d)\n", hits[i+1]->hits[0].k, a[1].k, a[0].G, a[0].G2); if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not bwtsw2_t *p[2]; int which; @@ -176,11 +178,61 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b p[1]->max = 1; p[1]->hits = malloc(sizeof(bsw2hit_t)); } - memcpy(p[1]->hits, &a[which], sizeof(bsw2hit_t)); + p[1]->hits[0] = a[which]; p[1]->n = 1; ++n_rescued; } else { // then both ends mapped + //fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end); + for (j = 0; j < 2; ++j) { // first fix wrong mappings + if (hits[i+j]->hits[0].G < a[j].G) { // the orginal mapping is suboptimal + a[j].G2 = a[j].G2 > hits[i+j]->hits[0].G? a[j].G2 : hits[i+j]->hits[0].G; + hits[i+j]->hits[0] = a[j]; + ++n_fixed; + } + } + if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved + for (j = 0; j < 2; ++j) { + if (hits[i+j]->hits[0].G2 < a[j].G2) + hits[i+j]->hits[0].G2 = a[j].G2; + if (a[j].G2) hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; + hits[i+j]->hits[0].flag |= 2; + } + } else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match + for (j = 0; j < 2; ++j) { + hits[i+j]->hits[0].flag |= 2; + if (hits[i+j]->hits[0].k != a[j].k) + hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM | 2; + } + } else if (a[0].G || a[1].G) { // it is possible to move one end + if (a[0].G && a[1].G) { // now we have two "proper pairs" + int G[2]; + double diff; + G[0] = hits[i]->hits[0].G + a[1].G; + G[1] = hits[i+1]->hits[0].G + a[0].G; + diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.); + if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0; + } + if (a[0].G == 0 || a[1].G == 0) { // one proper pair only + bsw2hit_t *p[2]; + int which, isize; + double dev, diff; + if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0; + else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1; + isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k; + dev = fabs(isize - pes.avg) / pes.std; + diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0; + if (diff < dev * 2.) { // then move + int tflag = 0; + if (a[which].G - a[which].G2 < 2 * (opt->a + opt->b)) tflag = BSW2_FLAG_TANDEM; + a[which].G2 = a[which].G; + p[1][0] = a[which]; + p[1]->flag |= BSW2_FLAG_MOVED | 2 | tflag; + p[0]->flag |= 2; + ++n_moved; + } + } + } } } - fprintf(stderr, "[%s] rescued %d reads\n", __func__, n_rescued); + fprintf(stderr, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved); } diff --git a/main.c b/main.c index 7f9601f..1e5613c 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r70-dev" +#define PACKAGE_VERSION "0.6.0-r77-dev" #endif static int usage()