bwa-sw PE seems working (SAM is incorrect)

This commit is contained in:
Heng Li 2011-11-07 00:51:43 -05:00
parent c8c79ef024
commit e06685db45
4 changed files with 83 additions and 22 deletions

View File

@ -6,7 +6,9 @@
#include "bwt_lite.h"
#include "bwt.h"
#define BSW2_FLAG_MOVED 0x80
#define BSW2_FLAG_MATESW 0x100
#define BSW2_FLAG_TANDEM 0x200
typedef struct {
int a, b, q, r, t, qr, bw;

View File

@ -431,21 +431,28 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
bsw2hit_t *p = b->hits + i;
int seqid = -1;
int64_t coor = -1;
int j, qual, nn = 0;
int j, nn = 0;
int beg, end;
if (p->l == 0) {
b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]);
nn = bns_cnt_ambi(bns, p->k, p->len, &seqid);
coor = p->k - bns->anns[seqid].offset;
}
ksprintf(&str, "%s\t%d", ks->name, (p->flag&0xff)|(is_pe?1:0));
ksprintf(&str, "%s\t%d", ks->name, (p->flag&0x7f)|(is_pe?1:0));
ksprintf(&str, "\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1);
if (p->l == 0) {
{ // estimate mapping quality
qual = est_mapq(p, opt);
if ((p->flag & BSW2_FLAG_MATESW) && bmate && bmate->n == 1) { // this alignment is from Smith-Waterman rescue
int mate_qual = est_mapq(bmate->hits, opt);
int qual = est_mapq(p, opt);
if (is_pe && bmate && bmate->n == 1) {
int mate_qual = est_mapq(bmate->hits, opt);
if (p->flag & BSW2_FLAG_MATESW) { // this alignment is rescued by Smith-Waterman
qual = qual < mate_qual? qual : mate_qual;
} else if (p->flag&2) { // properly paired
if (!(p->flag & BSW2_FLAG_TANDEM)) { // not around a tandem repeat
if (qual < mate_qual) {
qual += 20;
if (qual >= mate_qual) qual = mate_qual;
}
}
}
}
ksprintf(&str, "\t%d\t", qual);

View File

@ -63,6 +63,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf)
r.low = tmp > max_len? tmp : max_len;
if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499);
fprintf(stderr, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);
free(isize);
return r;
}
@ -110,30 +111,32 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
for (i = 0; i < l_mseq; ++i) // on the forward strand
seq[i] = nst_nt4_table[(int)mseq[i]];
}
/* The following code can be made up to 2-fold as fast. I am just lazy... */
// forward Smith-Waterman
aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0];
q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat);
ksw_sse2(q, end - beg, ref, &aux[0]);
free(q);
if (aux[0].score == 0) {
if (aux[0].score < opt->t) {
aux[0].score = 0;
free(seq);
return;
}
++aux[0].qe; ++aux[0].te;
// reverse Smith-Waterman
seq_reverse(l_mseq, seq, 0);
seq_reverse(end - beg, ref, 0);
q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat);
ksw_sse2(q, end - beg, ref, &aux[1]);
seq_reverse(aux[0].qe, seq, 0);
seq_reverse(aux[0].te, ref, 0);
q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat);
ksw_sse2(q, aux[0].te, ref, &aux[1]);
free(q);
aux[1].te = end - beg - 1 - aux[1].te; // change to the forward-strand coordinate
++aux[1].qe; ++aux[1].te;
// write output
a->G = aux[0].score;
a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2;
a->k = beg + aux[1].te;
a->len = aux[0].te + 1 - aux[1].te;
a->beg = l_mseq - 1 - aux[1].qe;
a->end = aux[0].qe + 1;
a->k = beg + (aux[0].te - aux[1].te);
a->len = aux[1].te;
a->beg = aux[0].qe - aux[1].qe;
a->end = aux[0].qe;
if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i;
free(seq);
}
@ -141,7 +144,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b
{
extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
bsw2pestat_t pes;
int i, j, k, n_rescued = 0;
int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0;
pes = bsw2_stat(n, hits);
for (i = k = 0; i < 5; ++i) {
for (j = 0; j < 4; ++j)
@ -165,7 +168,6 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b
if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1]);
if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0]);
// the following enumerate all possibilities. It is tedious but necessary...
//if (strstr(seq[i].name, "22_49258265_49258755_4")) fprintf(stderr, "%lld\t%lld\t(%d,%d)\n", hits[i+1]->hits[0].k, a[1].k, a[0].G, a[0].G2);
if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not
bwtsw2_t *p[2];
int which;
@ -176,11 +178,61 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b
p[1]->max = 1;
p[1]->hits = malloc(sizeof(bsw2hit_t));
}
memcpy(p[1]->hits, &a[which], sizeof(bsw2hit_t));
p[1]->hits[0] = a[which];
p[1]->n = 1;
++n_rescued;
} else { // then both ends mapped
//fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end);
for (j = 0; j < 2; ++j) { // first fix wrong mappings
if (hits[i+j]->hits[0].G < a[j].G) { // the orginal mapping is suboptimal
a[j].G2 = a[j].G2 > hits[i+j]->hits[0].G? a[j].G2 : hits[i+j]->hits[0].G;
hits[i+j]->hits[0] = a[j];
++n_fixed;
}
}
if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved
for (j = 0; j < 2; ++j) {
if (hits[i+j]->hits[0].G2 < a[j].G2)
hits[i+j]->hits[0].G2 = a[j].G2;
if (a[j].G2) hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM;
hits[i+j]->hits[0].flag |= 2;
}
} else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match
for (j = 0; j < 2; ++j) {
hits[i+j]->hits[0].flag |= 2;
if (hits[i+j]->hits[0].k != a[j].k)
hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM | 2;
}
} else if (a[0].G || a[1].G) { // it is possible to move one end
if (a[0].G && a[1].G) { // now we have two "proper pairs"
int G[2];
double diff;
G[0] = hits[i]->hits[0].G + a[1].G;
G[1] = hits[i+1]->hits[0].G + a[0].G;
diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.);
if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0;
}
if (a[0].G == 0 || a[1].G == 0) { // one proper pair only
bsw2hit_t *p[2];
int which, isize;
double dev, diff;
if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0;
else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1;
isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k;
dev = fabs(isize - pes.avg) / pes.std;
diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0;
if (diff < dev * 2.) { // then move
int tflag = 0;
if (a[which].G - a[which].G2 < 2 * (opt->a + opt->b)) tflag = BSW2_FLAG_TANDEM;
a[which].G2 = a[which].G;
p[1][0] = a[which];
p[1]->flag |= BSW2_FLAG_MOVED | 2 | tflag;
p[0]->flag |= 2;
++n_moved;
}
}
}
}
}
fprintf(stderr, "[%s] rescued %d reads\n", __func__, n_rescued);
fprintf(stderr, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved);
}

2
main.c
View File

@ -4,7 +4,7 @@
#include "utils.h"
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "0.6.0-r70-dev"
#define PACKAGE_VERSION "0.6.0-r77-dev"
#endif
static int usage()