PE seems working; more testing needed
This commit is contained in:
parent
098f44cd51
commit
ec307a10e6
1
bntseq.c
1
bntseq.c
|
|
@ -294,6 +294,7 @@ int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *r
|
|||
} else right = mid;
|
||||
}
|
||||
*real_seq = mid;
|
||||
if (len == 0) return 0;
|
||||
// binary search for holes
|
||||
left = 0; right = bns->n_holes; nn = 0;
|
||||
while (left < right) {
|
||||
|
|
|
|||
57
bwape.c
57
bwape.c
|
|
@ -9,6 +9,7 @@
|
|||
#include "bntseq.h"
|
||||
#include "utils.h"
|
||||
#include "stdaln.h"
|
||||
#include "bwase.h"
|
||||
|
||||
typedef struct {
|
||||
int n;
|
||||
|
|
@ -37,10 +38,8 @@ typedef struct {
|
|||
extern int g_log_n[256]; // in bwase.c
|
||||
static kh_64_t *g_hash;
|
||||
|
||||
void bwase_initialize();
|
||||
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
|
||||
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
|
||||
int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
|
||||
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
|
||||
bntseq_t *bwa_open_nt(const char *prefix);
|
||||
|
|
@ -277,12 +276,12 @@ typedef struct {
|
|||
kvec_t(bwt_aln1_t) aln;
|
||||
} aln_buf_t;
|
||||
|
||||
int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
|
||||
int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
|
||||
const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
|
||||
{
|
||||
int i, j, cnt_chg = 0;
|
||||
char str[1024];
|
||||
bwt_t *bwt[2];
|
||||
bwt_t *bwt;
|
||||
pe_data_t *d;
|
||||
aln_buf_t *buf[2];
|
||||
|
||||
|
|
@ -290,12 +289,10 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
|
|||
buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
|
||||
buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
|
||||
|
||||
if (_bwt[0] == 0) { // load forward SA
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
|
||||
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
|
||||
} else bwt[0] = _bwt[0], bwt[1] = _bwt[1];
|
||||
if (_bwt == 0) { // load forward SA
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
|
||||
} else bwt = _bwt;
|
||||
|
||||
// SE
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
|
|
@ -314,16 +311,17 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
|
|||
// generate SE alignment and mapping quality
|
||||
bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
|
||||
if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
|
||||
int strand;
|
||||
int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
|
||||
p[j]->pos = p[j]->strand? bwt_sa(bwt[0], p[j]->sa)
|
||||
: bwt[1]->seq_len - (bwt_sa(bwt[1], p[j]->sa) + p[j]->len);
|
||||
p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
|
||||
p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len, &strand);
|
||||
p[j]->strand = strand;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// infer isize
|
||||
infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt[0]->seq_len);
|
||||
infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2);
|
||||
if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
|
||||
if (opt->force_isize) {
|
||||
fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
|
||||
|
|
@ -361,8 +359,11 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
|
|||
poslist_t *z = &kh_val(g_hash, iter);
|
||||
z->n = r->l - r->k + 1;
|
||||
z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
|
||||
for (l = r->k; l <= r->l; ++l)
|
||||
z->a[l - r->k] = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
|
||||
for (l = r->k; l <= r->l; ++l) {
|
||||
int strand;
|
||||
z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
|
||||
r->a = strand;
|
||||
}
|
||||
}
|
||||
for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
|
||||
x = kh_val(g_hash, iter).a[l];
|
||||
|
|
@ -371,7 +372,9 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
|
|||
}
|
||||
} else { // then calculate on the fly
|
||||
for (l = r->k; l <= r->l; ++l) {
|
||||
x = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len);
|
||||
int strand;
|
||||
x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
|
||||
r->a = strand;
|
||||
x = x<<32 | k<<1 | j;
|
||||
kv_push(uint64_t, d->arr, x);
|
||||
}
|
||||
|
|
@ -389,8 +392,10 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
|
|||
bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
|
||||
} else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
|
||||
for (k = 0; k < p[j]->n_multi; ++k) {
|
||||
int strand;
|
||||
bwt_multi1_t *q = p[j]->multi + k;
|
||||
q->pos = q->strand? bwt_sa(bwt[0], q->pos) : bwt[1]->seq_len - (bwt_sa(bwt[1], q->pos) + p[j]->len);
|
||||
q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len, &strand);
|
||||
q->strand = strand;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -403,9 +408,7 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa
|
|||
kv_destroy(buf[1][i].aln);
|
||||
}
|
||||
free(buf[0]); free(buf[1]);
|
||||
if (_bwt[0] == 0) {
|
||||
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
||||
}
|
||||
if (_bwt == 0) bwt_destroy(bwt);
|
||||
kv_destroy(d->arr);
|
||||
kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
|
||||
kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
|
||||
|
|
@ -655,12 +658,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
khint_t iter;
|
||||
isize_info_t last_ii; // this is for the last batch of reads
|
||||
char str[1024];
|
||||
bwt_t *bwt[2];
|
||||
bwt_t *bwt;
|
||||
uint8_t *pac;
|
||||
|
||||
// initialization
|
||||
bwase_initialize(); // initialize g_log_n[] in bwase.c
|
||||
pac = 0; bwt[0] = bwt[1] = 0;
|
||||
pac = 0; bwt = 0;
|
||||
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
||||
bns = bns_restore(prefix);
|
||||
srand48(bns->seed);
|
||||
|
|
@ -679,10 +682,8 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
ntbns = bwa_open_nt(prefix);
|
||||
} else { // for Illumina alignment only
|
||||
if (popt->is_preload) {
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
|
||||
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
|
||||
pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
|
||||
rewind(bns->fp_pac);
|
||||
fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
|
||||
|
|
@ -702,7 +703,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
|
||||
cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
|
||||
cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
|
||||
fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);
|
||||
|
||||
|
|
@ -746,7 +747,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
|
|||
if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
|
||||
kh_destroy(64, g_hash);
|
||||
if (pac) {
|
||||
free(pac); bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
||||
free(pac); bwt_destroy(bwt);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
63
bwase.c
63
bwase.c
|
|
@ -109,54 +109,52 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
|
|||
return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
|
||||
}
|
||||
|
||||
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand)
|
||||
{
|
||||
bwtint_t pacpos;
|
||||
int32_t ref_id;
|
||||
pacpos = bwt_sa(bwt, sapos);
|
||||
bns_coor_pac2real(bns, pacpos, 0, &ref_id);
|
||||
*strand = !(ref_id&1);
|
||||
/* NB: For gapped alignment, pacpos may not be correct, which will be fixed
|
||||
* in refine_gapped_core(). This line also determines the way "x" is
|
||||
* calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */
|
||||
if (ref_id&1) // mapped to the forward strand
|
||||
pacpos = bns->anns[ref_id].len - (pacpos + len - bns->anns[ref_id].offset) + bns->anns[ref_id-1].offset;
|
||||
return pacpos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Derive the actual position in the read from the given suffix array
|
||||
* coordinates. Note that the position will be approximate based on
|
||||
* whether indels appear in the read and whether calculations are
|
||||
* performed from the start or end of the read.
|
||||
*/
|
||||
void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
|
||||
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
|
||||
{
|
||||
int max_diff;
|
||||
int max_diff, strand;
|
||||
if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
|
||||
max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
|
||||
if (seq->strand) { // reverse strand only
|
||||
seq->pos = bwt_sa(forward_bwt, seq->sa);
|
||||
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
||||
} else { // forward strand only
|
||||
/* NB: For gapped alignment, p->pos may not be correct, which
|
||||
* will be fixed in refine_gapped_core(). This line also
|
||||
* determines the way "x" is calculated in
|
||||
* refine_gapped_core() when (ext < 0 && is_end == 0). */
|
||||
seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len);
|
||||
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
||||
}
|
||||
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
||||
seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len, &strand);
|
||||
seq->strand = strand;
|
||||
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
||||
}
|
||||
|
||||
void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
|
||||
void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
|
||||
{
|
||||
int i, j;
|
||||
int i, j, strand;
|
||||
char str[1024];
|
||||
bwt_t *bwt;
|
||||
// load forward SA
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
if (seqs[i].strand) bwa_cal_pac_pos_core(bwt, 0, &seqs[i], max_mm, fnr);
|
||||
bwa_cal_pac_pos_core(bns, bwt, &seqs[i], max_mm, fnr);
|
||||
for (j = 0; j < seqs[i].n_multi; ++j) {
|
||||
bwt_multi1_t *p = seqs[i].multi + j;
|
||||
if (p->strand) p->pos = bwt_sa(bwt, p->pos);
|
||||
}
|
||||
}
|
||||
bwt_destroy(bwt);
|
||||
// load reverse BWT and SA
|
||||
strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt);
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
if (!seqs[i].strand) bwa_cal_pac_pos_core(0, bwt, &seqs[i], max_mm, fnr);
|
||||
for (j = 0; j < seqs[i].n_multi; ++j) {
|
||||
bwt_multi1_t *p = seqs[i].multi + j;
|
||||
if (!p->strand) p->pos = bwt->seq_len - (bwt_sa(bwt, p->pos) + seqs[i].len);
|
||||
p->pos = bwa_sa2pos(bns, bwt, p->pos, seqs[i].len, &strand);
|
||||
p->strand = strand;
|
||||
}
|
||||
}
|
||||
bwt_destroy(bwt);
|
||||
|
|
@ -174,7 +172,7 @@ static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, in
|
|||
int l = 0, path_len, ref_len;
|
||||
AlnParam ap = aln_param_bwa;
|
||||
path_t *path;
|
||||
int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos;
|
||||
int64_t k, __pos = *_pos;
|
||||
|
||||
ref_len = len + abs(ext);
|
||||
if (ext > 0) {
|
||||
|
|
@ -192,7 +190,7 @@ static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, in
|
|||
aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
|
||||
cigar = bwa_aln_path2cigar(path, path_len, n_cigar);
|
||||
|
||||
if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand
|
||||
if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand
|
||||
for (l = k = 0; k < *n_cigar; ++k) {
|
||||
if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]);
|
||||
else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]);
|
||||
|
|
@ -238,8 +236,7 @@ char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_
|
|||
} else ++u;
|
||||
}
|
||||
x += l; y += l;
|
||||
/* } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { */
|
||||
} else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
|
||||
} else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
|
||||
y += l;
|
||||
if (__cigar_op(cigar[k]) == FROM_I) nm += l;
|
||||
} else if (__cigar_op(cigar[k]) == FROM_D) {
|
||||
|
|
@ -631,7 +628,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
|
|||
}
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
|
||||
bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
|
||||
bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
|
||||
|
|
|
|||
4
bwase.h
4
bwase.h
|
|
@ -12,13 +12,15 @@ extern "C" {
|
|||
// Initialize mapping tables in the bwa single-end mapper.
|
||||
void bwase_initialize();
|
||||
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
|
||||
void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
|
||||
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
|
||||
// Refine the approximate position of the sequence to an actual placement for the sequence.
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
|
||||
// Backfill certain alignment properties mainly centering around number of matches.
|
||||
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
||||
// Calculate the end position of a read given a certain sequence.
|
||||
int64_t pos_end(const bwa_seq_t *p);
|
||||
//
|
||||
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue