use forward-only pac to reduce memory

This commit is contained in:
Heng Li 2011-10-21 12:03:14 -04:00
parent fe9da3c704
commit 1cb409aaf2
5 changed files with 52 additions and 35 deletions

View File

@ -286,36 +286,44 @@ int bwa_fa2pac(int argc, char *argv[])
return 0;
}
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq)
int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev)
{
int left, mid, right, nn;
if (pac_coor >= bns->l_pac)
err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac);
// binary search for the sequence ID. Note that this is a bit different from the following one...
int left, mid, right;
is_fr = is_fr? 1 : 0;
left = 0; mid = 0; right = bns->n_seqs;
while (left < right) {
mid = (left + right) >> 1;
if (pac_coor >= bns->anns[mid].offset) {
if (pos >= bns->anns[mid].offset<<is_fr) {
if (mid == bns->n_seqs - 1) break;
if (pac_coor < bns->anns[mid+1].offset) break;
if (pos < bns->anns[mid+1].offset<<is_fr) break; // bracketed
left = mid + 1;
} else right = mid;
}
*real_seq = mid;
if (len == 0) return 0;
// binary search for holes
*ref_id = mid;
if (is_fr) { // then compute the forward-only position
bntann1_t *p = &bns->anns[mid];
if (pos - (p->offset<<1) < p->len) *is_rev = 0, pos -= p->offset;
else *is_rev = 1, pos = p->len - (pos - (p->offset<<1) - p->len) + p->offset;
}
return pos;
}
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
{
int left, mid, right, nn;
if (ref_id) bns_pos2refId(bns, pos_f, 0, ref_id, 0);
left = 0; right = bns->n_holes; nn = 0;
while (left < right) {
int64_t mid = (left + right) >> 1;
if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
else if (pac_coor + len <= bns->ambs[mid].offset) right = mid;
mid = (left + right) >> 1;
if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
else if (pos_f + len <= bns->ambs[mid].offset) right = mid;
else { // overlap
if (pac_coor >= bns->ambs[mid].offset) {
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len;
if (pos_f >= bns->ambs[mid].offset) {
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len;
} else {
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor);
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f);
}
break;
}

View File

@ -70,8 +70,10 @@ extern "C" {
bntseq_t *bns_restore(const char *prefix);
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
void bns_destroy(bntseq_t *bns);
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix);
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq);
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev);
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
#ifdef __cplusplus
}

21
bwase.c
View File

@ -111,17 +111,16 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand)
{
bwtint_t pacpos;
int32_t ref_id;
pacpos = bwt_sa(bwt, sapos);
bns_coor_pac2real(bns, pacpos, 0, &ref_id);
*strand = !(ref_id&1);
bwtint_t pos_fr, pos_f;
int is_rev, ref_id;
pos_fr = bwt_sa(bwt, sapos);
pos_f = bns_pos2refId(bns, pos_fr, 1, &ref_id, &is_rev); // pos_f
*strand = !is_rev;
/* NB: For gapped alignment, pacpos may not be correct, which will be fixed
* in refine_gapped_core(). This line also determines the way "x" is
* calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */
if (ref_id&1) // mapped to the forward strand
pacpos = bns->anns[ref_id].len - (pacpos + len - bns->anns[ref_id].offset) + bns->anns[ref_id-1].offset;
return pacpos;
if (is_rev) pos_f = pos_f < len? 0 : pos_f - len; // mapped to the forward strand
return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset
}
/**
@ -423,7 +422,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
// get seqid
nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
nn = bns_cnt_ambi(bns, p->pos, j, &seqid);
if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
@ -450,7 +449,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
long long isize;
am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
// redundant calculation here, but should not matter too much
m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
@ -493,7 +492,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
bwt_multi1_t *q = p->multi + i;
int k;
j = pos_end_multi(q, p->len) - q->pos;
nn = bns_coor_pac2real(bns, q->pos, j, &seqid);
nn = bns_cnt_ambi(bns, q->pos, j, &seqid);
err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
(int)(q->pos - bns->anns[seqid].offset + 1));
if (q->cigar) {

View File

@ -80,7 +80,7 @@ int bwa_index(int argc, char *argv[])
gzFile fp = xzopen(argv[optind], "r");
t = clock();
fprintf(stderr, "[bwa_index] Pack FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix);
l_pac = bns_fasta2bntseq(fp, prefix, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
gzclose(fp);
} else { // color indexing
@ -88,7 +88,7 @@ int bwa_index(int argc, char *argv[])
strcat(strcpy(str, prefix), ".nt");
t = clock();
fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
l_pac = bns_fasta2bntseq(fp, str);
l_pac = bns_fasta2bntseq(fp, str, 0);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
gzclose(fp);
{
@ -126,6 +126,14 @@ int bwa_index(int argc, char *argv[])
bwt_destroy(bwt);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
}
{
gzFile fp = xzopen(argv[optind], "r");
t = clock();
fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
l_pac = bns_fasta2bntseq(fp, prefix, 1);
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
gzclose(fp);
}
{
bwt_t *bwt;
strcpy(str, prefix); strcat(str, ".bwt");

View File

@ -314,7 +314,7 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n
// FIXME: this routine does not work if the query bridge three reference sequences
int32_t coor, refl, lq;
int x, y, i, seqid;
bns_coor_pac2real(bns, p->k, p->len, &seqid);
bns_cnt_ambi(bns, p->k, p->len, &seqid);
coor = p->k - bns->anns[seqid].offset;
refl = bns->anns[seqid].len;
x = coor; y = 0;
@ -404,7 +404,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
int beg, end;
if (p->l == 0) {
b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]);
nn = bns_coor_pac2real(bns, p->k, p->len, &seqid);
nn = bns_cnt_ambi(bns, p->k, p->len, &seqid);
coor = p->k - bns->anns[seqid].offset;
}
ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10);