use forward-only pac to reduce memory
This commit is contained in:
parent
fe9da3c704
commit
1cb409aaf2
44
bntseq.c
44
bntseq.c
|
|
@ -286,36 +286,44 @@ int bwa_fa2pac(int argc, char *argv[])
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq)
|
int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev)
|
||||||
{
|
{
|
||||||
int left, mid, right, nn;
|
int left, mid, right;
|
||||||
if (pac_coor >= bns->l_pac)
|
is_fr = is_fr? 1 : 0;
|
||||||
err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac);
|
|
||||||
// binary search for the sequence ID. Note that this is a bit different from the following one...
|
|
||||||
left = 0; mid = 0; right = bns->n_seqs;
|
left = 0; mid = 0; right = bns->n_seqs;
|
||||||
while (left < right) {
|
while (left < right) {
|
||||||
mid = (left + right) >> 1;
|
mid = (left + right) >> 1;
|
||||||
if (pac_coor >= bns->anns[mid].offset) {
|
if (pos >= bns->anns[mid].offset<<is_fr) {
|
||||||
if (mid == bns->n_seqs - 1) break;
|
if (mid == bns->n_seqs - 1) break;
|
||||||
if (pac_coor < bns->anns[mid+1].offset) break;
|
if (pos < bns->anns[mid+1].offset<<is_fr) break; // bracketed
|
||||||
left = mid + 1;
|
left = mid + 1;
|
||||||
} else right = mid;
|
} else right = mid;
|
||||||
}
|
}
|
||||||
*real_seq = mid;
|
*ref_id = mid;
|
||||||
if (len == 0) return 0;
|
if (is_fr) { // then compute the forward-only position
|
||||||
// binary search for holes
|
bntann1_t *p = &bns->anns[mid];
|
||||||
|
if (pos - (p->offset<<1) < p->len) *is_rev = 0, pos -= p->offset;
|
||||||
|
else *is_rev = 1, pos = p->len - (pos - (p->offset<<1) - p->len) + p->offset;
|
||||||
|
}
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
|
||||||
|
{
|
||||||
|
int left, mid, right, nn;
|
||||||
|
if (ref_id) bns_pos2refId(bns, pos_f, 0, ref_id, 0);
|
||||||
left = 0; right = bns->n_holes; nn = 0;
|
left = 0; right = bns->n_holes; nn = 0;
|
||||||
while (left < right) {
|
while (left < right) {
|
||||||
int64_t mid = (left + right) >> 1;
|
mid = (left + right) >> 1;
|
||||||
if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
|
if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
|
||||||
else if (pac_coor + len <= bns->ambs[mid].offset) right = mid;
|
else if (pos_f + len <= bns->ambs[mid].offset) right = mid;
|
||||||
else { // overlap
|
else { // overlap
|
||||||
if (pac_coor >= bns->ambs[mid].offset) {
|
if (pos_f >= bns->ambs[mid].offset) {
|
||||||
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
|
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
|
||||||
bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len;
|
bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len;
|
||||||
} else {
|
} else {
|
||||||
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
|
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
|
||||||
bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor);
|
bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
6
bntseq.h
6
bntseq.h
|
|
@ -70,8 +70,10 @@ extern "C" {
|
||||||
bntseq_t *bns_restore(const char *prefix);
|
bntseq_t *bns_restore(const char *prefix);
|
||||||
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
|
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
|
||||||
void bns_destroy(bntseq_t *bns);
|
void bns_destroy(bntseq_t *bns);
|
||||||
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix);
|
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
|
||||||
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq);
|
int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev);
|
||||||
|
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
||||||
21
bwase.c
21
bwase.c
|
|
@ -111,17 +111,16 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
|
||||||
|
|
||||||
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand)
|
bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand)
|
||||||
{
|
{
|
||||||
bwtint_t pacpos;
|
bwtint_t pos_fr, pos_f;
|
||||||
int32_t ref_id;
|
int is_rev, ref_id;
|
||||||
pacpos = bwt_sa(bwt, sapos);
|
pos_fr = bwt_sa(bwt, sapos);
|
||||||
bns_coor_pac2real(bns, pacpos, 0, &ref_id);
|
pos_f = bns_pos2refId(bns, pos_fr, 1, &ref_id, &is_rev); // pos_f
|
||||||
*strand = !(ref_id&1);
|
*strand = !is_rev;
|
||||||
/* NB: For gapped alignment, pacpos may not be correct, which will be fixed
|
/* NB: For gapped alignment, pacpos may not be correct, which will be fixed
|
||||||
* in refine_gapped_core(). This line also determines the way "x" is
|
* in refine_gapped_core(). This line also determines the way "x" is
|
||||||
* calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */
|
* calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */
|
||||||
if (ref_id&1) // mapped to the forward strand
|
if (is_rev) pos_f = pos_f < len? 0 : pos_f - len; // mapped to the forward strand
|
||||||
pacpos = bns->anns[ref_id].len - (pacpos + len - bns->anns[ref_id].offset) + bns->anns[ref_id-1].offset;
|
return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset
|
||||||
return pacpos;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -423,7 +422,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
|
||||||
} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
|
} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
|
||||||
|
|
||||||
// get seqid
|
// get seqid
|
||||||
nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
|
nn = bns_cnt_ambi(bns, p->pos, j, &seqid);
|
||||||
if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
|
if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
|
||||||
flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
|
flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
|
||||||
|
|
||||||
|
|
@ -450,7 +449,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
|
||||||
long long isize;
|
long long isize;
|
||||||
am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
|
am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
|
||||||
// redundant calculation here, but should not matter too much
|
// redundant calculation here, but should not matter too much
|
||||||
m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
|
m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
|
||||||
err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
|
err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
|
||||||
isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
|
isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
|
||||||
if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
|
if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
|
||||||
|
|
@ -493,7 +492,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
|
||||||
bwt_multi1_t *q = p->multi + i;
|
bwt_multi1_t *q = p->multi + i;
|
||||||
int k;
|
int k;
|
||||||
j = pos_end_multi(q, p->len) - q->pos;
|
j = pos_end_multi(q, p->len) - q->pos;
|
||||||
nn = bns_coor_pac2real(bns, q->pos, j, &seqid);
|
nn = bns_cnt_ambi(bns, q->pos, j, &seqid);
|
||||||
err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
|
err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
|
||||||
(int)(q->pos - bns->anns[seqid].offset + 1));
|
(int)(q->pos - bns->anns[seqid].offset + 1));
|
||||||
if (q->cigar) {
|
if (q->cigar) {
|
||||||
|
|
|
||||||
12
bwtindex.c
12
bwtindex.c
|
|
@ -80,7 +80,7 @@ int bwa_index(int argc, char *argv[])
|
||||||
gzFile fp = xzopen(argv[optind], "r");
|
gzFile fp = xzopen(argv[optind], "r");
|
||||||
t = clock();
|
t = clock();
|
||||||
fprintf(stderr, "[bwa_index] Pack FASTA... ");
|
fprintf(stderr, "[bwa_index] Pack FASTA... ");
|
||||||
l_pac = bns_fasta2bntseq(fp, prefix);
|
l_pac = bns_fasta2bntseq(fp, prefix, 0);
|
||||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
gzclose(fp);
|
gzclose(fp);
|
||||||
} else { // color indexing
|
} else { // color indexing
|
||||||
|
|
@ -88,7 +88,7 @@ int bwa_index(int argc, char *argv[])
|
||||||
strcat(strcpy(str, prefix), ".nt");
|
strcat(strcpy(str, prefix), ".nt");
|
||||||
t = clock();
|
t = clock();
|
||||||
fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
|
fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
|
||||||
l_pac = bns_fasta2bntseq(fp, str);
|
l_pac = bns_fasta2bntseq(fp, str, 0);
|
||||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
gzclose(fp);
|
gzclose(fp);
|
||||||
{
|
{
|
||||||
|
|
@ -126,6 +126,14 @@ int bwa_index(int argc, char *argv[])
|
||||||
bwt_destroy(bwt);
|
bwt_destroy(bwt);
|
||||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
gzFile fp = xzopen(argv[optind], "r");
|
||||||
|
t = clock();
|
||||||
|
fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
|
||||||
|
l_pac = bns_fasta2bntseq(fp, prefix, 1);
|
||||||
|
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
|
||||||
|
gzclose(fp);
|
||||||
|
}
|
||||||
{
|
{
|
||||||
bwt_t *bwt;
|
bwt_t *bwt;
|
||||||
strcpy(str, prefix); strcat(str, ".bwt");
|
strcpy(str, prefix); strcat(str, ".bwt");
|
||||||
|
|
|
||||||
|
|
@ -314,7 +314,7 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n
|
||||||
// FIXME: this routine does not work if the query bridge three reference sequences
|
// FIXME: this routine does not work if the query bridge three reference sequences
|
||||||
int32_t coor, refl, lq;
|
int32_t coor, refl, lq;
|
||||||
int x, y, i, seqid;
|
int x, y, i, seqid;
|
||||||
bns_coor_pac2real(bns, p->k, p->len, &seqid);
|
bns_cnt_ambi(bns, p->k, p->len, &seqid);
|
||||||
coor = p->k - bns->anns[seqid].offset;
|
coor = p->k - bns->anns[seqid].offset;
|
||||||
refl = bns->anns[seqid].len;
|
refl = bns->anns[seqid].len;
|
||||||
x = coor; y = 0;
|
x = coor; y = 0;
|
||||||
|
|
@ -404,7 +404,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
|
||||||
int beg, end;
|
int beg, end;
|
||||||
if (p->l == 0) {
|
if (p->l == 0) {
|
||||||
b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]);
|
b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]);
|
||||||
nn = bns_coor_pac2real(bns, p->k, p->len, &seqid);
|
nn = bns_cnt_ambi(bns, p->k, p->len, &seqid);
|
||||||
coor = p->k - bns->anns[seqid].offset;
|
coor = p->k - bns->anns[seqid].offset;
|
||||||
}
|
}
|
||||||
ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10);
|
ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue