2011-01-14 09:52:12 +08:00
|
|
|
/* The MIT License
|
|
|
|
|
|
|
|
|
|
Copyright (c) 2008 Genome Research Ltd (GRL).
|
|
|
|
|
|
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining
|
|
|
|
|
a copy of this software and associated documentation files (the
|
|
|
|
|
"Software"), to deal in the Software without restriction, including
|
|
|
|
|
without limitation the rights to use, copy, modify, merge, publish,
|
|
|
|
|
distribute, sublicense, and/or sell copies of the Software, and to
|
|
|
|
|
permit persons to whom the Software is furnished to do so, subject to
|
|
|
|
|
the following conditions:
|
|
|
|
|
|
|
|
|
|
The above copyright notice and this permission notice shall be
|
|
|
|
|
included in all copies or substantial portions of the Software.
|
|
|
|
|
|
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
|
|
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
|
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
|
|
|
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
|
|
|
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
|
|
|
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
|
|
|
SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
|
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <zlib.h>
|
|
|
|
|
#include "bntseq.h"
|
|
|
|
|
#include "main.h"
|
|
|
|
|
#include "utils.h"
|
|
|
|
|
|
|
|
|
|
#include "kseq.h"
|
|
|
|
|
KSEQ_INIT(gzFile, gzread)
|
|
|
|
|
|
|
|
|
|
unsigned char nst_nt4_table[256] = {
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
|
|
|
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
void bns_dump(const bntseq_t *bns, const char *prefix)
|
|
|
|
|
{
|
|
|
|
|
char str[1024];
|
|
|
|
|
FILE *fp;
|
|
|
|
|
int i;
|
|
|
|
|
{ // dump .ann
|
|
|
|
|
strcpy(str, prefix); strcat(str, ".ann");
|
|
|
|
|
fp = xopen(str, "w");
|
|
|
|
|
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
|
|
|
|
|
for (i = 0; i != bns->n_seqs; ++i) {
|
|
|
|
|
bntann1_t *p = bns->anns + i;
|
|
|
|
|
fprintf(fp, "%d %s", p->gi, p->name);
|
|
|
|
|
if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
|
|
|
|
|
else fprintf(fp, "\n");
|
|
|
|
|
fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
|
|
|
|
|
}
|
|
|
|
|
fclose(fp);
|
|
|
|
|
}
|
|
|
|
|
{ // dump .amb
|
|
|
|
|
strcpy(str, prefix); strcat(str, ".amb");
|
|
|
|
|
fp = xopen(str, "w");
|
|
|
|
|
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
|
|
|
|
|
for (i = 0; i != bns->n_holes; ++i) {
|
|
|
|
|
bntamb1_t *p = bns->ambs + i;
|
|
|
|
|
fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
|
|
|
|
|
}
|
|
|
|
|
fclose(fp);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
|
|
|
|
|
{
|
|
|
|
|
char str[1024];
|
|
|
|
|
FILE *fp;
|
|
|
|
|
bntseq_t *bns;
|
|
|
|
|
long long xx;
|
|
|
|
|
int i;
|
|
|
|
|
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
|
|
|
|
|
{ // read .ann
|
|
|
|
|
fp = xopen(ann_filename, "r");
|
|
|
|
|
fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
|
|
|
|
|
bns->l_pac = xx;
|
|
|
|
|
bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
|
|
|
|
|
for (i = 0; i < bns->n_seqs; ++i) {
|
|
|
|
|
bntann1_t *p = bns->anns + i;
|
|
|
|
|
char *q = str;
|
|
|
|
|
int c;
|
|
|
|
|
// read gi and sequence name
|
|
|
|
|
fscanf(fp, "%u%s", &p->gi, str);
|
|
|
|
|
p->name = strdup(str);
|
|
|
|
|
// read fasta comments
|
|
|
|
|
while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
|
|
|
|
|
*q = 0;
|
|
|
|
|
if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
|
|
|
|
|
else p->anno = strdup("");
|
|
|
|
|
// read the rest
|
|
|
|
|
fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
|
|
|
|
|
p->offset = xx;
|
|
|
|
|
}
|
|
|
|
|
fclose(fp);
|
|
|
|
|
}
|
|
|
|
|
{ // read .amb
|
|
|
|
|
int64_t l_pac;
|
|
|
|
|
int32_t n_seqs;
|
|
|
|
|
fp = xopen(amb_filename, "r");
|
|
|
|
|
fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
|
|
|
|
|
l_pac = xx;
|
|
|
|
|
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
|
|
|
|
|
bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
|
|
|
|
|
for (i = 0; i < bns->n_holes; ++i) {
|
|
|
|
|
bntamb1_t *p = bns->ambs + i;
|
|
|
|
|
fscanf(fp, "%lld%d%s", &xx, &p->len, str);
|
|
|
|
|
p->offset = xx;
|
|
|
|
|
p->amb = str[0];
|
|
|
|
|
}
|
|
|
|
|
fclose(fp);
|
|
|
|
|
}
|
|
|
|
|
{ // open .pac
|
|
|
|
|
bns->fp_pac = xopen(pac_filename, "rb");
|
|
|
|
|
}
|
|
|
|
|
return bns;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bntseq_t *bns_restore(const char *prefix)
|
|
|
|
|
{
|
|
|
|
|
char ann_filename[1024], amb_filename[1024], pac_filename[1024];
|
|
|
|
|
strcat(strcpy(ann_filename, prefix), ".ann");
|
|
|
|
|
strcat(strcpy(amb_filename, prefix), ".amb");
|
|
|
|
|
strcat(strcpy(pac_filename, prefix), ".pac");
|
|
|
|
|
return bns_restore_core(ann_filename, amb_filename, pac_filename);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void bns_destroy(bntseq_t *bns)
|
|
|
|
|
{
|
|
|
|
|
if (bns == 0) return;
|
|
|
|
|
else {
|
|
|
|
|
int i;
|
|
|
|
|
if (bns->fp_pac) fclose(bns->fp_pac);
|
|
|
|
|
free(bns->ambs);
|
|
|
|
|
for (i = 0; i < bns->n_seqs; ++i) {
|
|
|
|
|
free(bns->anns[i].name);
|
|
|
|
|
free(bns->anns[i].anno);
|
|
|
|
|
}
|
|
|
|
|
free(bns->anns);
|
|
|
|
|
free(bns);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-10-20 23:53:44 +08:00
|
|
|
static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int *l_buf, int *m_seqs, int *m_holes, bntamb1_t **q)
|
|
|
|
|
{
|
|
|
|
|
bntann1_t *p;
|
|
|
|
|
int i, lasts;
|
|
|
|
|
if (bns->n_seqs == *m_seqs) {
|
|
|
|
|
*m_seqs <<= 1;
|
|
|
|
|
bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
|
|
|
|
|
}
|
|
|
|
|
p = bns->anns + bns->n_seqs;
|
|
|
|
|
p->name = strdup((char*)seq->name.s);
|
|
|
|
|
p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
|
|
|
|
|
p->gi = 0; p->len = seq->seq.l;
|
|
|
|
|
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
|
|
|
|
|
p->n_ambs = 0;
|
|
|
|
|
for (i = lasts = 0; i < seq->seq.l; ++i) {
|
2011-10-21 04:13:35 +08:00
|
|
|
int c = seq->seq.s[i];
|
2011-10-20 23:53:44 +08:00
|
|
|
if (c >= 4) { // N
|
|
|
|
|
if (lasts == seq->seq.s[i]) { // contiguous N
|
|
|
|
|
++(*q)->len;
|
|
|
|
|
} else {
|
|
|
|
|
if (bns->n_holes == *m_holes) {
|
|
|
|
|
(*m_holes) <<= 1;
|
|
|
|
|
bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
|
|
|
|
|
}
|
|
|
|
|
*q = bns->ambs + bns->n_holes;
|
|
|
|
|
(*q)->len = 1;
|
|
|
|
|
(*q)->offset = p->offset + i;
|
2011-10-25 22:03:57 +08:00
|
|
|
(*q)->amb = 'N';
|
2011-10-20 23:53:44 +08:00
|
|
|
++p->n_ambs;
|
|
|
|
|
++bns->n_holes;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
lasts = seq->seq.s[i];
|
|
|
|
|
{ // fill buffer
|
2011-10-25 22:03:57 +08:00
|
|
|
if (c >= 4) c = c>>4;
|
2011-10-20 23:53:44 +08:00
|
|
|
if (*l_buf == 0x40000) {
|
|
|
|
|
fwrite(buf, 1, 0x10000, fp);
|
|
|
|
|
memset(buf, 0, 0x10000);
|
|
|
|
|
*l_buf = 0;
|
|
|
|
|
}
|
|
|
|
|
buf[*l_buf>>2] |= c << ((3 - (*l_buf&3)) << 1);
|
|
|
|
|
++(*l_buf);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
++bns->n_seqs;
|
|
|
|
|
bns->l_pac += seq->seq.l;
|
|
|
|
|
}
|
|
|
|
|
|
2011-10-21 22:57:12 +08:00
|
|
|
int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
|
2011-01-14 09:52:12 +08:00
|
|
|
{
|
2011-10-21 00:09:35 +08:00
|
|
|
extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
|
2011-01-14 09:52:12 +08:00
|
|
|
kseq_t *seq;
|
|
|
|
|
char name[1024];
|
|
|
|
|
bntseq_t *bns;
|
|
|
|
|
unsigned char buf[0x10000];
|
2011-10-21 04:13:35 +08:00
|
|
|
int32_t i, l_buf, m_seqs, m_holes;
|
2011-06-10 05:33:25 +08:00
|
|
|
int64_t ret = -1;
|
2011-10-20 23:53:44 +08:00
|
|
|
bntamb1_t *q;
|
2011-01-14 09:52:12 +08:00
|
|
|
FILE *fp;
|
|
|
|
|
|
|
|
|
|
// initialization
|
|
|
|
|
seq = kseq_init(fp_fa);
|
|
|
|
|
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
|
|
|
|
|
bns->seed = 11; // fixed seed for random generator
|
|
|
|
|
srand48(bns->seed);
|
|
|
|
|
m_seqs = m_holes = 8;
|
|
|
|
|
bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
|
|
|
|
|
bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
|
|
|
|
|
q = bns->ambs;
|
|
|
|
|
l_buf = 0;
|
|
|
|
|
strcpy(name, prefix); strcat(name, ".pac");
|
|
|
|
|
fp = xopen(name, "wb");
|
|
|
|
|
memset(buf, 0, 0x10000);
|
|
|
|
|
// read sequences
|
2011-10-20 23:53:44 +08:00
|
|
|
while (kseq_read(seq) >= 0) {
|
2011-10-25 22:03:57 +08:00
|
|
|
for (i = 0; i < seq->seq.l; ++i) { // convert to 2-bit encoding
|
2011-10-21 04:13:35 +08:00
|
|
|
seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
|
2011-10-25 22:03:57 +08:00
|
|
|
if (seq->seq.s[i] > 3)
|
|
|
|
|
seq->seq.s[i] |= (lrand48()&3) << 4;
|
|
|
|
|
}
|
2011-10-20 23:53:44 +08:00
|
|
|
add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q);
|
2011-10-21 22:57:12 +08:00
|
|
|
if (!for_only) {
|
2011-10-25 22:03:57 +08:00
|
|
|
seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 0); // reversed but not complemented
|
|
|
|
|
for (i = 0; i < seq->seq.l; ++i) // complement
|
|
|
|
|
seq->seq.s[i] = seq->seq.s[i] < 4? 3 - seq->seq.s[i] : ((3 - (seq->seq.s[i]>>4)) << 4 | 4);
|
2011-10-21 22:57:12 +08:00
|
|
|
add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q);
|
|
|
|
|
}
|
2011-01-14 09:52:12 +08:00
|
|
|
}
|
|
|
|
|
xassert(bns->l_pac, "zero length sequence.");
|
2011-06-10 05:33:25 +08:00
|
|
|
ret = bns->l_pac;
|
2011-01-14 09:52:12 +08:00
|
|
|
{ // finalize .pac file
|
|
|
|
|
ubyte_t ct;
|
|
|
|
|
fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp);
|
|
|
|
|
// the following codes make the pac file size always (l_pac/4+1+1)
|
|
|
|
|
if (bns->l_pac % 4 == 0) {
|
|
|
|
|
ct = 0;
|
|
|
|
|
fwrite(&ct, 1, 1, fp);
|
|
|
|
|
}
|
|
|
|
|
ct = bns->l_pac % 4;
|
|
|
|
|
fwrite(&ct, 1, 1, fp);
|
|
|
|
|
// close .pac file
|
|
|
|
|
fclose(fp);
|
|
|
|
|
}
|
|
|
|
|
bns_dump(bns, prefix);
|
|
|
|
|
bns_destroy(bns);
|
|
|
|
|
kseq_destroy(seq);
|
2011-06-10 05:33:25 +08:00
|
|
|
return ret;
|
2011-01-14 09:52:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int bwa_fa2pac(int argc, char *argv[])
|
|
|
|
|
{
|
2011-10-21 22:57:12 +08:00
|
|
|
int c, for_only = 0;
|
2011-01-14 09:52:12 +08:00
|
|
|
gzFile fp;
|
2011-10-21 22:57:12 +08:00
|
|
|
while ((c = getopt(argc, argv, "f")) >= 0) {
|
|
|
|
|
switch (c) {
|
|
|
|
|
case 'f': for_only = 1; break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (argc == optind) {
|
|
|
|
|
fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
|
2011-01-14 09:52:12 +08:00
|
|
|
return 1;
|
|
|
|
|
}
|
2011-10-21 22:57:12 +08:00
|
|
|
fp = xzopen(argv[optind], "r");
|
|
|
|
|
bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only);
|
2011-01-14 09:52:12 +08:00
|
|
|
gzclose(fp);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2011-10-22 00:03:14 +08:00
|
|
|
int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev)
|
2011-01-14 09:52:12 +08:00
|
|
|
{
|
2011-10-22 00:03:14 +08:00
|
|
|
int left, mid, right;
|
|
|
|
|
is_fr = is_fr? 1 : 0;
|
2011-01-14 09:52:12 +08:00
|
|
|
left = 0; mid = 0; right = bns->n_seqs;
|
|
|
|
|
while (left < right) {
|
|
|
|
|
mid = (left + right) >> 1;
|
2011-10-22 00:03:14 +08:00
|
|
|
if (pos >= bns->anns[mid].offset<<is_fr) {
|
2011-01-14 09:52:12 +08:00
|
|
|
if (mid == bns->n_seqs - 1) break;
|
2011-10-22 00:03:14 +08:00
|
|
|
if (pos < bns->anns[mid+1].offset<<is_fr) break; // bracketed
|
2011-01-14 09:52:12 +08:00
|
|
|
left = mid + 1;
|
|
|
|
|
} else right = mid;
|
|
|
|
|
}
|
2011-10-22 00:03:14 +08:00
|
|
|
*ref_id = mid;
|
|
|
|
|
if (is_fr) { // then compute the forward-only position
|
|
|
|
|
bntann1_t *p = &bns->anns[mid];
|
|
|
|
|
if (pos - (p->offset<<1) < p->len) *is_rev = 0, pos -= p->offset;
|
|
|
|
|
else *is_rev = 1, pos = p->len - (pos - (p->offset<<1) - p->len) + p->offset;
|
|
|
|
|
}
|
|
|
|
|
return pos;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
|
|
|
|
|
{
|
|
|
|
|
int left, mid, right, nn;
|
|
|
|
|
if (ref_id) bns_pos2refId(bns, pos_f, 0, ref_id, 0);
|
2011-01-14 09:52:12 +08:00
|
|
|
left = 0; right = bns->n_holes; nn = 0;
|
|
|
|
|
while (left < right) {
|
2011-10-22 00:03:14 +08:00
|
|
|
mid = (left + right) >> 1;
|
|
|
|
|
if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
|
|
|
|
|
else if (pos_f + len <= bns->ambs[mid].offset) right = mid;
|
2011-01-14 09:52:12 +08:00
|
|
|
else { // overlap
|
2011-10-22 00:03:14 +08:00
|
|
|
if (pos_f >= bns->ambs[mid].offset) {
|
|
|
|
|
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
|
|
|
|
|
bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len;
|
2011-01-14 09:52:12 +08:00
|
|
|
} else {
|
2011-10-22 00:03:14 +08:00
|
|
|
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
|
|
|
|
|
bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f);
|
2011-01-14 09:52:12 +08:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return nn;
|
|
|
|
|
}
|