Temporarily adding changed bwa files to Sting repository until I can pull together the official bwa patch.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1975 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
eca0942644
commit
15e80aec3b
|
|
@ -0,0 +1,302 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008 Genome Research Ltd (GRL).
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <zlib.h>
|
||||
#include "bntseq.h"
|
||||
#include "main.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include "kseq.h"
|
||||
KSEQ_INIT(gzFile, gzread)
|
||||
|
||||
unsigned char nst_nt4_table[256] = {
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
|
||||
};
|
||||
|
||||
void bns_dump(const bntseq_t *bns, const char *prefix)
|
||||
{
|
||||
char str[1024];
|
||||
FILE *fp;
|
||||
int i;
|
||||
{ // dump .ann
|
||||
strcpy(str, prefix); strcat(str, ".ann");
|
||||
fp = xopen(str, "w");
|
||||
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
|
||||
for (i = 0; i != bns->n_seqs; ++i) {
|
||||
bntann1_t *p = bns->anns + i;
|
||||
fprintf(fp, "%d %s", p->gi, p->name);
|
||||
if (p->anno[0]) fprintf(fp, " %s\n", p->anno);
|
||||
else fprintf(fp, "\n");
|
||||
fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
{ // dump .amb
|
||||
strcpy(str, prefix); strcat(str, ".amb");
|
||||
fp = xopen(str, "w");
|
||||
fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
|
||||
for (i = 0; i != bns->n_holes; ++i) {
|
||||
bntamb1_t *p = bns->ambs + i;
|
||||
fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
}
|
||||
|
||||
bntseq_t *bns_restore(const char *prefix)
|
||||
{
|
||||
char ann_filename[1024], amb_filename[1024], pac_filename[1024];
|
||||
strcpy(ann_filename,prefix);
|
||||
strcat(ann_filename,".ann");
|
||||
strcpy(amb_filename,prefix);
|
||||
strcat(amb_filename,".amb");
|
||||
strcpy(pac_filename,prefix);
|
||||
strcat(pac_filename,".pac");
|
||||
|
||||
return bns_restore_core(ann_filename, amb_filename, pac_filename);
|
||||
}
|
||||
|
||||
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
|
||||
{
|
||||
char buffer[1024];
|
||||
FILE *fp;
|
||||
bntseq_t *bns;
|
||||
int i;
|
||||
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
|
||||
{ // read .ann
|
||||
fp = xopen(ann_filename, "r");
|
||||
fscanf(fp, "%lld%d%u", &bns->l_pac, &bns->n_seqs, &bns->seed);
|
||||
bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
bntann1_t *p = bns->anns + i;
|
||||
char *q = buffer;
|
||||
int c;
|
||||
// read gi and sequence name
|
||||
fscanf(fp, "%u%s", &p->gi, buffer);
|
||||
p->name = strdup(buffer);
|
||||
// read fasta comments
|
||||
while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
|
||||
*q = 0;
|
||||
if (q - buffer > 1) p->anno = strdup(buffer + 1); // skip leading space
|
||||
else p->anno = strdup("");
|
||||
// read the rest
|
||||
fscanf(fp, "%lld%d%d", &p->offset, &p->len, &p->n_ambs);
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
{ // read .amb
|
||||
int64_t l_pac;
|
||||
int32_t n_seqs;
|
||||
fp = xopen(amb_filename, "r");
|
||||
fscanf(fp, "%lld%d%d", &l_pac, &n_seqs, &bns->n_holes);
|
||||
xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
|
||||
bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t));
|
||||
for (i = 0; i < bns->n_holes; ++i) {
|
||||
bntamb1_t *p = bns->ambs + i;
|
||||
fscanf(fp, "%lld%d%s", &p->offset, &p->len, buffer);
|
||||
p->amb = buffer[0];
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
{ // open .pac
|
||||
bns->fp_pac = xopen(pac_filename, "rb");
|
||||
}
|
||||
return bns;
|
||||
}
|
||||
|
||||
void bns_destroy(bntseq_t *bns)
|
||||
{
|
||||
if (bns == 0) return;
|
||||
else {
|
||||
int i;
|
||||
if (bns->fp_pac) fclose(bns->fp_pac);
|
||||
free(bns->ambs);
|
||||
for (i = 0; i < bns->n_seqs; ++i) {
|
||||
free(bns->anns[i].name);
|
||||
free(bns->anns[i].anno);
|
||||
}
|
||||
free(bns->anns);
|
||||
free(bns);
|
||||
}
|
||||
}
|
||||
|
||||
void bns_fasta2bntseq(gzFile fp_fa, const char *prefix)
|
||||
{
|
||||
kseq_t *seq;
|
||||
char name[1024];
|
||||
bntseq_t *bns;
|
||||
bntamb1_t *q;
|
||||
int l_buf;
|
||||
unsigned char buf[0x10000];
|
||||
int32_t m_seqs, m_holes, l, i;
|
||||
FILE *fp;
|
||||
|
||||
// initialization
|
||||
seq = kseq_init(fp_fa);
|
||||
bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
|
||||
bns->seed = 11; // fixed seed for random generator
|
||||
srand48(bns->seed);
|
||||
m_seqs = m_holes = 8;
|
||||
bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
|
||||
bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
|
||||
q = bns->ambs;
|
||||
l_buf = 0;
|
||||
strcpy(name, prefix); strcat(name, ".pac");
|
||||
fp = xopen(name, "wb");
|
||||
memset(buf, 0, 0x10000);
|
||||
// read sequences
|
||||
while ((l = kseq_read(seq)) >= 0) {
|
||||
bntann1_t *p;
|
||||
int lasts;
|
||||
if (bns->n_seqs == m_seqs) {
|
||||
m_seqs <<= 1;
|
||||
bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t));
|
||||
}
|
||||
p = bns->anns + bns->n_seqs;
|
||||
p->name = strdup((char*)seq->name.s);
|
||||
p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
|
||||
p->gi = 0; p->len = l;
|
||||
p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
|
||||
p->n_ambs = 0;
|
||||
for (i = 0, lasts = 0; i < l; ++i) {
|
||||
int c = nst_nt4_table[(int)seq->seq.s[i]];
|
||||
if (c >= 4) { // N
|
||||
if (lasts == seq->seq.s[i]) { // contiguous N
|
||||
++q->len;
|
||||
} else {
|
||||
if (bns->n_holes == m_holes) {
|
||||
m_holes <<= 1;
|
||||
bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t));
|
||||
}
|
||||
q = bns->ambs + bns->n_holes;
|
||||
q->len = 1;
|
||||
q->offset = p->offset + i;
|
||||
q->amb = seq->seq.s[i];
|
||||
++p->n_ambs;
|
||||
++bns->n_holes;
|
||||
}
|
||||
}
|
||||
lasts = seq->seq.s[i];
|
||||
{ // fill buffer
|
||||
if (c >= 4) c = lrand48()&0x3;
|
||||
if (l_buf == 0x40000) {
|
||||
fwrite(buf, 1, 0x10000, fp);
|
||||
memset(buf, 0, 0x10000);
|
||||
l_buf = 0;
|
||||
}
|
||||
buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1);
|
||||
++l_buf;
|
||||
}
|
||||
}
|
||||
++bns->n_seqs;
|
||||
bns->l_pac += seq->seq.l;
|
||||
}
|
||||
xassert(bns->l_pac, "zero length sequence.");
|
||||
{ // finalize .pac file
|
||||
ubyte_t ct;
|
||||
fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp);
|
||||
// the following codes make the pac file size always (l_pac/4+1+1)
|
||||
if (bns->l_pac % 4 == 0) {
|
||||
ct = 0;
|
||||
fwrite(&ct, 1, 1, fp);
|
||||
}
|
||||
ct = bns->l_pac % 4;
|
||||
fwrite(&ct, 1, 1, fp);
|
||||
// close .pac file
|
||||
fclose(fp);
|
||||
}
|
||||
bns_dump(bns, prefix);
|
||||
bns_destroy(bns);
|
||||
kseq_destroy(seq);
|
||||
}
|
||||
|
||||
int bwa_fa2pac(int argc, char *argv[])
|
||||
{
|
||||
gzFile fp;
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage: bwa fa2pac <in.fasta> [<out.prefix>]\n");
|
||||
return 1;
|
||||
}
|
||||
fp = xzopen(argv[1], "r");
|
||||
bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]);
|
||||
gzclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq)
|
||||
{
|
||||
int left, mid, right, nn;
|
||||
if (pac_coor >= bns->l_pac)
|
||||
err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac);
|
||||
// binary search for the sequence ID. Note that this is a bit different from the following one...
|
||||
left = 0; mid = 0; right = bns->n_seqs;
|
||||
while (left < right) {
|
||||
mid = (left + right) >> 1;
|
||||
if (pac_coor >= bns->anns[mid].offset) {
|
||||
if (mid == bns->n_seqs - 1) break;
|
||||
if (pac_coor < bns->anns[mid+1].offset) break;
|
||||
left = mid + 1;
|
||||
} else right = mid;
|
||||
}
|
||||
*real_seq = mid;
|
||||
// binary search for holes
|
||||
left = 0; right = bns->n_holes; nn = 0;
|
||||
while (left < right) {
|
||||
int64_t mid = (left + right) >> 1;
|
||||
if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
|
||||
else if (pac_coor + len <= bns->ambs[mid].offset) right = mid;
|
||||
else { // overlap
|
||||
if (pac_coor >= bns->ambs[mid].offset) {
|
||||
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
|
||||
bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len;
|
||||
} else {
|
||||
nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len?
|
||||
bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return nn;
|
||||
}
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008 Genome Research Ltd (GRL).
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/* Contact: Heng Li <lh3@sanger.ac.uk> */
|
||||
|
||||
#ifndef BWT_BNTSEQ_H
|
||||
#define BWT_BNTSEQ_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <zlib.h>
|
||||
|
||||
#ifndef BWA_UBYTE
|
||||
#define BWA_UBYTE
|
||||
typedef uint8_t ubyte_t;
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
int64_t offset;
|
||||
int32_t len;
|
||||
int32_t n_ambs;
|
||||
uint32_t gi;
|
||||
char *name, *anno;
|
||||
} bntann1_t;
|
||||
|
||||
typedef struct {
|
||||
int64_t offset;
|
||||
int32_t len;
|
||||
char amb;
|
||||
} bntamb1_t;
|
||||
|
||||
typedef struct {
|
||||
int64_t l_pac;
|
||||
int32_t n_seqs;
|
||||
uint32_t seed;
|
||||
bntann1_t *anns; // n_seqs elements
|
||||
int32_t n_holes;
|
||||
bntamb1_t *ambs; // n_holes elements
|
||||
FILE *fp_pac;
|
||||
} bntseq_t;
|
||||
|
||||
extern unsigned char nst_nt4_table[256];
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void bns_dump(const bntseq_t *bns, const char *prefix);
|
||||
bntseq_t *bns_restore(const char *prefix);
|
||||
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
|
||||
void bns_destroy(bntseq_t *bns);
|
||||
void bns_fasta2bntseq(gzFile fp_fa, const char *prefix);
|
||||
int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,597 @@
|
|||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include "bwase.h"
|
||||
#include "stdaln.h"
|
||||
#include "bntseq.h"
|
||||
#include "utils.h"
|
||||
#include "kstring.h"
|
||||
|
||||
static int g_log_n[256];
|
||||
|
||||
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
|
||||
{
|
||||
int i, cnt, best;
|
||||
if (n_aln == 0) {
|
||||
s->type = BWA_TYPE_NO_MATCH;
|
||||
s->c1 = s->c2 = 0;
|
||||
return;
|
||||
}
|
||||
best = aln[0].score;
|
||||
for (i = cnt = 0; i < n_aln; ++i) {
|
||||
const bwt_aln1_t *p = aln + i;
|
||||
if (p->score > best) break;
|
||||
if (drand48() * (p->l - p->k + 1) > (double)cnt) {
|
||||
s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a;
|
||||
s->score = p->score;
|
||||
s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
|
||||
}
|
||||
cnt += p->l - p->k + 1;
|
||||
}
|
||||
s->c1 = cnt;
|
||||
for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1;
|
||||
s->c2 = cnt - s->c1;
|
||||
s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
|
||||
}
|
||||
|
||||
int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
|
||||
{
|
||||
int n;
|
||||
if (p->c1 == 0) return 23;
|
||||
if (p->c1 > 1) return 0;
|
||||
if (p->n_mm == mm) return 25;
|
||||
if (p->c2 == 0) return 37;
|
||||
n = (p->c2 >= 255)? 255 : p->c2;
|
||||
return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
|
||||
}
|
||||
|
||||
void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
|
||||
{
|
||||
int i;
|
||||
char str[1024];
|
||||
bwt_t *bwt;
|
||||
// load forward SA
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
bwa_seq_t *p = seqs + i;
|
||||
if(p->strand) // reverse strand only
|
||||
bwa_cal_pac_pos_core(bwt, NULL, p, max_mm, fnr);
|
||||
}
|
||||
bwt_destroy(bwt);
|
||||
// load reverse BWT and SA
|
||||
strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt);
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
bwa_seq_t *p = seqs + i;
|
||||
if(!p->strand) // forward strand only
|
||||
bwa_cal_pac_pos_core(NULL, bwt, p, max_mm, fnr);
|
||||
}
|
||||
bwt_destroy(bwt);
|
||||
}
|
||||
|
||||
/**
|
||||
* Derive the actual position in the read from the given suffix array coordinates.
|
||||
* Note that the position will be approximate based on whether indels appear in the
|
||||
* read and whether calculations are performed from the start or end of the read.
|
||||
*/
|
||||
void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr) {
|
||||
if(seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT)
|
||||
return;
|
||||
|
||||
int max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
|
||||
if(seq->strand) { // reverse strand only
|
||||
seq->pos = bwt_sa(forward_bwt, seq->sa);
|
||||
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
||||
}
|
||||
else { // forward strand only
|
||||
/* NB: For gapped alignment, p->pos may not be correct,
|
||||
* which will be fixed in refine_gapped_core(). This
|
||||
* line also determines the way "x" is calculated in
|
||||
* refine_gapped_core() when (ext < 0 && is_end == 0). */
|
||||
seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len);
|
||||
seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
|
||||
}
|
||||
}
|
||||
|
||||
/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on
|
||||
* forward strand. This happens when p->pos is calculated by
|
||||
* bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct
|
||||
* coordinate. This happens only for color-converted alignment. */
|
||||
static uint16_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos,
|
||||
int ext, int *n_cigar, int is_end_correct)
|
||||
{
|
||||
uint16_t *cigar = 0;
|
||||
ubyte_t *ref_seq;
|
||||
int l = 0, path_len, ref_len;
|
||||
AlnParam ap = aln_param_bwa;
|
||||
path_t *path;
|
||||
int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos;
|
||||
|
||||
ref_len = len + abs(ext);
|
||||
if (ext > 0) {
|
||||
ref_seq = (ubyte_t*)calloc(ref_len, 1);
|
||||
for (k = __pos; k < __pos + ref_len && k < l_pac; ++k)
|
||||
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
||||
} else {
|
||||
int64_t x = __pos + (is_end_correct? len : ref_len);
|
||||
ref_seq = (ubyte_t*)calloc(ref_len, 1);
|
||||
for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k)
|
||||
ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
|
||||
}
|
||||
path = (path_t*)calloc(l+len, sizeof(path_t));
|
||||
|
||||
aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len);
|
||||
cigar = aln_path2cigar(path, path_len, n_cigar);
|
||||
|
||||
if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand
|
||||
for (l = k = 0; k < *n_cigar; ++k) {
|
||||
if (cigar[k]>>14 == FROM_D) l -= cigar[k]&0x3fff;
|
||||
else if (cigar[k]>>14 == FROM_I) l += cigar[k]&0x3fff;
|
||||
}
|
||||
__pos += l;
|
||||
}
|
||||
|
||||
if (cigar[0]>>14 == FROM_D) { // deletion at the 5'-end
|
||||
__pos += cigar[0]&0x3fff;
|
||||
for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1];
|
||||
--(*n_cigar);
|
||||
}
|
||||
if (cigar[*n_cigar-1]>>14 == FROM_D) --(*n_cigar); // deletion at the 3'-end
|
||||
|
||||
// change "I" at either end of the read to S. just in case. This should rarely happen...
|
||||
if (cigar[*n_cigar-1]>>14 == FROM_I) cigar[*n_cigar-1] = 3<<14 | (cigar[*n_cigar-1]&0x3fff);
|
||||
if (cigar[0]>>14 == FROM_I) cigar[0] = 3<<14 | (cigar[0]&0x3fff);
|
||||
|
||||
*_pos = (bwtint_t)__pos;
|
||||
free(ref_seq); free(path);
|
||||
return cigar;
|
||||
}
|
||||
|
||||
char *bwa_cal_md1(int n_cigar, uint16_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
|
||||
bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
|
||||
{
|
||||
bwtint_t x, y;
|
||||
int z, u, c, nm = 0;
|
||||
str->l = 0; // reset
|
||||
x = pos; y = 0;
|
||||
if (cigar) {
|
||||
int k, l;
|
||||
for (k = u = 0; k < n_cigar; ++k) {
|
||||
l = cigar[k]&0x3fff;
|
||||
if (cigar[k]>>14 == FROM_M) {
|
||||
for (z = 0; z < l && x+z < l_pac; ++z) {
|
||||
c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
|
||||
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
|
||||
ksprintf(str, "%d", u);
|
||||
kputc("ACGTN"[c], str);
|
||||
++nm;
|
||||
u = 0;
|
||||
} else ++u;
|
||||
}
|
||||
x += l; y += l;
|
||||
} else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) {
|
||||
y += l; nm += l;
|
||||
} else if (cigar[k]>>14 == FROM_D) {
|
||||
ksprintf(str, "%d", u);
|
||||
kputc('^', str);
|
||||
for (z = 0; z < l && x+z < l_pac; ++z)
|
||||
kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
|
||||
u = 0;
|
||||
x += l; nm += l;
|
||||
}
|
||||
}
|
||||
} else { // no gaps
|
||||
for (z = u = 0; z < (bwtint_t)len; ++z) {
|
||||
c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
|
||||
if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
|
||||
ksprintf(str, "%d", u);
|
||||
kputc("ACGTN"[c], str);
|
||||
++nm;
|
||||
u = 0;
|
||||
} else ++u;
|
||||
}
|
||||
}
|
||||
ksprintf(str, "%d", u);
|
||||
*_nm = nm;
|
||||
return strdup(str->s);
|
||||
}
|
||||
|
||||
void bwa_correct_trimmed(bwa_seq_t *s)
|
||||
{
|
||||
if (s->len == s->full_len) return;
|
||||
if (s->strand == 0) { // forward
|
||||
if (s->cigar && s->cigar[s->n_cigar-1]>>14 == 3) { // the last is S
|
||||
s->cigar[s->n_cigar-1] += s->full_len - s->len;
|
||||
} else {
|
||||
if (s->cigar == 0) {
|
||||
s->n_cigar = 2;
|
||||
s->cigar = calloc(s->n_cigar, 2);
|
||||
s->cigar[0] = 0<<14 | s->len;
|
||||
} else {
|
||||
++s->n_cigar;
|
||||
s->cigar = realloc(s->cigar, s->n_cigar * 2);
|
||||
}
|
||||
s->cigar[s->n_cigar-1] = 3<<14 | (s->full_len - s->len);
|
||||
}
|
||||
} else { // reverse
|
||||
if (s->cigar && s->cigar[0]>>14 == 3) { // the first is S
|
||||
s->cigar[0] += s->full_len - s->len;
|
||||
} else {
|
||||
if (s->cigar == 0) {
|
||||
s->n_cigar = 2;
|
||||
s->cigar = calloc(s->n_cigar, 2);
|
||||
s->cigar[1] = 0<<14 | s->len;
|
||||
} else {
|
||||
++s->n_cigar;
|
||||
s->cigar = realloc(s->cigar, s->n_cigar * 2);
|
||||
memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * 2);
|
||||
}
|
||||
s->cigar[0] = 3<<14 | (s->full_len - s->len);
|
||||
}
|
||||
}
|
||||
s->len = s->full_len;
|
||||
}
|
||||
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
|
||||
{
|
||||
ubyte_t *pacseq, *ntpac = 0;
|
||||
int i;
|
||||
kstring_t *str;
|
||||
|
||||
if (ntbns) { // in color space
|
||||
ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
|
||||
rewind(ntbns->fp_pac);
|
||||
fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
|
||||
}
|
||||
|
||||
if (!_pacseq) {
|
||||
pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
|
||||
rewind(bns->fp_pac);
|
||||
fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
|
||||
} else pacseq = _pacseq;
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
bwa_seq_t *s = seqs + i;
|
||||
seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
|
||||
if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
|
||||
s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
|
||||
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
|
||||
}
|
||||
|
||||
if (ntbns) { // in color space
|
||||
for (i = 0; i < n_seqs; ++i) {
|
||||
bwa_seq_t *s = seqs + i;
|
||||
bwa_cs2nt_core(s, bns->l_pac, ntpac);
|
||||
if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
|
||||
free(s->cigar);
|
||||
s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
|
||||
(s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// generate MD tag
|
||||
str = (kstring_t*)calloc(1, sizeof(kstring_t));
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
bwa_seq_t *s = seqs + i;
|
||||
if (s->type != BWA_TYPE_NO_MATCH) {
|
||||
int nm;
|
||||
s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
|
||||
bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
|
||||
s->nm = nm;
|
||||
}
|
||||
}
|
||||
free(str->s); free(str);
|
||||
|
||||
// correct for trimmed reads
|
||||
for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
|
||||
|
||||
if (!_pacseq) free(pacseq);
|
||||
free(ntpac);
|
||||
}
|
||||
|
||||
int64_t pos_end(const bwa_seq_t *p)
|
||||
{
|
||||
if (p->cigar) {
|
||||
int j;
|
||||
int64_t x = p->pos;
|
||||
for (j = 0; j != p->n_cigar; ++j) {
|
||||
int op = p->cigar[j]>>14;
|
||||
if (op == 0 || op == 2) x += p->cigar[j]&0x3fff;
|
||||
}
|
||||
return x;
|
||||
} else return p->pos + p->len;
|
||||
}
|
||||
|
||||
static int64_t pos_5(const bwa_seq_t *p)
|
||||
{
|
||||
if (p->type != BWA_TYPE_NO_MATCH)
|
||||
return p->strand? pos_end(p) : p->pos;
|
||||
return -1;
|
||||
}
|
||||
|
||||
void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
|
||||
{
|
||||
int j;
|
||||
if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
|
||||
int seqid, nn, am = 0, flag = p->extra_flag;
|
||||
char XT;
|
||||
|
||||
if (p->type == BWA_TYPE_NO_MATCH) {
|
||||
p->pos = mate->pos;
|
||||
p->strand = mate->strand;
|
||||
flag |= SAM_FSU;
|
||||
j = 1;
|
||||
} else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
|
||||
|
||||
// get seqid
|
||||
nn = bns_coor_pac2real(bns, p->pos, j, &seqid);
|
||||
|
||||
// update flag and print it
|
||||
if (p->strand) flag |= SAM_FSR;
|
||||
if (mate) {
|
||||
if (mate->type != BWA_TYPE_NO_MATCH) {
|
||||
if (mate->strand) flag |= SAM_FMR;
|
||||
} else flag |= SAM_FMU;
|
||||
}
|
||||
printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
|
||||
printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);
|
||||
|
||||
// print CIGAR
|
||||
if (p->cigar) {
|
||||
for (j = 0; j != p->n_cigar; ++j)
|
||||
printf("%d%c", p->cigar[j]&0x3fff, "MIDS"[p->cigar[j]>>14]);
|
||||
} else if (p->type == BWA_TYPE_NO_MATCH) printf("*");
|
||||
else printf("%dM", p->len);
|
||||
|
||||
// print mate coordinate
|
||||
if (mate && mate->type != BWA_TYPE_NO_MATCH) {
|
||||
int m_seqid, m_is_N;
|
||||
long long isize;
|
||||
am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
|
||||
// redundant calculation here, but should not matter too much
|
||||
m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid);
|
||||
printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
|
||||
isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
|
||||
if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
|
||||
printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
|
||||
} else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
|
||||
else printf("\t*\t0\t0\t");
|
||||
|
||||
// print sequence and quality
|
||||
if (p->strand == 0)
|
||||
for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]);
|
||||
else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]);
|
||||
putchar('\t');
|
||||
if (p->qual) {
|
||||
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
|
||||
printf("%s", p->qual);
|
||||
} else printf("*");
|
||||
|
||||
if (p->type != BWA_TYPE_NO_MATCH) {
|
||||
// calculate XT tag
|
||||
XT = "NURM"[p->type];
|
||||
if (nn > 10) XT = 'N';
|
||||
// print tags
|
||||
printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
|
||||
if (nn) printf("\tXN:i:%d", nn);
|
||||
if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
|
||||
if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
|
||||
printf("\tX0:i:%d", p->c1);
|
||||
if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2);
|
||||
}
|
||||
printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
|
||||
if (p->md) printf("\tMD:Z:%s", p->md);
|
||||
}
|
||||
putchar('\n');
|
||||
} else { // this read has no match
|
||||
ubyte_t *s = p->strand? p->rseq : p->seq;
|
||||
int flag = p->extra_flag | SAM_FSU;
|
||||
if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
|
||||
printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
|
||||
for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
|
||||
putchar('\t');
|
||||
if (p->qual) {
|
||||
if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
|
||||
printf("%s", p->qual);
|
||||
} else printf("*");
|
||||
putchar('\n');
|
||||
}
|
||||
}
|
||||
|
||||
bntseq_t *bwa_open_nt(const char *prefix)
|
||||
{
|
||||
bntseq_t *ntbns;
|
||||
char *str;
|
||||
str = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
strcat(strcpy(str, prefix), ".nt");
|
||||
ntbns = bns_restore(str);
|
||||
free(str);
|
||||
return ntbns;
|
||||
}
|
||||
|
||||
void bwa_print_sam_SQ(const bntseq_t *bns)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < bns->n_seqs; ++i)
|
||||
printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
|
||||
}
|
||||
|
||||
void bwase_initialize()
|
||||
{
|
||||
int i;
|
||||
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
||||
}
|
||||
|
||||
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa)
|
||||
{
|
||||
int i, n_seqs, tot_seqs = 0, m_aln;
|
||||
bwt_aln1_t *aln = 0;
|
||||
bwa_seq_t *seqs;
|
||||
bwa_seqio_t *ks;
|
||||
clock_t t;
|
||||
bntseq_t *bns, *ntbns = 0;
|
||||
FILE *fp_sa;
|
||||
gap_opt_t opt;
|
||||
|
||||
// initialization
|
||||
bwase_initialize();
|
||||
for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
|
||||
bns = bns_restore(prefix);
|
||||
srand48(bns->seed);
|
||||
ks = bwa_seq_open(fn_fa);
|
||||
fp_sa = xopen(fn_sa, "r");
|
||||
|
||||
// core loop
|
||||
m_aln = 0;
|
||||
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
||||
if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
|
||||
ntbns = bwa_open_nt(prefix);
|
||||
bwa_print_sam_SQ(bns);
|
||||
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) {
|
||||
tot_seqs += n_seqs;
|
||||
t = clock();
|
||||
|
||||
// read alignment
|
||||
for (i = 0; i < n_seqs; ++i) {
|
||||
bwa_seq_t *p = seqs + i;
|
||||
int n_aln;
|
||||
fread(&n_aln, 4, 1, fp_sa);
|
||||
if (n_aln > m_aln) {
|
||||
m_aln = n_aln;
|
||||
aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
|
||||
}
|
||||
fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
|
||||
bwa_aln2seq(n_aln, aln, p);
|
||||
}
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
|
||||
bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
|
||||
bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] print alignments... ");
|
||||
for (i = 0; i < n_seqs; ++i)
|
||||
bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
bwa_free_read_seq(n_seqs, seqs);
|
||||
fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
|
||||
}
|
||||
|
||||
// destroy
|
||||
bwa_seq_close(ks);
|
||||
if (ntbns) bns_destroy(ntbns);
|
||||
bns_destroy(bns);
|
||||
fclose(fp_sa);
|
||||
free(aln);
|
||||
}
|
||||
|
||||
static void print_aln_simple(bwt_t *const bwt[2], const bntseq_t *bns, const bwt_aln1_t *q, int len, bwtint_t k)
|
||||
{
|
||||
bwtint_t pos;
|
||||
int seqid, is_N;
|
||||
pos = q->a? bwt_sa(bwt[0], k) : bwt[1]->seq_len - (bwt_sa(bwt[1], k) + len);
|
||||
if (pos > bwt[1]->seq_len) pos = 0; // negative
|
||||
is_N = bns_coor_pac2real(bns, pos, len, &seqid);
|
||||
printf("%s\t%c%d\t%d\n", bns->anns[seqid].name, "+-"[q->a], (int)(pos - bns->anns[seqid].offset + 1),
|
||||
q->n_mm + q->n_gapo + q->n_gape);
|
||||
}
|
||||
|
||||
void bwa_print_all_hits(const char *prefix, const char *fn_sa, const char *fn_fa, int max_occ)
|
||||
{
|
||||
int i, n_seqs, tot_seqs = 0, m_aln;
|
||||
bwt_aln1_t *aln = 0;
|
||||
bwa_seq_t *seqs;
|
||||
bwa_seqio_t *ks;
|
||||
bntseq_t *bns;
|
||||
FILE *fp_sa;
|
||||
gap_opt_t opt;
|
||||
bwt_t *bwt[2];
|
||||
|
||||
bns = bns_restore(prefix);
|
||||
srand48(bns->seed);
|
||||
ks = bwa_seq_open(fn_fa);
|
||||
fp_sa = xopen(fn_sa, "r");
|
||||
{ // load BWT
|
||||
char *str = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]);
|
||||
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]);
|
||||
free(str);
|
||||
}
|
||||
|
||||
m_aln = 0;
|
||||
fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
|
||||
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) {
|
||||
tot_seqs += n_seqs;
|
||||
for (i = 0; i < n_seqs; ++i) {
|
||||
bwa_seq_t *p = seqs + i;
|
||||
int n_aln, n_occ, k, rest, len;
|
||||
len = p->len;
|
||||
fread(&n_aln, 4, 1, fp_sa);
|
||||
if (n_aln > m_aln) {
|
||||
m_aln = n_aln;
|
||||
aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
|
||||
}
|
||||
fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
|
||||
for (k = n_occ = 0; k < n_aln; ++k) {
|
||||
const bwt_aln1_t *q = aln + k;
|
||||
n_occ += q->l - q->k + 1;
|
||||
}
|
||||
rest = n_occ > max_occ? max_occ : n_occ;
|
||||
printf(">%s %d %d\n", p->name, rest, n_occ);
|
||||
for (k = 0; k < n_aln; ++k) {
|
||||
const bwt_aln1_t *q = aln + k;
|
||||
if (q->l - q->k + 1 <= rest) {
|
||||
bwtint_t l;
|
||||
for (l = q->k; l <= q->l; ++l)
|
||||
print_aln_simple(bwt, bns, q, len, l);
|
||||
rest -= q->l - q->k + 1;
|
||||
} else { // See also: http://code.activestate.com/recipes/272884/
|
||||
int j, i, k;
|
||||
for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
|
||||
double p = 1.0, x = drand48();
|
||||
while (x < p) p -= p * j / (i--);
|
||||
print_aln_simple(bwt, bns, q, len, q->l - i);
|
||||
}
|
||||
rest = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
bwa_free_read_seq(n_seqs, seqs);
|
||||
}
|
||||
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
||||
bwa_seq_close(ks);
|
||||
bns_destroy(bns);
|
||||
fclose(fp_sa);
|
||||
free(aln);
|
||||
}
|
||||
|
||||
int bwa_sai2sam_se(int argc, char *argv[])
|
||||
{
|
||||
int c, n_occ = 0;
|
||||
while ((c = getopt(argc, argv, "hn:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'h': break;
|
||||
case 'n': n_occ = atoi(optarg); break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (optind + 3 > argc) {
|
||||
fprintf(stderr, "Usage: bwa samse [-n max_occ] <prefix> <in.sai> <in.fq>\n");
|
||||
return 1;
|
||||
}
|
||||
if (n_occ > 0) bwa_print_all_hits(argv[optind], argv[optind+1], argv[optind+2], n_occ);
|
||||
else bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2]);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008 Genome Research Ltd (GRL).
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BWASE_H
|
||||
#define BWASE_H
|
||||
|
||||
#include "bntseq.h"
|
||||
#include "bwt.h"
|
||||
#include "bwtaln.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Initialize mapping tables in the bwa single-end mapper.
|
||||
void bwase_initialize();
|
||||
// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
|
||||
void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
|
||||
// Refine the approximate position of the sequence to an actual placement for the sequence.
|
||||
void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
|
||||
// Backfill certain alignment properties mainly centering around number of matches.
|
||||
void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
|
||||
// Calculate the end position of a read given a certain sequence.
|
||||
int64_t pos_end(const bwa_seq_t *p);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // BWASE_H
|
||||
|
|
@ -0,0 +1,295 @@
|
|||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <stdint.h>
|
||||
#include "bwtaln.h"
|
||||
#include "bwtgap.h"
|
||||
#include "utils.h"
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
#define THREAD_BLOCK_SIZE 1024
|
||||
#include <pthread.h>
|
||||
static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#endif
|
||||
|
||||
gap_opt_t *gap_init_opt()
|
||||
{
|
||||
gap_opt_t *o;
|
||||
o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
|
||||
/* IMPORTANT: s_mm*10 should be about the average base error
|
||||
rate. Voilating this requirement will break pairing! */
|
||||
o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
|
||||
o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
|
||||
o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
|
||||
o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
|
||||
o->seed_len = 0x7fffffff; o->max_seed_diff = 2;
|
||||
o->fnr = 0.04;
|
||||
o->n_threads = 1;
|
||||
o->max_top2 = 30;
|
||||
o->trim_qual = 0;
|
||||
return o;
|
||||
}
|
||||
|
||||
int bwa_cal_maxdiff(int l, double err, double thres)
|
||||
{
|
||||
double elambda = exp(-l * err);
|
||||
double sum, y = 1.0;
|
||||
int k, x = 1;
|
||||
for (k = 1, sum = elambda; k < 1000; ++k) {
|
||||
y *= l * err;
|
||||
x *= k;
|
||||
sum += elambda * y / x;
|
||||
if (1.0 - sum < thres) return k;
|
||||
}
|
||||
return 2;
|
||||
}
|
||||
|
||||
// width must be filled as zero
|
||||
static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width)
|
||||
{
|
||||
bwtint_t k, l, ok, ol;
|
||||
int i, bid;
|
||||
bid = 0;
|
||||
k = 0; l = rbwt->seq_len;
|
||||
for (i = 0; i < len; ++i) {
|
||||
ubyte_t c = str[i];
|
||||
if (c < 4) {
|
||||
bwt_2occ(rbwt, k - 1, l, c, &ok, &ol);
|
||||
k = rbwt->L2[c] + ok + 1;
|
||||
l = rbwt->L2[c] + ol;
|
||||
}
|
||||
if (k > l || c > 3) { // then restart
|
||||
k = 0;
|
||||
l = rbwt->seq_len;
|
||||
++bid;
|
||||
}
|
||||
width[i].w = l - k + 1;
|
||||
width[i].bid = bid;
|
||||
}
|
||||
width[len].w = 0;
|
||||
width[len].bid = ++bid;
|
||||
return bid;
|
||||
}
|
||||
|
||||
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
|
||||
{
|
||||
int i, max_l = 0, max_len;
|
||||
gap_stack_t *stack;
|
||||
bwt_width_t *w[2], *seed_w[2];
|
||||
const ubyte_t *seq[2];
|
||||
gap_opt_t local_opt = *opt;
|
||||
|
||||
// initiate priority stack
|
||||
for (i = max_len = 0; i != n_seqs; ++i)
|
||||
if (seqs[i].len > max_len) max_len = seqs[i].len;
|
||||
if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr);
|
||||
if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
|
||||
stack = gap_init_stack(opt->max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
|
||||
|
||||
seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
|
||||
seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
|
||||
w[0] = w[1] = 0;
|
||||
for (i = 0; i != n_seqs; ++i) {
|
||||
bwa_seq_t *p = seqs + i;
|
||||
#ifdef HAVE_PTHREAD
|
||||
if (opt->n_threads > 1) {
|
||||
pthread_mutex_lock(&g_seq_lock);
|
||||
if (p->tid < 0) { // unassigned
|
||||
int j;
|
||||
for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j)
|
||||
seqs[j].tid = tid;
|
||||
} else if (p->tid != tid) {
|
||||
pthread_mutex_unlock(&g_seq_lock);
|
||||
continue;
|
||||
}
|
||||
pthread_mutex_unlock(&g_seq_lock);
|
||||
}
|
||||
#endif
|
||||
p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
|
||||
seq[0] = p->seq; seq[1] = p->rseq;
|
||||
if (max_l < p->len) {
|
||||
max_l = p->len;
|
||||
w[0] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t));
|
||||
w[1] = (bwt_width_t*)calloc(max_l + 1, sizeof(bwt_width_t));
|
||||
}
|
||||
bwt_cal_width(bwt[0], p->len, seq[0], w[0]);
|
||||
bwt_cal_width(bwt[1], p->len, seq[1], w[1]);
|
||||
if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
|
||||
local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
|
||||
if (p->len > opt->seed_len) {
|
||||
bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]);
|
||||
bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]);
|
||||
}
|
||||
// core function
|
||||
p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
|
||||
// store the alignment
|
||||
free(p->name); free(p->seq); free(p->rseq); free(p->qual);
|
||||
p->name = 0; p->seq = p->rseq = p->qual = 0;
|
||||
}
|
||||
free(seed_w[0]); free(seed_w[1]);
|
||||
free(w[0]); free(w[1]);
|
||||
gap_destroy_stack(stack);
|
||||
}
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
typedef struct {
|
||||
int tid;
|
||||
bwt_t *bwt[2];
|
||||
int n_seqs;
|
||||
bwa_seq_t *seqs;
|
||||
const gap_opt_t *opt;
|
||||
} thread_aux_t;
|
||||
|
||||
static void *worker(void *data)
|
||||
{
|
||||
thread_aux_t *d = (thread_aux_t*)data;
|
||||
bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
|
||||
{
|
||||
int i, n_seqs, tot_seqs = 0;
|
||||
bwa_seq_t *seqs;
|
||||
bwa_seqio_t *ks;
|
||||
clock_t t;
|
||||
bwt_t *bwt[2];
|
||||
|
||||
// initialization
|
||||
ks = bwa_seq_open(fn_fa);
|
||||
|
||||
{ // load BWT
|
||||
char *str = (char*)calloc(strlen(prefix) + 10, 1);
|
||||
strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str);
|
||||
strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str);
|
||||
free(str);
|
||||
}
|
||||
|
||||
// core loop
|
||||
fwrite(opt, sizeof(gap_opt_t), 1, stdout);
|
||||
while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode & BWA_MODE_COMPREAD, opt->trim_qual)) != 0) {
|
||||
tot_seqs += n_seqs;
|
||||
t = clock();
|
||||
|
||||
fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
|
||||
|
||||
#ifdef HAVE_PTHREAD
|
||||
if (opt->n_threads <= 1) { // no multi-threading at all
|
||||
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
|
||||
} else {
|
||||
pthread_t *tid;
|
||||
pthread_attr_t attr;
|
||||
thread_aux_t *data;
|
||||
int j;
|
||||
pthread_attr_init(&attr);
|
||||
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
|
||||
data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
|
||||
tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
|
||||
for (j = 0; j < opt->n_threads; ++j) {
|
||||
data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1];
|
||||
data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
|
||||
pthread_create(&tid[j], &attr, worker, data + j);
|
||||
}
|
||||
for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
|
||||
free(data); free(tid);
|
||||
}
|
||||
#else
|
||||
bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
t = clock();
|
||||
fprintf(stderr, "[bwa_aln_core] write to the disk... ");
|
||||
for (i = 0; i < n_seqs; ++i) {
|
||||
bwa_seq_t *p = seqs + i;
|
||||
fwrite(&p->n_aln, 4, 1, stdout);
|
||||
if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
|
||||
}
|
||||
fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
|
||||
|
||||
bwa_free_read_seq(n_seqs, seqs);
|
||||
fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
|
||||
}
|
||||
|
||||
// destroy
|
||||
bwt_destroy(bwt[0]); bwt_destroy(bwt[1]);
|
||||
bwa_seq_close(ks);
|
||||
}
|
||||
|
||||
int bwa_aln(int argc, char *argv[])
|
||||
{
|
||||
int c, opte = -1;
|
||||
gap_opt_t *opt;
|
||||
|
||||
opt = gap_init_opt();
|
||||
while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:")) >= 0) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
|
||||
else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
|
||||
break;
|
||||
case 'o': opt->max_gapo = atoi(optarg); break;
|
||||
case 'e': opte = atoi(optarg); break;
|
||||
case 'M': opt->s_mm = atoi(optarg); break;
|
||||
case 'O': opt->s_gapo = atoi(optarg); break;
|
||||
case 'E': opt->s_gape = atoi(optarg); break;
|
||||
case 'd': opt->max_del_occ = atoi(optarg); break;
|
||||
case 'i': opt->indel_end_skip = atoi(optarg); break;
|
||||
case 'l': opt->seed_len = atoi(optarg); break;
|
||||
case 'k': opt->max_seed_diff = atoi(optarg); break;
|
||||
case 'm': opt->max_entries = atoi(optarg); break;
|
||||
case 't': opt->n_threads = atoi(optarg); break;
|
||||
case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
|
||||
case 'R': opt->max_top2 = atoi(optarg); break;
|
||||
case 'q': opt->trim_qual = atoi(optarg); break;
|
||||
case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
|
||||
case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
|
||||
default: return 1;
|
||||
}
|
||||
}
|
||||
if (opte > 0) {
|
||||
opt->max_gape = opte;
|
||||
opt->mode &= ~BWA_MODE_GAPE;
|
||||
}
|
||||
|
||||
if (optind + 2 > argc) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
|
||||
fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
|
||||
BWA_AVG_ERR, opt->fnr);
|
||||
fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
|
||||
fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
|
||||
fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
|
||||
fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
|
||||
fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
|
||||
fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
|
||||
fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
|
||||
fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
|
||||
fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
|
||||
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
|
||||
fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
|
||||
fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
|
||||
fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
|
||||
fprintf(stderr, " -c input sequences are in the color space\n");
|
||||
fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
|
||||
fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
|
||||
fprintf(stderr, "\n");
|
||||
return 1;
|
||||
}
|
||||
if (opt->fnr > 0.0) {
|
||||
int i, k;
|
||||
for (i = 17, k = 0; i <= 250; ++i) {
|
||||
int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
|
||||
if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
|
||||
k = l;
|
||||
}
|
||||
}
|
||||
bwa_aln_core(argv[optind], argv[optind+1], opt);
|
||||
free(opt);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,113 @@
|
|||
#ifndef BWTALN_H
|
||||
#define BWTALN_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "bwt.h"
|
||||
|
||||
#define BWA_TYPE_NO_MATCH 0
|
||||
#define BWA_TYPE_UNIQUE 1
|
||||
#define BWA_TYPE_REPEAT 2
|
||||
#define BWA_TYPE_MATESW 3
|
||||
|
||||
#define SAM_FPD 1 // paired
|
||||
#define SAM_FPP 2 // properly paired
|
||||
#define SAM_FSU 4 // self-unmapped
|
||||
#define SAM_FMU 8 // mate-unmapped
|
||||
#define SAM_FSR 16 // self on the reverse strand
|
||||
#define SAM_FMR 32 // mate on the reverse strand
|
||||
#define SAM_FR1 64 // this is read one
|
||||
#define SAM_FR2 128 // this is read two
|
||||
#define SAM_FSC 256 // secondary alignment
|
||||
|
||||
#define BWA_AVG_ERR 0.02
|
||||
#define BWA_MIN_RDLEN 35 // for read trimming
|
||||
|
||||
#ifndef bns_pac
|
||||
#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3)
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
bwtint_t w;
|
||||
int bid;
|
||||
} bwt_width_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t n_mm:8, n_gapo:8, n_gape:8, a:1;
|
||||
bwtint_t k, l;
|
||||
int score;
|
||||
} bwt_aln1_t;
|
||||
|
||||
typedef struct {
|
||||
char *name;
|
||||
ubyte_t *seq, *rseq, *qual;
|
||||
uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8;
|
||||
uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8;
|
||||
int score;
|
||||
// alignments in SA coordinates
|
||||
int n_aln;
|
||||
bwt_aln1_t *aln;
|
||||
// alignment information
|
||||
bwtint_t sa, pos;
|
||||
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
|
||||
int n_cigar;
|
||||
uint16_t *cigar;
|
||||
// for multi-threading only
|
||||
int tid;
|
||||
// NM and MD tags
|
||||
uint32_t full_len:20, nm:12;
|
||||
char *md;
|
||||
} bwa_seq_t;
|
||||
|
||||
#define BWA_MODE_GAPE 0x01
|
||||
#define BWA_MODE_COMPREAD 0x02
|
||||
#define BWA_MODE_LOGGAP 0x04
|
||||
#define BWA_MODE_NONSTOP 0x10
|
||||
|
||||
typedef struct {
|
||||
int s_mm, s_gapo, s_gape;
|
||||
int mode;
|
||||
int indel_end_skip, max_del_occ, max_entries;
|
||||
float fnr;
|
||||
int max_diff, max_gapo, max_gape;
|
||||
int max_seed_diff, seed_len;
|
||||
int n_threads;
|
||||
int max_top2;
|
||||
int trim_qual;
|
||||
} gap_opt_t;
|
||||
|
||||
#define BWA_PET_STD 1
|
||||
#define BWA_PET_SOLID 2
|
||||
|
||||
typedef struct {
|
||||
int max_isize;
|
||||
int max_occ;
|
||||
int type, is_sw;
|
||||
} pe_opt_t;
|
||||
|
||||
struct __bwa_seqio_t;
|
||||
typedef struct __bwa_seqio_t bwa_seqio_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
gap_opt_t *gap_init_opt();
|
||||
void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt);
|
||||
|
||||
bwa_seqio_t *bwa_seq_open(const char *fn);
|
||||
void bwa_seq_close(bwa_seqio_t *bs);
|
||||
void seq_reverse(int len, ubyte_t *seq, int is_comp);
|
||||
bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int is_comp, int trim_qual);
|
||||
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
|
||||
|
||||
int bwa_cal_maxdiff(int l, double err, double thres);
|
||||
|
||||
void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
|
||||
|
||||
void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
Loading…
Reference in New Issue