moved some common code to bwa.{c,h}
This commit is contained in:
parent
3c330d5049
commit
e613195e17
14
Makefile
14
Makefile
|
|
@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2
|
||||||
CXXFLAGS= $(CFLAGS)
|
CXXFLAGS= $(CFLAGS)
|
||||||
AR= ar
|
AR= ar
|
||||||
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
|
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
|
||||||
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o
|
LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
|
||||||
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
|
||||||
is.o bwtindex.o bwape.o \
|
is.o bwtindex.o bwape.o \
|
||||||
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
|
||||||
|
|
@ -28,14 +28,16 @@ bwa:libbwa.a $(AOBJS) main.o
|
||||||
libbwa.a:$(LOBJS)
|
libbwa.a:$(LOBJS)
|
||||||
$(AR) -csru $@ $(LOBJS)
|
$(AR) -csru $@ $(LOBJS)
|
||||||
|
|
||||||
|
QSufSort.o:QSufSort.h
|
||||||
|
bwt_gen.o:QSufSort.h
|
||||||
|
|
||||||
|
ksw.o:ksw.h
|
||||||
|
utils.o:utils.h ksort.h kseq.h
|
||||||
|
bntseq.o:bntseq.h
|
||||||
|
bwt.o:bwt.h utils.h
|
||||||
bwa.o:bwa.h
|
bwa.o:bwa.h
|
||||||
|
|
||||||
QSufSort.o:QSufSort.h
|
|
||||||
|
|
||||||
bwt.o:bwt.h
|
|
||||||
bwtio.o:bwt.h
|
|
||||||
bwtaln.o:bwt.h bwtaln.h kseq.h
|
bwtaln.o:bwt.h bwtaln.h kseq.h
|
||||||
bntseq.o:bntseq.h
|
|
||||||
bwtgap.o:bwtgap.h bwtaln.h bwt.h
|
bwtgap.o:bwtgap.h bwtaln.h bwt.h
|
||||||
|
|
||||||
bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
|
bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,96 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <zlib.h>
|
||||||
|
#include "bntseq.h"
|
||||||
|
#include "bwa.h"
|
||||||
|
#include "ksw.h"
|
||||||
|
|
||||||
|
/************************
|
||||||
|
* Batch FASTA/Q reader *
|
||||||
|
************************/
|
||||||
|
|
||||||
|
#include "kseq.h"
|
||||||
|
KSEQ_DECLARE(gzFile)
|
||||||
|
|
||||||
|
static inline void trim_readno(kstring_t *s)
|
||||||
|
{
|
||||||
|
if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
|
||||||
|
s->l -= 2, s->s[s->l] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
|
||||||
|
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
|
||||||
|
s->name = strdup(ks->name.s);
|
||||||
|
s->comment = ks->comment.l? strdup(s->comment) : 0;
|
||||||
|
s->seq = strdup(ks->seq.s);
|
||||||
|
s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
|
||||||
|
s->l_seq = strlen(s->seq);
|
||||||
|
}
|
||||||
|
|
||||||
|
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
||||||
|
{
|
||||||
|
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
|
||||||
|
int size = 0, m, n;
|
||||||
|
bseq1_t *seqs;
|
||||||
|
m = n = 0; seqs = 0;
|
||||||
|
while (kseq_read(ks) >= 0) {
|
||||||
|
if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
|
||||||
|
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (n >= m) {
|
||||||
|
m = m? m<<1 : 256;
|
||||||
|
seqs = realloc(seqs, m * sizeof(bseq1_t));
|
||||||
|
}
|
||||||
|
trim_readno(&ks->name);
|
||||||
|
kseq2bseq1(ks, &seqs[n]);
|
||||||
|
size += seqs[n++].l_seq;
|
||||||
|
if (ks2) {
|
||||||
|
trim_readno(&ks2->name);
|
||||||
|
kseq2bseq1(ks2, &seqs[n]);
|
||||||
|
size += seqs[n++].l_seq;
|
||||||
|
}
|
||||||
|
if (size >= chunk_size) break;
|
||||||
|
}
|
||||||
|
if (size == 0) { // test if the 2nd file is finished
|
||||||
|
if (ks2 && kseq_read(ks2) >= 0)
|
||||||
|
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
|
||||||
|
}
|
||||||
|
*n_ = n;
|
||||||
|
return seqs;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate CIGAR when the alignment end points are known
|
||||||
|
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar)
|
||||||
|
{
|
||||||
|
uint32_t *cigar = 0;
|
||||||
|
uint8_t tmp, *rseq;
|
||||||
|
int i, w;
|
||||||
|
int64_t rlen;
|
||||||
|
*n_cigar = 0;
|
||||||
|
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
||||||
|
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
||||||
|
if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
|
||||||
|
if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
|
||||||
|
for (i = 0; i < l_query>>1; ++i)
|
||||||
|
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
||||||
|
for (i = 0; i < rlen>>1; ++i)
|
||||||
|
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
|
||||||
|
}
|
||||||
|
//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
|
||||||
|
//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
|
||||||
|
// set the band-width
|
||||||
|
w = (int)((double)(l_query * mat[0] - q) / r + 1.);
|
||||||
|
w = w < 1? w : 1;
|
||||||
|
w = w < w_? w : w_;
|
||||||
|
w += abs(rlen - l_query);
|
||||||
|
// NW alignment
|
||||||
|
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
|
||||||
|
if (rb >= l_pac) // reverse back query
|
||||||
|
for (i = 0; i < l_query>>1; ++i)
|
||||||
|
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
||||||
|
|
||||||
|
ret_gen_cigar:
|
||||||
|
free(rseq);
|
||||||
|
return cigar;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -0,0 +1,23 @@
|
||||||
|
#ifndef BWA_H_
|
||||||
|
#define BWA_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int l_seq;
|
||||||
|
char *name, *comment, *seq, *qual, *sam;
|
||||||
|
} bseq1_t;
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
|
||||||
|
|
||||||
|
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
35
bwamem.c
35
bwamem.c
|
|
@ -485,41 +485,6 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int
|
||||||
* Basic hit->SAM conversion *
|
* Basic hit->SAM conversion *
|
||||||
*****************************/
|
*****************************/
|
||||||
|
|
||||||
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar)
|
|
||||||
{
|
|
||||||
uint32_t *cigar = 0;
|
|
||||||
uint8_t tmp, *rseq;
|
|
||||||
int i, w;
|
|
||||||
int64_t rlen;
|
|
||||||
*n_cigar = 0;
|
|
||||||
if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
|
|
||||||
rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
|
|
||||||
if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
|
|
||||||
if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
|
|
||||||
for (i = 0; i < l_query>>1; ++i)
|
|
||||||
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
|
||||||
for (i = 0; i < rlen>>1; ++i)
|
|
||||||
tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
|
|
||||||
}
|
|
||||||
//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
|
|
||||||
//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
|
|
||||||
// set the band-width
|
|
||||||
w = (int)((double)(l_query * mat[0] - q) / r + 1.);
|
|
||||||
w = w < 1? w : 1;
|
|
||||||
w = w < w_? w : w_;
|
|
||||||
w += abs(rlen - l_query);
|
|
||||||
// NW alignment
|
|
||||||
*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
|
|
||||||
if (rb >= l_pac) // reverse back query
|
|
||||||
for (i = 0; i < l_query>>1; ++i)
|
|
||||||
tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
|
|
||||||
|
|
||||||
ret_gen_cigar:
|
|
||||||
free(rseq);
|
|
||||||
return cigar;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
|
void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
|
||||||
{
|
{
|
||||||
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
|
#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
|
||||||
|
|
|
||||||
2
bwamem.h
2
bwamem.h
|
|
@ -3,7 +3,7 @@
|
||||||
|
|
||||||
#include "bwt.h"
|
#include "bwt.h"
|
||||||
#include "bntseq.h"
|
#include "bntseq.h"
|
||||||
#include "utils.h"
|
#include "bwa.h"
|
||||||
|
|
||||||
#define MEM_MAPQ_COEF 40.0
|
#define MEM_MAPQ_COEF 40.0
|
||||||
#define MEM_MAPQ_MAX 60
|
#define MEM_MAPQ_MAX 60
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@
|
||||||
#include "bwtsw2.h"
|
#include "bwtsw2.h"
|
||||||
#include "stdaln.h"
|
#include "stdaln.h"
|
||||||
#include "kstring.h"
|
#include "kstring.h"
|
||||||
|
#include "bwa.h"
|
||||||
|
|
||||||
#include "kseq.h"
|
#include "kseq.h"
|
||||||
KSEQ_DECLARE(gzFile)
|
KSEQ_DECLARE(gzFile)
|
||||||
|
|
|
||||||
58
utils.c
58
utils.c
|
|
@ -40,6 +40,9 @@
|
||||||
KSORT_INIT(128, pair64_t, pair64_lt)
|
KSORT_INIT(128, pair64_t, pair64_lt)
|
||||||
KSORT_INIT(64, uint64_t, ks_lt_generic)
|
KSORT_INIT(64, uint64_t, ks_lt_generic)
|
||||||
|
|
||||||
|
#include "kseq.h"
|
||||||
|
KSEQ_INIT2(, gzFile, gzread)
|
||||||
|
|
||||||
/********************
|
/********************
|
||||||
* System utilities *
|
* System utilities *
|
||||||
********************/
|
********************/
|
||||||
|
|
@ -160,58 +163,3 @@ double realtime()
|
||||||
gettimeofday(&tp, &tzp);
|
gettimeofday(&tp, &tzp);
|
||||||
return tp.tv_sec + tp.tv_usec * 1e-6;
|
return tp.tv_sec + tp.tv_usec * 1e-6;
|
||||||
}
|
}
|
||||||
|
|
||||||
/************************
|
|
||||||
* Batch FASTA/Q reader *
|
|
||||||
************************/
|
|
||||||
|
|
||||||
#include "kseq.h"
|
|
||||||
KSEQ_INIT2(, gzFile, gzread)
|
|
||||||
|
|
||||||
static inline void trim_readno(kstring_t *s)
|
|
||||||
{
|
|
||||||
if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
|
|
||||||
s->l -= 2, s->s[s->l] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
|
|
||||||
{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
|
|
||||||
s->name = strdup(ks->name.s);
|
|
||||||
s->comment = ks->comment.l? strdup(s->comment) : 0;
|
|
||||||
s->seq = strdup(ks->seq.s);
|
|
||||||
s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
|
|
||||||
s->l_seq = strlen(s->seq);
|
|
||||||
}
|
|
||||||
|
|
||||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
|
|
||||||
{
|
|
||||||
kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
|
|
||||||
int size = 0, m, n;
|
|
||||||
bseq1_t *seqs;
|
|
||||||
m = n = 0; seqs = 0;
|
|
||||||
while (kseq_read(ks) >= 0) {
|
|
||||||
if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
|
|
||||||
fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (n >= m) {
|
|
||||||
m = m? m<<1 : 256;
|
|
||||||
seqs = realloc(seqs, m * sizeof(bseq1_t));
|
|
||||||
}
|
|
||||||
trim_readno(&ks->name);
|
|
||||||
kseq2bseq1(ks, &seqs[n]);
|
|
||||||
size += seqs[n++].l_seq;
|
|
||||||
if (ks2) {
|
|
||||||
trim_readno(&ks2->name);
|
|
||||||
kseq2bseq1(ks2, &seqs[n]);
|
|
||||||
size += seqs[n++].l_seq;
|
|
||||||
}
|
|
||||||
if (size >= chunk_size) break;
|
|
||||||
}
|
|
||||||
if (size == 0) { // test if the 2nd file is finished
|
|
||||||
if (ks2 && kseq_read(ks2) >= 0)
|
|
||||||
fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
|
|
||||||
}
|
|
||||||
*n_ = n;
|
|
||||||
return seqs;
|
|
||||||
}
|
|
||||||
|
|
|
||||||
7
utils.h
7
utils.h
|
|
@ -52,11 +52,6 @@ typedef struct {
|
||||||
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
|
typedef struct { size_t n, m; uint64_t *a; } uint64_v;
|
||||||
typedef struct { size_t n, m; pair64_t *a; } pair64_v;
|
typedef struct { size_t n, m; pair64_t *a; } pair64_v;
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
int l_seq;
|
|
||||||
char *name, *comment, *seq, *qual, *sam;
|
|
||||||
} bseq1_t;
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -80,8 +75,6 @@ extern "C" {
|
||||||
void ks_introsort_64 (size_t n, uint64_t *a);
|
void ks_introsort_64 (size_t n, uint64_t *a);
|
||||||
void ks_introsort_128(size_t n, pair64_t *a);
|
void ks_introsort_128(size_t n, pair64_t *a);
|
||||||
|
|
||||||
bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue